update

2025-12-10 15:50:11 +00:00
parent 50b9888004
commit 4e8b60f77c
12 changed files with 247 additions and 106 deletions
@@ -87,7 +87,8 @@ class ChromaClient:
            
            # Prepare text for embedding (Title + Summary + Start of Content)
            # This gives semantic search a good overview
-            title = article.get('title', '')
+            # Use English title if available, otherwise original
+            title = article.get('title_en') if article.get('title_en') else article.get('title', '')
            summary = article.get('summary') or ''
            content_snippet = article.get('content', '')[:1000]
            
@@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
        
        if not feed.entries:
            print(f"   ⚠ No entries found in feed")
-            return 0
+            return {
+                'crawled': 0,
+                'summarized': 0,
+                'failed_summaries': 0
+            }
        
        crawled_count = 0
        summarized_count = 0
@@ -37,12 +37,12 @@ def main():
    """Main scheduler loop"""
    print("🤖 Munich News Crawler Scheduler")
    print("="*60)
-    print("Schedule: Daily at 6:00 AM Berlin time")
+    print("Schedule: Every 3 hours")
    print("Timezone: Europe/Berlin (CET/CEST)")
    print("="*60)
    
-    # Schedule the crawler to run at 6 AM Berlin time
-    schedule.every().day.at("06:00").do(run_crawler)
+    # Schedule the crawler to run every 3 hours
+    schedule.every(3).hours.do(run_crawler)
    
    # Show next run time
    berlin_time = datetime.now(BERLIN_TZ)