This commit is contained in:
2025-12-10 15:50:11 +00:00
parent 50b9888004
commit 4e8b60f77c
12 changed files with 247 additions and 106 deletions

View File

@@ -87,7 +87,8 @@ class ChromaClient:
# Prepare text for embedding (Title + Summary + Start of Content)
# This gives semantic search a good overview
title = article.get('title', '')
# Use English title if available, otherwise original
title = article.get('title_en') if article.get('title_en') else article.get('title', '')
summary = article.get('summary') or ''
content_snippet = article.get('content', '')[:1000]

View File

@@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
if not feed.entries:
print(f" ⚠ No entries found in feed")
return 0
return {
'crawled': 0,
'summarized': 0,
'failed_summaries': 0
}
crawled_count = 0
summarized_count = 0

View File

@@ -37,12 +37,12 @@ def main():
"""Main scheduler loop"""
print("🤖 Munich News Crawler Scheduler")
print("="*60)
print("Schedule: Daily at 6:00 AM Berlin time")
print("Schedule: Every 3 hours")
print("Timezone: Europe/Berlin (CET/CEST)")
print("="*60)
# Schedule the crawler to run at 6 AM Berlin time
schedule.every().day.at("06:00").do(run_crawler)
# Schedule the crawler to run every 3 hours
schedule.every(3).hours.do(run_crawler)
# Show next run time
berlin_time = datetime.now(BERLIN_TZ)