update
This commit is contained in:
@@ -87,7 +87,8 @@ class ChromaClient:
|
||||
|
||||
# Prepare text for embedding (Title + Summary + Start of Content)
|
||||
# This gives semantic search a good overview
|
||||
title = article.get('title', '')
|
||||
# Use English title if available, otherwise original
|
||||
title = article.get('title_en') if article.get('title_en') else article.get('title', '')
|
||||
summary = article.get('summary') or ''
|
||||
content_snippet = article.get('content', '')[:1000]
|
||||
|
||||
|
||||
@@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
|
||||
|
||||
if not feed.entries:
|
||||
print(f" ⚠ No entries found in feed")
|
||||
return 0
|
||||
return {
|
||||
'crawled': 0,
|
||||
'summarized': 0,
|
||||
'failed_summaries': 0
|
||||
}
|
||||
|
||||
crawled_count = 0
|
||||
summarized_count = 0
|
||||
|
||||
@@ -37,12 +37,12 @@ def main():
|
||||
"""Main scheduler loop"""
|
||||
print("🤖 Munich News Crawler Scheduler")
|
||||
print("="*60)
|
||||
print("Schedule: Daily at 6:00 AM Berlin time")
|
||||
print("Schedule: Every 3 hours")
|
||||
print("Timezone: Europe/Berlin (CET/CEST)")
|
||||
print("="*60)
|
||||
|
||||
# Schedule the crawler to run at 6 AM Berlin time
|
||||
schedule.every().day.at("06:00").do(run_crawler)
|
||||
# Schedule the crawler to run every 3 hours
|
||||
schedule.every(3).hours.do(run_crawler)
|
||||
|
||||
# Show next run time
|
||||
berlin_time = datetime.now(BERLIN_TZ)
|
||||
|
||||
Reference in New Issue
Block a user