update
This commit is contained in:
75
news_crawler/scheduled_crawler.py
Executable file
75
news_crawler/scheduled_crawler.py
Executable file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scheduled crawler that runs daily at 6 AM Berlin time
|
||||
"""
|
||||
import schedule
|
||||
import time
|
||||
from datetime import datetime
|
||||
import pytz
|
||||
from crawler_service import crawl_all_feeds
|
||||
|
||||
# Berlin timezone
|
||||
# Shared pytz timezone for Europe/Berlin; the functions below use it so that
# printed timestamps are rendered in Berlin local time (with the %Z zone name).
BERLIN_TZ = pytz.timezone('Europe/Berlin')
|
||||
|
||||
def run_crawler():
    """Run a single crawl over all feeds and print a start/result banner.

    The current Berlin time is printed first, then ``crawl_all_feeds`` is
    called with a cap of 20 articles per feed. On success the crawled-article
    count and duration from the returned mapping are printed. Any
    ``Exception`` raised along the way is caught and reported on stdout
    instead of being propagated to the caller.
    """
    divider = '=' * 60
    started_at = datetime.now(BERLIN_TZ).strftime('%Y-%m-%d %H:%M:%S %Z')

    print(f"\n{divider}")
    print(f"🕐 Scheduled crawler started at {started_at}")
    print(f"{divider}\n")

    try:
        # Limit every feed to at most 20 articles per run.
        summary = crawl_all_feeds(max_articles_per_feed=20)

        # The lookups stay inside the try block so that a missing key is
        # reported through the failure banner below.
        print(f"\n{divider}")
        print("✓ Scheduled crawler completed successfully")
        print(f" Articles crawled: {summary['total_articles_crawled']}")
        print(f" Duration: {summary['duration_seconds']}s")
        print(f"{divider}\n")
    except Exception as exc:
        print(f"\n{divider}")
        print(f"✗ Scheduled crawler failed: {exc}")
        print(f"{divider}\n")
|
||||
|
||||
def main():
    """Register the daily crawl, run one crawl immediately, then loop forever.

    Prints a startup banner, schedules ``run_crawler`` for 06:00 in the
    Europe/Berlin timezone, prints the current and next-run times in Berlin
    time, performs an initial crawl, and then polls the scheduler once a
    minute. This function does not return; it is normally stopped with
    Ctrl+C (``KeyboardInterrupt`` propagates to the caller).
    """
    print("🤖 Munich News Crawler Scheduler")
    print("="*60)
    print("Schedule: Daily at 6:00 AM Berlin time")
    print("Timezone: Europe/Berlin (CET/CEST)")
    print("="*60)

    # Pass the timezone explicitly: without it, `schedule` interprets "06:00"
    # in the host's local time (often UTC in containers), so the job would
    # not fire at 6 AM Berlin time as advertised above.
    # NOTE: the `tz` argument of `at()` requires schedule >= 1.2.0.
    schedule.every().day.at("06:00", BERLIN_TZ).do(run_crawler)

    # Show the current time in Berlin
    berlin_time = datetime.now(BERLIN_TZ)
    print(f"\nCurrent time (Berlin): {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}")

    # Get next scheduled run
    next_run = schedule.next_run()
    if next_run:
        # `astimezone` treats a naive datetime as system-local time, so this
        # converts the scheduler's next-run time to Berlin time for display.
        next_run_berlin = next_run.astimezone(BERLIN_TZ)
        print(f"Next scheduled run: {next_run_berlin.strftime('%Y-%m-%d %H:%M:%S %Z')}")

    print("\n⏳ Scheduler is running... (Press Ctrl+C to stop)\n")

    # Run immediately on startup (optional - comment out if you don't want this)
    print("🚀 Running initial crawl on startup...")
    run_crawler()

    # Keep the scheduler running
    while True:
        schedule.run_pending()
        time.sleep(60)  # Check every minute
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: run the scheduler until interrupted or it fails.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop the scheduler; exit normally.
        print("\n\n👋 Scheduler stopped by user")
    except Exception as e:
        print(f"\n\n✗ Scheduler error: {e}")
        # Exit with a non-zero status so a process supervisor can tell that
        # the scheduler died unexpectedly rather than being stopped on purpose.
        raise SystemExit(1) from e
|
||||
Reference in New Issue
Block a user