From f0e552b1b16af198467f8be8e339ef869b7c8b36 Mon Sep 17 00:00:00 2001
From: Dongho Kim
Date: Fri, 14 Nov 2025 13:00:36 +0100
Subject: [PATCH] transport

---
 transport_crawler/requirements.txt     |  1 +
 transport_crawler/scheduled_crawler.py | 71 ++++++++++++++++++++++++++
 transport_crawler/start.sh             |  9 ++--
 3 files changed, 78 insertions(+), 3 deletions(-)
 create mode 100644 transport_crawler/scheduled_crawler.py

diff --git a/transport_crawler/requirements.txt b/transport_crawler/requirements.txt
index 201070f..2cb6b90 100644
--- a/transport_crawler/requirements.txt
+++ b/transport_crawler/requirements.txt
@@ -8,3 +8,4 @@ selenium==4.15.2
 webdriver-manager==4.0.1
 flask==3.0.0
 redis==5.0.1
+schedule==1.2.0
diff --git a/transport_crawler/scheduled_crawler.py b/transport_crawler/scheduled_crawler.py
new file mode 100644
index 0000000..960e25e
--- /dev/null
+++ b/transport_crawler/scheduled_crawler.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""
+Scheduled transport crawler that runs daily at 6 AM Berlin time
+"""
+import schedule
+import time
+from datetime import datetime
+import pytz
+from crawler_service import run_crawler
+
+# Berlin timezone
+BERLIN_TZ = pytz.timezone('Europe/Berlin')
+
+def run_transport_crawler():
+    """Run the transport crawler and log the execution"""
+    berlin_time = datetime.now(BERLIN_TZ)
+    print(f"\n{'='*60}")
+    print(f"šŸ• Scheduled transport crawler started at {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
+    print(f"{'='*60}\n")
+
+    try:
+        # Run crawler
+        result = run_crawler()
+
+        print(f"\n{'='*60}")
+        print(f"āœ“ Scheduled transport crawler completed successfully")
+        print(f"  Total disruptions: {result['total_disruptions']}")
+        print(f"  MVG disruptions: {result['mvg_disruptions']}")
+        print(f"  S-Bahn disruptions: {result['sbahn_disruptions']}")
+        print(f"{'='*60}\n")
+
+    except Exception as e:
+        print(f"\n{'='*60}")
+        print(f"āœ— Scheduled transport crawler failed: {e}")
+        print(f"{'='*60}\n")
+
+def main():
+    """Main scheduler loop"""
+    print("šŸš‡ Munich Transport Crawler Scheduler")
+    print("="*60)
+    print("Schedule: Daily at 6:00 AM Berlin time")
+    print("Timezone: Europe/Berlin (CET/CEST)")
+    print("="*60)
+
+    # Schedule daily at 06:00 Europe/Berlin (tz arg needs schedule>=1.2.0 + pytz)
+    schedule.every().day.at("06:00", "Europe/Berlin").do(run_transport_crawler)
+
+    # Show next run time
+    berlin_time = datetime.now(BERLIN_TZ)
+    print(f"\nCurrent time (Berlin): {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
+
+    next_run = schedule.next_run()
+    if next_run:
+        print(f"Next scheduled run: {next_run.strftime('%Y-%m-%d %H:%M:%S %Z')}")
+
+    print("\nā³ Scheduler is running... (Press Ctrl+C to stop)\n")
+
+    # Run once on startup
+    print("šŸš€ Running initial crawl on startup...\n")
+    run_transport_crawler()
+
+    # Keep running
+    while True:
+        schedule.run_pending()
+        time.sleep(60)  # Check every minute
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\nšŸ‘‹ Scheduler stopped by user")
diff --git a/transport_crawler/start.sh b/transport_crawler/start.sh
index a46db25..60f0457 100644
--- a/transport_crawler/start.sh
+++ b/transport_crawler/start.sh
@@ -1,8 +1,11 @@
 #!/bin/bash
-# Start both the API server and the worker
+# Start the scheduler, worker, and API server
 
 # Start the worker in the background
 python -u worker.py &
 
-# Start the API server in the foreground
-python -u api_service.py
+# Start the API server in the background
+python -u api_service.py &
+
+# Start the scheduler in the foreground
+python -u scheduled_crawler.py