2025-11-14 12:51:18 +01:00
parent 433a16ee0e
commit 869ca3a894
20 changed files with 1606 additions and 38 deletions

View File

@@ -10,6 +10,7 @@ from routes.newsletter_routes import newsletter_bp
from routes.tracking_routes import tracking_bp
from routes.analytics_routes import analytics_bp
from routes.admin_routes import admin_bp
+from routes.transport_routes import transport_bp

# Initialize Flask app
app = Flask(__name__)

@@ -27,6 +28,7 @@ app.register_blueprint(newsletter_bp)
app.register_blueprint(tracking_bp)
app.register_blueprint(analytics_bp)
app.register_blueprint(admin_bp)
+app.register_blueprint(transport_bp)

# Health check endpoint
@app.route('/health')

View File

@@ -5,4 +5,5 @@ python-dotenv==1.0.0
pymongo==4.6.1
requests==2.31.0
Jinja2==3.1.2
+redis==5.0.1

View File

@@ -12,7 +12,8 @@ admin_bp = Blueprint('admin', __name__)
@admin_bp.route('/api/admin/trigger-crawl', methods=['POST'])
def trigger_crawl():
    """
-    Manually trigger the news crawler
+    Manually trigger the news crawler asynchronously via Redis queue
+    Uses Redis message queue for non-blocking execution

    Request body (optional):
    {
@@ -20,6 +21,9 @@ def trigger_crawl():
    }
    """
    try:
+        import redis
+        import json

        # Handle both JSON and empty body
        try:
            data = request.get_json(silent=True) or {}
@@ -35,41 +39,29 @@ def trigger_crawl():
                'error': 'max_articles must be an integer between 1 and 100'
            }), 400

-        # Execute crawler in crawler container using docker exec
-        try:
-            result = subprocess.run(
-                ['docker', 'exec', 'munich-news-crawler', 'python', 'crawler_service.py', str(max_articles)],
-                capture_output=True,
-                text=True,
-                timeout=300  # 5 minute timeout
-            )
-
-            # Check result
-            success = result.returncode == 0
-
-            return jsonify({
-                'success': success,
-                'message': f'Crawler {"executed successfully" if success else "failed"}',
-                'max_articles': max_articles,
-                'output': result.stdout[-1000:] if result.stdout else '',  # Last 1000 chars
-                'errors': result.stderr[-500:] if result.stderr else ''
-            }), 200 if success else 500
-        except FileNotFoundError:
-            return jsonify({
-                'success': False,
-                'error': 'Docker command not found. Make sure Docker is installed and the socket is mounted.'
-            }), 500
-        except subprocess.TimeoutExpired:
-            return jsonify({
-                'success': False,
-                'error': 'Crawler timed out after 5 minutes'
-            }), 500
+        # Get Redis client
+        redis_url = os.getenv('REDIS_URL', 'redis://redis:6379')
+        r = redis.from_url(redis_url, decode_responses=True)
+
+        # Publish message to Redis queue
+        message = {
+            'task': 'crawl_news',
+            'max_articles': max_articles,
+            'timestamp': str(os.times())
+        }
+        r.lpush('news_crawl_queue', json.dumps(message))
+
+        # Return immediately without waiting
+        return jsonify({
+            'success': True,
+            'message': 'News crawl task queued',
+            'max_articles': max_articles
+        }), 202  # 202 Accepted
    except Exception as e:
        return jsonify({
            'success': False,
-            'error': f'Failed to run crawler: {str(e)}'
+            'error': f'Failed to queue news crawl: {str(e)}'
        }), 500
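A minimal sketch of how the reworked endpoint could be exercised once the stack is up, assuming the backend is published on port 5001 as in docker-compose.yml (the URL and payload below are illustrative, not part of this commit):

import requests

# POST an optional max_articles value; the endpoint should answer immediately
# with 202 Accepted because the actual crawl is picked up by the queue worker.
resp = requests.post(
    'http://localhost:5001/api/admin/trigger-crawl',
    json={'max_articles': 20},
    timeout=10,
)
print(resp.status_code)  # expected: 202
print(resp.json())       # e.g. {'success': True, 'message': 'News crawl task queued', ...}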

View File

@@ -0,0 +1,74 @@
from flask import Blueprint, jsonify
from database import db
import redis
import os
import json
transport_bp = Blueprint('transport', __name__)
REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
def get_redis_client():
"""Get Redis client"""
return redis.from_url(REDIS_URL, decode_responses=True)
@transport_bp.route('/api/transport/crawl', methods=['POST'])
def trigger_transport_crawl():
"""Trigger transport disruption crawl asynchronously via Redis queue"""
try:
r = get_redis_client()
# Publish message to Redis queue
message = {
'task': 'crawl_transport',
'timestamp': str(os.times())
}
r.lpush('transport_crawl_queue', json.dumps(message))
# Return immediately without waiting
return jsonify({
'status': 'success',
'message': 'Transport crawl task queued'
}), 202 # 202 Accepted
except Exception as e:
return jsonify({
'status': 'error',
'message': 'Failed to queue transport crawl',
'details': str(e)
}), 500
@transport_bp.route('/api/transport/disruptions', methods=['GET'])
def get_transport_disruptions():
"""Get current transport disruptions from MongoDB"""
try:
collection = db['transport_alerts']
# Get active disruptions
disruptions = list(collection.find(
{'is_active': True},
{'_id': 0}
).sort('updated_at', -1))
# Convert datetime to ISO format
for d in disruptions:
if d.get('start_time'):
d['start_time'] = d['start_time'].isoformat()
if d.get('end_time'):
d['end_time'] = d['end_time'].isoformat()
if d.get('updated_at'):
d['updated_at'] = d['updated_at'].isoformat()
return jsonify({
'total': len(disruptions),
'disruptions': disruptions
}), 200
except Exception as e:
return jsonify({
'error': 'Failed to fetch disruptions from database',
'details': str(e)
}), 500
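For reference, a hedged sketch of the JSON shape GET /api/transport/disruptions returns, assuming a single active alert in transport_alerts (the field values are invented for illustration; the fields mirror what the transport crawler writes):

# Illustrative response body only, not produced by this commit's test data
example_response = {
    'total': 1,
    'disruptions': [
        {
            'alert_id': 'sbahn_0',
            'title': 'Stammstrecke: delays between Pasing and Ostbahnhof',
            'lines': ['S1', 'S8'],
            'type': 'disruption',
            'severity': 'high',
            'is_active': True,
            'start_time': '2025-11-14T06:30:00',
            'end_time': '2025-11-14T14:30:00',
        }
    ],
}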

View File

@@ -57,6 +57,20 @@ services:
    command: sh /setup-ollama-model.sh
    restart: on-failure

+  # Redis - Message queue for async tasks (Internal only - not exposed to host)
+  redis:
+    image: redis:7-alpine
+    container_name: munich-news-redis
+    restart: unless-stopped
+    # No ports exposed - only accessible within Docker network
+    networks:
+      - munich-news-network
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 30s
+      timeout: 10s
+      retries: 3

  # MongoDB Database (Internal only - not exposed to host)
  mongodb:
    image: mongo:latest

@@ -90,8 +104,10 @@ services:
    depends_on:
      - mongodb
      - ollama
+      - redis
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
+      - REDIS_URL=redis://redis:6379
      - TZ=Europe/Berlin
    volumes:
      - ./backend/.env:/app/.env:ro

@@ -112,10 +128,12 @@ services:
    restart: unless-stopped
    depends_on:
      - mongodb
+      - redis
    ports:
      - "5001:5001"
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
+      - REDIS_URL=redis://redis:6379
      - FLASK_PORT=5001
      - TZ=Europe/Berlin
    volumes:

@@ -130,6 +148,32 @@ services:
      retries: 3
      start_period: 40s

+  # Transport Crawler - API service for MVG disruptions (Internal only - not exposed to host)
+  transport-crawler:
+    build:
+      context: ./transport_crawler
+      dockerfile: Dockerfile
+    container_name: munich-news-transport-crawler
+    restart: unless-stopped
+    depends_on:
+      - mongodb
+      - redis
+    # No ports exposed - only accessible within Docker network
+    environment:
+      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
+      - REDIS_URL=redis://redis:6379
+      - TZ=Europe/Berlin
+    volumes:
+      - ./backend/.env:/app/.env:ro
+    networks:
+      - munich-news-network
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s

  # Newsletter Sender - Runs at 7 AM Berlin time
  sender:
    build:

@@ -141,6 +185,7 @@ services:
      - mongodb
      - backend
      - crawler
+      - transport-crawler
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
      - TZ=Europe/Berlin
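Since the redis service exposes no host ports, connectivity can only be verified from inside the Compose network; a small sketch, assuming it is run inside the backend container (for example via docker compose exec backend python), where REDIS_URL is already set:

import os
import redis

# PING should succeed if the redis service is healthy and reachable
# over the internal munich-news-network.
r = redis.from_url(os.getenv('REDIS_URL', 'redis://redis:6379'), decode_responses=True)
print(r.ping())                    # expected: True
print(r.llen('news_crawl_queue'))  # number of queued crawl tasks, 0 when idle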

View File

@@ -12,12 +12,12 @@ COPY backend/config.py /app/config.py
# Copy crawler files (includes ollama_client.py)
COPY news_crawler/ /app/

-# Make the scheduler executable
-RUN chmod +x scheduled_crawler.py
+# Make scripts executable
+RUN chmod +x scheduled_crawler.py start.sh

# Set timezone to Berlin
ENV TZ=Europe/Berlin
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

-# Run the scheduled crawler
-CMD ["python", "-u", "scheduled_crawler.py"]
+# Run both scheduler and worker
+CMD ["/app/start.sh"]

View File

@@ -6,3 +6,4 @@ pymongo==4.6.1
python-dotenv==1.0.0
schedule==1.2.0
pytz==2023.3
+redis==5.0.1

news_crawler/start.sh Normal file
View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Start both the scheduler and the worker
# Start the worker in the background
python -u worker.py &
# Start the scheduler in the foreground
python -u scheduled_crawler.py

news_crawler/worker.py Normal file
View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""
News Crawler Worker - Listens to Redis queue and processes crawl tasks
"""
import redis
import json
import os
import time
from crawler_service import crawl_all_feeds
REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
QUEUE_NAME = 'news_crawl_queue'
def get_redis_client():
"""Get Redis client"""
return redis.from_url(REDIS_URL, decode_responses=True)
def process_crawl_task(message):
"""Process a crawl task"""
try:
max_articles = message.get('max_articles', 10)
print(f"\n📨 Received task: {message.get('task')}")
print(f" Max articles per feed: {max_articles}")
print(f" Timestamp: {message.get('timestamp')}")
# Run the crawler
result = crawl_all_feeds(max_articles_per_feed=max_articles)
print(f"✅ Task completed: {result.get('total_articles_crawled')} articles crawled")
return True
except Exception as e:
print(f"❌ Task failed: {e}")
import traceback
traceback.print_exc()
return False
def main():
"""Main worker loop"""
print("="*70)
print("📰 News Crawler Worker Starting")
print("="*70)
print(f"Redis URL: {REDIS_URL}")
print(f"Queue: {QUEUE_NAME}")
print("Waiting for tasks...")
print("="*70)
r = get_redis_client()
while True:
try:
# Block and wait for messages (timeout 1 second)
result = r.brpop(QUEUE_NAME, timeout=1)
if result:
queue_name, message_json = result
message = json.loads(message_json)
process_crawl_task(message)
except KeyboardInterrupt:
print("\n\n👋 Worker stopped by user")
break
except Exception as e:
print(f"\n❌ Worker error: {e}")
time.sleep(5) # Wait before retrying
if __name__ == '__main__':
main()
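A small sketch of the queue contract the worker relies on, runnable against any Redis instance (the queue name and payload mirror the ones above; the test values are illustrative):

import json
import redis

r = redis.from_url('redis://redis:6379', decode_responses=True)

# Producer side: what admin_routes.py does when a crawl is requested.
r.lpush('news_crawl_queue', json.dumps({'task': 'crawl_news', 'max_articles': 5}))

# Consumer side: what the worker loop does. brpop returns a
# (queue_name, payload) tuple, or None if the timeout expires.
item = r.brpop('news_crawl_queue', timeout=1)
if item:
    queue_name, payload = item
    print(queue_name, json.loads(payload))  # news_crawl_queue {'task': 'crawl_news', 'max_articles': 5}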

View File

@@ -247,6 +247,70 @@
</td>
</tr>
{% if transport_disruptions and transport_disruptions|length > 0 %}
<!-- Divider -->
<tr>
<td style="padding: 0 40px;">
<div style="height: 2px; background-color: #e0e0e0;"></div>
</td>
</tr>
<!-- Transport Disruptions Section -->
<tr>
<td style="padding: 30px 40px;">
<h2 style="margin: 0 0 20px 0; font-size: 22px; font-weight: 700; color: #1a1a1a;">
🚆 S-Bahn Disruptions Today
</h2>
<p style="margin: 0 0 20px 0; font-size: 14px; color: #666666;">
Current service disruptions affecting Munich S-Bahn:
</p>
{% for disruption in transport_disruptions %}
<!-- Disruption Card -->
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="margin-bottom: 15px; background-color: #fff8f0; border-left: 4px solid #ff9800; border-radius: 4px;">
<tr>
<td style="padding: 15px 20px;">
<!-- Severity and Lines -->
<p style="margin: 0 0 8px 0; font-size: 13px; color: #666666;">
{{ disruption.severity_icon }} <strong style="color: #000000;">{{ disruption.lines_str }}</strong>
</p>
<!-- Title -->
<p style="margin: 0 0 8px 0; font-size: 15px; font-weight: 700; color: #1a1a1a; line-height: 1.4;">
{{ disruption.title }}
</p>
<!-- Description -->
{% if disruption.description %}
<p style="margin: 0 0 8px 0; font-size: 14px; color: #333333; line-height: 1.5;">
{{ disruption.description }}
</p>
{% endif %}
<!-- Time -->
{% if disruption.start_time_str or disruption.end_time_str %}
<p style="margin: 0; font-size: 13px; color: #666666;">
{% if disruption.start_time_str %}
From {{ disruption.start_time_str }}
{% endif %}
{% if disruption.end_time_str %}
until {{ disruption.end_time_str }}
{% endif %}
</p>
{% endif %}
</td>
</tr>
</table>
{% endfor %}
<p style="margin: 15px 0 0 0; font-size: 12px; color: #999999; font-style: italic;">
💡 Plan your commute accordingly. Check <a href="https://www.mvg.de" style="color: #667eea; text-decoration: none;">MVG.de</a> for real-time updates.
</p>
</td>
</tr>
{% endif %}
<!-- Footer -->
<tr>
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">

View File

@@ -77,6 +77,72 @@ client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
subscribers_collection = db['subscribers']
transport_alerts_collection = db['transport_alerts']
def get_today_transport_disruptions():
"""
Get active S-Bahn disruptions for today
Fetches from MongoDB transport_alerts collection
Returns:
list: Active disruptions with details
"""
try:
from datetime import datetime
# Get active disruptions
disruptions = list(transport_alerts_collection.find(
{'is_active': True},
{'_id': 0}
).sort('severity', -1).sort('updated_at', -1))
# Filter for disruptions happening today
today = datetime.utcnow().date()
today_disruptions = []
for d in disruptions:
# Check if disruption is active today
start_time = d.get('start_time')
end_time = d.get('end_time')
is_today = False
if start_time and end_time:
start_date = start_time.date() if hasattr(start_time, 'date') else today
end_date = end_time.date() if hasattr(end_time, 'date') else today
is_today = start_date <= today <= end_date
elif start_time:
start_date = start_time.date() if hasattr(start_time, 'date') else today
is_today = start_date <= today
else:
is_today = True # No time info, assume it's relevant
if is_today:
# Format times for display
if start_time:
d['start_time_str'] = start_time.strftime('%H:%M') if hasattr(start_time, 'strftime') else str(start_time)
if end_time:
d['end_time_str'] = end_time.strftime('%H:%M') if hasattr(end_time, 'strftime') else str(end_time)
# Format lines as comma-separated string
d['lines_str'] = ', '.join(d.get('lines', []))
# Get severity icon
severity_icons = {
'high': '🔴',
'medium': '🟡',
'low': '🟢'
}
d['severity_icon'] = severity_icons.get(d.get('severity', 'medium'), '🟡')
today_disruptions.append(d)
print(f"✓ Found {len(today_disruptions)} transport disruptions for today")
return today_disruptions
except Exception as e:
print(f"✗ Error fetching transport disruptions: {e}")
return []
def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
@@ -296,6 +362,9 @@ def render_newsletter_html(articles, subscriber_categories=None, tracking_enable
from weather_service import get_munich_weather
weather = get_munich_weather()
+# Get transport disruptions for today
+transport_disruptions = get_today_transport_disruptions()
# Prepare template data
now = datetime.now()
total_articles = sum(len(section['articles']) for section in category_sections)
@@ -308,7 +377,8 @@ def render_newsletter_html(articles, subscriber_categories=None, tracking_enable
'preferences_link': f'{Config.WEBSITE_URL}/preferences.html',
'website_link': Config.WEBSITE_URL,
'tracking_enabled': tracking_enabled,
-'weather': weather
+'weather': weather,
+'transport_disruptions': transport_disruptions
}
# Render HTML

View File

@@ -17,7 +17,7 @@ if command -v nvidia-smi &> /dev/null; then
echo "✓ NVIDIA Docker runtime is available" echo "✓ NVIDIA Docker runtime is available"
echo "" echo ""
echo "Starting services with GPU support..." echo "Starting services with GPU support..."
docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d docker compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
echo "" echo ""
echo "✓ Services started with GPU acceleration!" echo "✓ Services started with GPU acceleration!"
echo "" echo ""

View File

@@ -0,0 +1,30 @@
FROM python:3.11-slim
WORKDIR /app
# Install Chromium and dependencies for Selenium (works on both ARM64 and AMD64)
RUN apt-get update && apt-get install -y \
chromium \
chromium-driver \
&& rm -rf /var/lib/apt/lists/*
# Set environment variable for Chromium
ENV CHROME_BIN=/usr/bin/chromium
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy crawler files
COPY *.py /app/
COPY *.sh /app/
# Make start script executable
RUN chmod +x /app/start.sh
# Expose port for API
EXPOSE 5002
# Run both API and worker
CMD ["/app/start.sh"]

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""
Transport Crawler API - Flask service for triggering crawls
"""
from flask import Flask, jsonify
from crawler_service import run_crawler
import os
app = Flask(__name__)
@app.route('/health', methods=['GET'])
def health():
"""Health check endpoint"""
return jsonify({'status': 'ok'}), 200
@app.route('/api/transport/crawl', methods=['POST'])
def trigger_crawl():
"""Trigger a transport disruption crawl"""
try:
result = run_crawler()
return jsonify(result), 200
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/api/transport/disruptions', methods=['GET'])
def get_disruptions():
"""Get current active disruptions from MongoDB"""
try:
from pymongo import MongoClient
mongo_uri = os.getenv('MONGODB_URI', 'mongodb://admin:changeme@mongodb:27017/')
client = MongoClient(mongo_uri)
db = client['munich_news']
collection = db['transport_alerts']
# Get active disruptions
disruptions = list(collection.find(
{'is_active': True},
{'_id': 0}
))
# Convert datetime to ISO format
for d in disruptions:
if d.get('start_time'):
d['start_time'] = d['start_time'].isoformat()
if d.get('end_time'):
d['end_time'] = d['end_time'].isoformat()
if d.get('updated_at'):
d['updated_at'] = d['updated_at'].isoformat()
return jsonify({
'total': len(disruptions),
'disruptions': disruptions
}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
port = int(os.getenv('FLASK_PORT', 5002))
app.run(host='0.0.0.0', port=port, debug=False)

View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""
Transport Crawler Service - Main orchestrator
Fetches disruptions from multiple sources and displays them
"""
from datetime import datetime
from mvg_api_client import MVGClient
from db_api_client import DBClient
def print_header():
"""Print header"""
print("\n" + "="*70)
print("🚇 Munich Transport Disruption Crawler")
print("="*70)
print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*70)
def print_disruption_summary(all_disruptions):
"""Print summary of all disruptions"""
if not all_disruptions:
print("\n✅ No disruptions found - All lines operating normally!")
return
print(f"\n📊 SUMMARY: {len(all_disruptions)} Active Disruptions")
print("="*70)
# Group by type
by_type = {}
for d in all_disruptions:
dtype = d.get('type', 'unknown')
by_type[dtype] = by_type.get(dtype, 0) + 1
print("\nBy Type:")
for dtype, count in sorted(by_type.items()):
icon = {
'maintenance': '🔧',
'disruption': '⚠️',
'delay': '⏱️',
'info': ''
}.get(dtype, '')
print(f" {icon} {dtype.title()}: {count}")
# Group by source
by_source = {}
for d in all_disruptions:
source = d.get('source', 'unknown')
by_source[source] = by_source.get(source, 0) + 1
print("\nBy Source:")
for source, count in sorted(by_source.items()):
print(f"{source}: {count}")
def print_disruptions(disruptions, title):
"""Print disruptions in a formatted way"""
if not disruptions:
return
print(f"\n{title}")
print("-"*70)
for i, d in enumerate(disruptions, 1):
# Icon based on type
icon = {
'maintenance': '🔧',
'disruption': '⚠️',
'delay': '⏱️',
'info': ''
}.get(d.get('type', 'info'), '')
print(f"\n{icon} [{i}] {d.get('title', 'No title')}")
# Lines affected
lines = d.get('lines', [])
if lines:
line_str = ', '.join(lines)
print(f" 🚇 Lines: {line_str}")
# Time range
start = d.get('start_time')
end = d.get('end_time')
if start or end:
time_str = ""
if start:
time_str += f"From: {start.strftime('%d.%m %H:%M')}"
if end:
if time_str:
time_str += ""
time_str += f"Until: {end.strftime('%d.%m %H:%M')}"
print(f"{time_str}")
# Description
desc = d.get('description', '')
if desc:
# Truncate long descriptions
if len(desc) > 150:
desc = desc[:150] + "..."
print(f" 📝 {desc}")
# Severity
severity = d.get('severity', 'medium')
severity_icon = {
'high': '🔴',
'medium': '🟡',
'low': '🟢'
}.get(severity, '')
print(f" {severity_icon} Severity: {severity}")
def save_to_mongodb(disruptions):
"""Save disruptions to MongoDB"""
try:
from pymongo import MongoClient
import os
mongo_uri = os.getenv('MONGODB_URI', 'mongodb://admin:changeme@mongodb:27017/')
client = MongoClient(mongo_uri)
db = client['munich_news']
collection = db['transport_alerts']
print("\n💾 Saving to MongoDB...")
# Mark all existing alerts as inactive
collection.update_many({}, {'$set': {'is_active': False}})
# Insert or update current disruptions
saved_count = 0
for d in disruptions:
# Use disruption ID as unique identifier
collection.update_one(
{'alert_id': d['id']},
{
'$set': {
'alert_id': d['id'],
'title': d['title'],
'description': d['description'],
'lines': d['lines'],
'type': d['type'],
'severity': d['severity'],
'start_time': d['start_time'],
'end_time': d['end_time'],
'source': d['source'],
'is_active': True,
'updated_at': datetime.utcnow()
}
},
upsert=True
)
saved_count += 1
print(f"✓ Saved {saved_count} disruptions to MongoDB")
return True
except Exception as e:
print(f"✗ MongoDB error: {e}")
return False
def run_crawler():
"""Main crawler function"""
print_header()
all_disruptions = []
# 1. Fetch MVG disruptions (U-Bahn, Tram, Bus)
print("\n📡 Fetching data from sources...")
print("-"*70)
mvg_client = MVGClient()
mvg_disruptions = mvg_client.get_disruptions()
all_disruptions.extend(mvg_disruptions)
# 2. Fetch S-Bahn disruptions
db_client = DBClient()
sbahn_disruptions = db_client.get_sbahn_disruptions()
all_disruptions.extend(sbahn_disruptions)
# 3. Print summary
print_disruption_summary(all_disruptions)
# 4. Print detailed disruptions
if mvg_disruptions:
print_disruptions(mvg_disruptions, "\n🚇 MVG DISRUPTIONS (U-Bahn, Tram, Bus)")
if sbahn_disruptions:
print_disruptions(sbahn_disruptions, "\n🚆 S-BAHN DISRUPTIONS")
# 5. Output JSON
print("\n" + "="*70)
print("📄 JSON OUTPUT")
print("="*70)
import json
output = {
'timestamp': datetime.now().isoformat(),
'total_disruptions': len(all_disruptions),
'mvg_disruptions': len(mvg_disruptions),
'sbahn_disruptions': len(sbahn_disruptions),
'disruptions': []
}
for d in all_disruptions:
output['disruptions'].append({
'id': d.get('id'),
'title': d.get('title'),
'description': d.get('description'),
'lines': d.get('lines', []),
'type': d.get('type'),
'severity': d.get('severity'),
'start_time': d.get('start_time').isoformat() if d.get('start_time') else None,
'end_time': d.get('end_time').isoformat() if d.get('end_time') else None,
'source': d.get('source')
})
print(json.dumps(output, indent=2, ensure_ascii=False))
# 6. Save to MongoDB
save_to_mongodb(all_disruptions)
# Footer
print("\n" + "="*70)
print("✓ Crawler finished")
print("="*70 + "\n")
return output
if __name__ == '__main__':
try:
disruptions = run_crawler()
except KeyboardInterrupt:
print("\n\n👋 Crawler stopped by user")
except Exception as e:
print(f"\n\n❌ Crawler error: {e}")
import traceback
traceback.print_exc()

View File

@@ -0,0 +1,789 @@
#!/usr/bin/env python3
"""
Deutsche Bahn API Client - Fetch S-Bahn disruptions using Selenium
"""
import requests
from datetime import datetime
import time
class DBClient:
"""Client for Deutsche Bahn (S-Bahn) disruptions"""
# DB S-Bahn München map page
MAP_URL = "https://karte.bahn.de/en/region/DB_SBahn_Muenchen"
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
})
def get_sbahn_disruptions(self):
"""
Fetch S-Bahn disruptions for Munich from DB Karte using Selenium
Returns:
list: Disruption data
"""
print("\n🔍 Fetching S-Bahn disruptions from DB Karte (using Selenium)...")
driver = None
try:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
# Setup Chrome options for Chromium
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('--window-size=1920,1080')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
# Set realistic user agent
chrome_options.add_argument('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# Use system Chromium if available (Docker container)
chrome_bin = os.getenv('CHROME_BIN', '/usr/bin/chromium')
chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
if os.path.exists(chrome_bin):
chrome_options.binary_location = chrome_bin
print(f" Using system Chromium: {chrome_bin}")
print(" Starting Chromium browser...")
# Try to use system chromedriver
try:
if os.path.exists(chromedriver_path):
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)
else:
driver = webdriver.Chrome(options=chrome_options)
except Exception as e:
print(f" ✗ Failed to start Chromium: {e}")
print(f" Falling back to webdriver-manager...")
try:
from webdriver_manager.chrome import ChromeDriverManager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
except Exception as e2:
print(f" ✗ webdriver-manager also failed: {e2}")
raise
print(f" Loading: {self.MAP_URL}")
driver.get(self.MAP_URL)
# Wait for page to load
print(" Waiting for page to load...")
# Wait for disruption boxes to appear
try:
print(" Waiting for disruption boxes...")
WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-cy='disruptionbox']"))
)
# Give extra time for all boxes to load
time.sleep(3)
print(" ✓ Disruption boxes should be loaded")
except Exception as e:
print(f" ⚠ Timeout waiting for disruption boxes: {e}")
time.sleep(5)
print(f" ✓ Page loaded (title: {driver.title[:50]}...)")
# Debug: Save screenshot and page source
try:
screenshot_path = "/tmp/db_karte_screenshot.png"
driver.save_screenshot(screenshot_path)
print(f" 📸 Screenshot saved to: {screenshot_path}")
except:
pass
# Debug: Print page structure
print(" Analyzing page structure...")
page_source = driver.page_source
# Save page source for inspection
try:
with open("/tmp/db_karte_source.html", "w", encoding="utf-8") as f:
f.write(page_source)
print(f" 📄 Page source saved to: /tmp/db_karte_source.html")
except:
pass
# Look for disruption markers/icons on the map
disruptions = self._find_and_click_disruptions(driver)
# If no disruptions found via clicking, parse the page source
if not disruptions:
print(" No clickable disruptions found, parsing page source...")
# Debug: Show what elements are on the page
from bs4 import BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')
# Count different element types
print(f" Page stats: {len(soup.find_all('div'))} divs, {len(soup.find_all('button'))} buttons")
# Look for any text mentioning disruptions
text = soup.get_text().lower()
if 'disruption' in text or 'störung' in text or 'incident' in text:
print(f" Page contains disruption-related text")
# Check for common map libraries
if 'leaflet' in page_source.lower():
print(f" Page uses Leaflet maps")
if 'mapbox' in page_source.lower():
print(f" Page uses Mapbox")
if 'google.maps' in page_source.lower():
print(f" Page uses Google Maps")
disruptions = self._parse_selenium_page(page_source, driver)
if disruptions:
print(f"✓ Found {len(disruptions)} S-Bahn disruptions")
else:
print(f" No S-Bahn disruptions found (all lines operating normally)")
return disruptions
except ImportError as e:
print(f" ✗ Selenium not available: {e}")
print(f" Install with: pip install selenium webdriver-manager")
return []
except Exception as e:
print(f" ✗ Error: {e}")
import traceback
traceback.print_exc()
return []
finally:
if driver:
driver.quit()
def _find_and_click_disruptions(self, driver):
"""Find disruption boxes in the sidebar"""
try:
from selenium.webdriver.common.by import By
disruptions = []
print(" Looking for disruption boxes...")
# Find all disruption boxes in the sidebar
disruption_boxes = driver.find_elements(By.CSS_SELECTOR, "div[data-cy='disruptionbox']")
if not disruption_boxes:
print(" No disruption boxes found")
return []
print(f" Found {len(disruption_boxes)} disruption boxes")
# First pass: collect all basic info without clicking
basic_info = []
for i, box in enumerate(disruption_boxes):
try:
# Extract disruption ID
disruption_id = box.get_attribute('id')
# Extract title
title_elem = box.find_element(By.CSS_SELECTOR, "span[data-cy='disruptionboxTitle']")
title = title_elem.text.strip()
# Extract subtitle (type)
subtitle_elem = box.find_element(By.CSS_SELECTOR, "span[data-cy='disruptionboxSubtitle']")
subtitle = subtitle_elem.text.strip()
# Extract affected lines
lines = []
badge_list = box.find_element(By.CSS_SELECTOR, "div[data-cy='disruptionBadgeList']")
badges = badge_list.find_elements(By.CSS_SELECTOR, "span[data-cy='disruptionBadge']")
for badge in badges:
line_text = badge.text.strip()
if line_text and line_text.startswith('S'):
lines.append(line_text)
# Determine severity from icon
severity = 'medium'
try:
icon = box.find_element(By.CSS_SELECTOR, "img[data-cy='disruptionboxIcon']")
icon_src = icon.get_attribute('src')
if 'red' in icon_src:
severity = 'high'
elif 'orange' in icon_src:
severity = 'medium'
elif 'yellow' in icon_src:
severity = 'low'
except:
pass
# Store basic info
basic_info.append({
'id': disruption_id or f"sbahn_{i}",
'title': title,
'subtitle': subtitle,
'lines': lines,
'severity': severity,
'index': i
})
print(f" ✓ [{i}] {title[:60]}... (Lines: {', '.join(lines)})")
except Exception as e:
print(f" ✗ Error extracting disruption {i}: {e}")
continue
# Second pass: click each one to get time details
print(f"\n Extracting time details for {len(basic_info)} disruptions...")
for info in basic_info:
print(f" Processing disruption {info['index']}...")
try:
# Make sure we're back at the list view
driver.execute_script("window.scrollTo(0, 0);")
time.sleep(0.5)
# Wait for boxes to be present again
try:
WebDriverWait(driver, 3).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-cy='disruptionbox']"))
)
except:
pass
# Refetch boxes each time
boxes = driver.find_elements(By.CSS_SELECTOR, "div[data-cy='disruptionbox']")
print(f" Found {len(boxes)} boxes after refetch")
if info['index'] >= len(boxes):
print(f" ⚠ Box {info['index']} not found (only {len(boxes)} boxes available)")
continue
# Get fresh reference to the box and button
box = boxes[info['index']]
button = box.find_element(By.TAG_NAME, "button")
# Click to open details
driver.execute_script("arguments[0].scrollIntoView(true);", button)
time.sleep(0.3)
driver.execute_script("arguments[0].click();", button) # Use JS click
time.sleep(1.5) # Wait for detail panel to fully open
# Extract time from page text
detail_text = driver.find_element(By.TAG_NAME, "body").text
# Debug: show a snippet of the detail text
if "From:" in detail_text and "To:" in detail_text:
snippet_start = detail_text.find("From:")
snippet_end = detail_text.find("To:", snippet_start) + 50
snippet = detail_text[snippet_start:snippet_end]
print(f" Time snippet: {snippet.replace(chr(10), ' ')}")
start_time, end_time = self._extract_time_range(detail_text)
# Go back to original page to reset the view
driver.get(self.MAP_URL)
time.sleep(3) # Wait for page to reload and boxes to appear
# Create disruption object
disruption_type = self._classify_type(info['title'] + ' ' + info['subtitle'])
disruption = {
'id': info['id'],
'title': info['title'],
'description': info['subtitle'],
'lines': info['lines'],
'type': disruption_type,
'start_time': start_time,
'end_time': end_time,
'severity': info['severity'],
'source': 'db_karte_sidebar',
'created_at': datetime.utcnow()
}
disruptions.append(disruption)
time_info = ""
if start_time:
time_info += f" From: {start_time.strftime('%d.%m %H:%M')}"
if end_time:
time_info += f" To: {end_time.strftime('%d.%m %H:%M')}"
if time_info:
print(f" ✓ [{info['index']}]{time_info}")
except Exception as e:
print(f" ⚠ Could not get time for disruption {info['index']}: {e}")
# Still add the disruption without time info
disruption = {
'id': info['id'],
'title': info['title'],
'description': info['subtitle'],
'lines': info['lines'],
'type': self._classify_type(info['title']),
'start_time': None,
'end_time': None,
'severity': info['severity'],
'source': 'db_karte_sidebar',
'created_at': datetime.utcnow()
}
disruptions.append(disruption)
return disruptions
except Exception as e:
print(f" ✗ Error finding disruption boxes: {e}")
return []
def _extract_disruption_details(self, driver):
"""Extract disruption details from popup/modal"""
try:
from selenium.webdriver.common.by import By
# Look for popup/modal/tooltip containers
popup_selectors = [
"div[class*='popup']",
"div[class*='modal']",
"div[class*='tooltip']",
"div[class*='detail']",
"div[class*='info']",
"[role='dialog']",
"[role='tooltip']",
]
popup = None
for selector in popup_selectors:
try:
elements = driver.find_elements(By.CSS_SELECTOR, selector)
for elem in elements:
if elem.is_displayed() and len(elem.text) > 20:
popup = elem
break
if popup:
break
except:
continue
if not popup:
# Try to get any recently appeared text
body = driver.find_element(By.TAG_NAME, "body")
popup_text = body.text
else:
popup_text = popup.text
# Check if it's S-Bahn related
if not self._contains_sbahn_reference(popup_text):
return None
# Extract title (usually first line or heading)
title = popup_text.split('\n')[0][:100] if '\n' in popup_text else popup_text[:100]
# Extract time information
start_time, end_time = self._extract_time_range(popup_text)
# Extract affected lines
lines = self._extract_lines_from_text(popup_text)
return {
'id': f"sbahn_detail_{hash(popup_text) % 10000}",
'title': title,
'description': popup_text[:500],
'lines': lines,
'type': self._classify_type(title),
'start_time': start_time,
'end_time': end_time,
'severity': self._determine_severity(popup_text),
'source': 'db_karte_detail',
'created_at': datetime.utcnow()
}
except Exception as e:
return None
def _extract_time_range(self, text):
"""Extract start and end time from text"""
import re
from datetime import datetime
start_time = None
end_time = None
# Look for the specific format with possible newlines
# Pattern: From:XX.YYYY-MM-DD, HH:MMTo:XX.YYYY-MM-DD, HH:MM
# Remove newlines first to make matching easier
text_clean = text.replace('\n', ' ').replace('\r', ' ')
pattern = r'From:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})\s*To:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})'
match = re.search(pattern, text_clean)
if match:
try:
start_date = match.group(1) # 2025-11-13
start_time_str = match.group(2) # 10:02
end_date = match.group(3) # 2025-11-13
end_time_str = match.group(4) # 14:30
start_time = datetime.strptime(f"{start_date} {start_time_str}", "%Y-%m-%d %H:%M")
end_time = datetime.strptime(f"{end_date} {end_time_str}", "%Y-%m-%d %H:%M")
except Exception as e:
print(f" ⚠ Error parsing time: {e}")
# Fallback: Try other German formats
if not start_time:
# Look for "ab DD.MM.YYYY HH:MM" or "bis DD.MM.YYYY HH:MM"
ab_pattern = r'ab\s+(\d{1,2}\.\d{1,2}\.\d{4})[,\s]+(\d{1,2}:\d{2})'
bis_pattern = r'bis\s+(\d{1,2}\.\d{1,2}\.\d{4})[,\s]+(\d{1,2}:\d{2})'
ab_match = re.search(ab_pattern, text, re.IGNORECASE)
if ab_match:
try:
start_time = datetime.strptime(f"{ab_match.group(1)} {ab_match.group(2)}", "%d.%m.%Y %H:%M")
except:
pass
bis_match = re.search(bis_pattern, text, re.IGNORECASE)
if bis_match:
try:
end_time = datetime.strptime(f"{bis_match.group(1)} {bis_match.group(2)}", "%d.%m.%Y %H:%M")
except:
pass
return start_time, end_time
def _determine_severity(self, text):
"""Determine severity based on keywords"""
text_lower = text.lower()
if any(word in text_lower for word in ['ausfall', 'gesperrt', 'eingestellt', 'komplett']):
return 'high'
elif any(word in text_lower for word in ['verspätung', 'verzögerung', 'teilweise']):
return 'medium'
else:
return 'low'
def _parse_selenium_page(self, page_source, driver):
"""Parse page loaded by Selenium"""
try:
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
print(" Analyzing rendered page...")
soup = BeautifulSoup(page_source, 'html.parser')
disruptions = []
# Method 1: Try to find disruption elements directly via Selenium
try:
# Look for common disruption indicators
selectors = [
"div[class*='disruption']",
"div[class*='stoerung']",
"div[class*='incident']",
"div[class*='message']",
"div[class*='alert']",
"[data-disruption]",
"[data-incident]"
]
for selector in selectors:
try:
elements = driver.find_elements(By.CSS_SELECTOR, selector)
if elements:
print(f" Found {len(elements)} elements with selector: {selector}")
for elem in elements:
text = elem.text.strip()
if len(text) > 20 and self._contains_sbahn_reference(text):
disruptions.append(self._create_disruption_from_text(text))
except:
continue
except Exception as e:
print(f" ✗ Selenium element search error: {e}")
# Method 2: Parse the page source with BeautifulSoup
if not disruptions:
print(" Trying BeautifulSoup parsing...")
disruptions = self._parse_map_page(page_source.encode(), page_source)
# Method 3: Check for any text mentioning S-Bahn lines with disruptions
if not disruptions:
print(" Checking page text for S-Bahn mentions...")
page_text = soup.get_text()
if self._contains_sbahn_reference(page_text):
# Extract paragraphs or sections mentioning S-Bahn
for elem in soup.find_all(['p', 'div', 'span']):
text = elem.get_text(strip=True)
if len(text) > 30 and self._contains_sbahn_reference(text):
lines = self._extract_lines_from_text(text)
if lines:
disruptions.append(self._create_disruption_from_text(text))
# Remove duplicates
seen = set()
unique = []
for d in disruptions:
key = d['title'][:50]
if key not in seen:
seen.add(key)
unique.append(d)
return unique
except Exception as e:
print(f" ✗ Parse error: {e}")
import traceback
traceback.print_exc()
return []
def _contains_sbahn_reference(self, text):
"""Check if text contains S-Bahn line references"""
import re
return bool(re.search(r'S[\s-]?[1-8]', text, re.IGNORECASE))
def _create_disruption_from_text(self, text):
"""Create disruption object from text"""
# Extract first sentence or first 100 chars as title
sentences = text.split('.')
title = sentences[0][:100] if sentences else text[:100]
return {
'id': f"sbahn_{hash(text) % 10000}",
'title': title,
'description': text[:500],
'lines': self._extract_lines_from_text(text),
'type': self._classify_type(title),
'start_time': None,
'end_time': None,
'severity': 'medium',
'source': 'db_karte_selenium',
'created_at': datetime.utcnow()
}
def _parse_map_page(self, html_content, html_text):
"""Parse DB Karte map page for S-Bahn disruptions"""
try:
from bs4 import BeautifulSoup
import re
import json
disruptions = []
# Method 1: Look for embedded JSON data in script tags
print(" Analyzing page for disruption data...")
# The map page likely has JSON data embedded in <script> tags
soup = BeautifulSoup(html_content, 'html.parser')
scripts = soup.find_all('script')
for script in scripts:
if script.string:
# Look for JSON data containing disruption/störung information
script_text = script.string
# Try to find JSON objects
json_pattern = r'\{[^{}]*(?:"disruption"|"störung"|"incident"|"message")[^{}]*\}'
matches = re.finditer(json_pattern, script_text, re.IGNORECASE)
for match in matches:
try:
data = json.loads(match.group())
# Process found JSON data
if self._is_disruption_data(data):
disruption = self._parse_disruption_json(data)
if disruption:
disruptions.append(disruption)
except json.JSONDecodeError:
continue
# Method 2: Look for API endpoint URLs in the page
api_pattern = r'https?://[^\s"\']+(?:api|disruption|stoerung)[^\s"\']+'
api_urls = re.findall(api_pattern, html_text, re.IGNORECASE)
if api_urls:
print(f" Found {len(api_urls)} potential API endpoints")
for api_url in set(api_urls[:3]): # Try first 3 unique URLs
try:
print(f" Trying API: {api_url[:60]}...")
api_response = self.session.get(api_url, timeout=10)
if api_response.status_code == 200:
api_data = api_response.json()
api_disruptions = self._parse_api_response(api_data)
disruptions.extend(api_disruptions)
except:
continue
# Method 3: Look for visible disruption messages on the page
if not disruptions:
print(" Checking for visible disruption messages...")
disruptions = self._scrape_visible_disruptions(soup)
# Remove duplicates based on title
seen_titles = set()
unique_disruptions = []
for d in disruptions:
if d['title'] not in seen_titles:
seen_titles.add(d['title'])
unique_disruptions.append(d)
return unique_disruptions
except Exception as e:
print(f" ✗ Parse error: {e}")
import traceback
traceback.print_exc()
return []
def _is_disruption_data(self, data):
"""Check if JSON data contains disruption information"""
if not isinstance(data, dict):
return False
disruption_keys = ['disruption', 'störung', 'incident', 'message', 'title', 'description']
return any(key in str(data).lower() for key in disruption_keys)
def _parse_disruption_json(self, data):
"""Parse disruption from JSON data"""
try:
title = data.get('title') or data.get('headline') or data.get('message', '')
if not title or len(title) < 5:
return None
return {
'id': data.get('id', f"json_{hash(title)}"),
'title': title,
'description': data.get('description') or data.get('text') or data.get('content', ''),
'lines': self._extract_lines_from_text(title),
'type': self._classify_type(title),
'start_time': None,
'end_time': None,
'severity': data.get('severity', 'medium'),
'source': 'db_karte_json',
'created_at': datetime.utcnow()
}
except:
return None
def _parse_api_response(self, data):
"""Parse API response for disruptions"""
disruptions = []
try:
# Handle different response formats
if isinstance(data, dict):
if 'disruptions' in data:
data = data['disruptions']
elif 'items' in data:
data = data['items']
elif 'data' in data:
data = data['data']
else:
data = [data]
if isinstance(data, list):
for item in data:
disruption = self._parse_disruption_json(item)
if disruption:
disruptions.append(disruption)
except:
pass
return disruptions
def _scrape_visible_disruptions(self, soup):
"""Scrape visible disruption messages from the page"""
disruptions = []
try:
# Look for common disruption container classes
selectors = [
'div[class*="disruption"]',
'div[class*="stoerung"]',
'div[class*="incident"]',
'div[class*="message"]',
'div[class*="alert"]',
'article[class*="disruption"]',
]
for selector in selectors:
elements = soup.select(selector)
for elem in elements:
text = elem.get_text(strip=True)
if len(text) > 20 and any(word in text.lower() for word in ['s-bahn', 's1', 's2', 's3', 's4', 's6', 's7', 's8']):
# Extract title (first line or heading)
title_elem = elem.find(['h1', 'h2', 'h3', 'h4', 'strong'])
title = title_elem.get_text(strip=True) if title_elem else text[:100]
disruptions.append({
'id': f"visible_{len(disruptions)}",
'title': title,
'description': text[:500],
'lines': self._extract_lines_from_text(text),
'type': self._classify_type(title),
'start_time': None,
'end_time': None,
'severity': 'medium',
'source': 'db_karte_page',
'created_at': datetime.utcnow()
})
except:
pass
return disruptions
def _extract_lines_from_text(self, text):
"""Extract S-Bahn line numbers from text"""
import re
# Match S1, S2, S 3, S-4, etc.
pattern = r'S[\s-]?[1-8]'
matches = re.findall(pattern, text, re.IGNORECASE)
# Normalize to format like "S1", "S2"
lines = [re.sub(r'[^\dS]', '', m.upper()) for m in matches]
return list(set(lines)) # Remove duplicates
def _classify_type(self, title):
"""Classify disruption type based on title"""
title_lower = title.lower()
if 'bauarbeit' in title_lower or 'wartung' in title_lower:
return 'maintenance'
elif 'ausfall' in title_lower or 'störung' in title_lower:
return 'disruption'
elif 'verspätung' in title_lower:
return 'delay'
else:
return 'info'
def test_db_client():
"""Test the DB client and print results"""
print("="*70)
print("🚆 Deutsche Bahn S-Bahn Client Test")
print("="*70)
client = DBClient()
disruptions = client.get_sbahn_disruptions()
if not disruptions:
print("\n⚠ No S-Bahn disruptions found (or not yet implemented)")
return
print(f"\n📊 Total S-Bahn Disruptions: {len(disruptions)}")
print("="*70)
for i, d in enumerate(disruptions, 1):
print(f"\n[{i}] {d['title']}")
print(f" Lines: {', '.join(d['lines'])}")
print(f" Type: {d['type']}")
print("\n" + "="*70)
if __name__ == '__main__':
test_db_client()
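For clarity, a quick check of the primary regex in _extract_time_range; the sample string is a made-up example of the "From: ... To: ..." text the DB Karte detail panel is expected to contain, so treat it as an assumption rather than captured output:

import re

pattern = (r'From:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})'
           r'\s*To:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})')

sample = "From:Th. 2025-11-13, 10:02 To:Th. 2025-11-13, 14:30"  # hypothetical panel text
m = re.search(pattern, sample)
print(m.groups())  # ('2025-11-13', '10:02', '2025-11-13', '14:30')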

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""
MVG Client - Placeholder for U-Bahn disruptions
"""
class MVGClient:
"""Client for MVG (Munich Transport) U-Bahn disruptions"""
def get_disruptions(self):
"""
Fetch U-Bahn disruptions
Returns:
list: Empty for now (U-Bahn scraping not implemented)
"""
print("\n🔍 MVG U-Bahn disruptions...")
print(" U-Bahn scraping not yet implemented")
return []
def test_mvg_client():
"""Test the MVG client"""
print("="*70)
print("🚇 MVG U-Bahn Client Test")
print("="*70)
client = MVGClient()
disruptions = client.get_disruptions()
print("\n⚠ U-Bahn scraping not yet implemented")
print("="*70)
if __name__ == '__main__':
test_mvg_client()

View File

@@ -0,0 +1,10 @@
requests==2.31.0
beautifulsoup4==4.12.2
feedparser==6.0.10
pymongo==4.6.1
python-dotenv==1.0.0
pytz==2023.3
selenium==4.15.2
webdriver-manager==4.0.1
flask==3.0.0
redis==5.0.1

View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Start both the API server and the worker
# Start the worker in the background
python -u worker.py &
# Start the API server in the foreground
python -u api_service.py

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""
Transport Crawler Worker - Listens to Redis queue and processes crawl tasks
"""
import redis
import json
import os
import time
from crawler_service import run_crawler
REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
QUEUE_NAME = 'transport_crawl_queue'
def get_redis_client():
"""Get Redis client"""
return redis.from_url(REDIS_URL, decode_responses=True)
def process_crawl_task(message):
"""Process a crawl task"""
try:
print(f"\n📨 Received task: {message.get('task')}")
print(f" Timestamp: {message.get('timestamp')}")
# Run the crawler
result = run_crawler()
print(f"✅ Task completed: {result.get('total_disruptions')} disruptions found")
return True
except Exception as e:
print(f"❌ Task failed: {e}")
import traceback
traceback.print_exc()
return False
def main():
"""Main worker loop"""
print("="*70)
print("🚇 Transport Crawler Worker Starting")
print("="*70)
print(f"Redis URL: {REDIS_URL}")
print(f"Queue: {QUEUE_NAME}")
print("Waiting for tasks...")
print("="*70)
r = get_redis_client()
while True:
try:
# Block and wait for messages (timeout 1 second)
result = r.brpop(QUEUE_NAME, timeout=1)
if result:
queue_name, message_json = result
message = json.loads(message_json)
process_crawl_task(message)
except KeyboardInterrupt:
print("\n\n👋 Worker stopped by user")
break
except Exception as e:
print(f"\n❌ Worker error: {e}")
time.sleep(5) # Wait before retrying
if __name__ == '__main__':
main()