update

2025-11-14 12:51:18 +01:00
parent 433a16ee0e
commit 869ca3a894
20 changed files with 1606 additions and 38 deletions
@@ -10,6 +10,7 @@ from routes.newsletter_routes import newsletter_bp
 from routes.tracking_routes import tracking_bp
 from routes.analytics_routes import analytics_bp
 from routes.admin_routes import admin_bp
+from routes.transport_routes import transport_bp

 # Initialize Flask app
 app = Flask(__name__)
@@ -27,6 +28,7 @@ app.register_blueprint(newsletter_bp)
 app.register_blueprint(tracking_bp)
 app.register_blueprint(analytics_bp)
 app.register_blueprint(admin_bp)
+app.register_blueprint(transport_bp)

 # Health check endpoint
@app.route('/health')
@@ -5,4 +5,5 @@ python-dotenv==1.0.0
 pymongo==4.6.1
 requests==2.31.0
 Jinja2==3.1.2
+redis==5.0.1

@@ -12,7 +12,8 @@ admin_bp = Blueprint('admin', __name__)
@admin_bp.route('/api/admin/trigger-crawl', methods=['POST'])
 def trigger_crawl():
    """
-    Manually trigger the news crawler
+    Manually trigger the news crawler asynchronously via Redis queue
+    Uses Redis message queue for non-blocking execution
    
    Request body (optional):
    {
@@ -20,6 +21,9 @@ def trigger_crawl():
    }
    """
    try:
+        import redis
+        import json
+        
        # Handle both JSON and empty body
        try:
            data = request.get_json(silent=True) or {}
@@ -35,41 +39,29 @@ def trigger_crawl():
                'error': 'max_articles must be an integer between 1 and 100'
            }), 400
        
-        # Execute crawler in crawler container using docker exec
-        try:
-            result = subprocess.run(
-                ['docker', 'exec', 'munich-news-crawler', 'python', 'crawler_service.py', str(max_articles)],
-                capture_output=True,
-                text=True,
-                timeout=300  # 5 minute timeout
-            )
+        # Get Redis client
+        redis_url = os.getenv('REDIS_URL', 'redis://redis:6379')
+        r = redis.from_url(redis_url, decode_responses=True)
        
-            # Check result
-            success = result.returncode == 0
-            
-            return jsonify({
-                'success': success,
-                'message': f'Crawler {"executed successfully" if success else "failed"}',
+        # Publish message to Redis queue
+        message = {
+            'task': 'crawl_news',
            'max_articles': max_articles,
-                'output': result.stdout[-1000:] if result.stdout else '',  # Last 1000 chars
-                'errors': result.stderr[-500:] if result.stderr else ''
-            }), 200 if success else 500
+            'timestamp': str(os.times())
+        }
+        r.lpush('news_crawl_queue', json.dumps(message))
        
-        except FileNotFoundError:
+        # Return immediately without waiting
        return jsonify({
-                'success': False,
-                'error': 'Docker command not found. Make sure Docker is installed and the socket is mounted.'
-            }), 500
+            'success': True,
+            'message': 'News crawl task queued',
+            'max_articles': max_articles
+        }), 202  # 202 Accepted
        
-    except subprocess.TimeoutExpired:
-        return jsonify({
-            'success': False,
-            'error': 'Crawler timed out after 5 minutes'
-        }), 500
    except Exception as e:
        return jsonify({
            'success': False,
-            'error': f'Failed to run crawler: {str(e)}'
+            'error': f'Failed to queue news crawl: {str(e)}'
        }), 500


@@ -0,0 +1,74 @@
+from flask import Blueprint, jsonify
+from database import db
+import redis
+import os
+import json
+
+transport_bp = Blueprint('transport', __name__)
+
+REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
+
+
+def get_redis_client():
+    """Get Redis client"""
+    return redis.from_url(REDIS_URL, decode_responses=True)
+
+
+@transport_bp.route('/api/transport/crawl', methods=['POST'])
+def trigger_transport_crawl():
+    """Trigger transport disruption crawl asynchronously via Redis queue"""
+    try:
+        r = get_redis_client()
+        
+        # Publish message to Redis queue
+        message = {
+            'task': 'crawl_transport',
+            'timestamp': str(os.times())
+        }
+        r.lpush('transport_crawl_queue', json.dumps(message))
+        
+        # Return immediately without waiting
+        return jsonify({
+            'status': 'success',
+            'message': 'Transport crawl task queued'
+        }), 202  # 202 Accepted
+        
+    except Exception as e:
+        return jsonify({
+            'status': 'error',
+            'message': 'Failed to queue transport crawl',
+            'details': str(e)
+        }), 500
+
+
+@transport_bp.route('/api/transport/disruptions', methods=['GET'])
+def get_transport_disruptions():
+    """Get current transport disruptions from MongoDB"""
+    try:
+        collection = db['transport_alerts']
+        
+        # Get active disruptions
+        disruptions = list(collection.find(
+            {'is_active': True},
+            {'_id': 0}
+        ).sort('updated_at', -1))
+        
+        # Convert datetime to ISO format
+        for d in disruptions:
+            if d.get('start_time'):
+                d['start_time'] = d['start_time'].isoformat()
+            if d.get('end_time'):
+                d['end_time'] = d['end_time'].isoformat()
+            if d.get('updated_at'):
+                d['updated_at'] = d['updated_at'].isoformat()
+        
+        return jsonify({
+            'total': len(disruptions),
+            'disruptions': disruptions
+        }), 200
+        
+    except Exception as e:
+        return jsonify({
+            'error': 'Failed to fetch disruptions from database',
+            'details': str(e)
+        }), 500
@@ -57,6 +57,20 @@ services:
    command: sh /setup-ollama-model.sh
    restart: on-failure

+  # Redis - Message queue for async tasks (Internal only - not exposed to host)
+  redis:
+    image: redis:7-alpine
+    container_name: munich-news-redis
+    restart: unless-stopped
+    # No ports exposed - only accessible within Docker network
+    networks:
+      - munich-news-network
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
  # MongoDB Database (Internal only - not exposed to host)
  mongodb:
    image: mongo:latest
@@ -90,8 +104,10 @@ services:
    depends_on:
      - mongodb
      - ollama
+      - redis
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
+      - REDIS_URL=redis://redis:6379
      - TZ=Europe/Berlin
    volumes:
      - ./backend/.env:/app/.env:ro
@@ -112,10 +128,12 @@ services:
    restart: unless-stopped
    depends_on:
      - mongodb
+      - redis
    ports:
      - "5001:5001"
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
+      - REDIS_URL=redis://redis:6379
      - FLASK_PORT=5001
      - TZ=Europe/Berlin
    volumes:
@@ -130,6 +148,32 @@ services:
      retries: 3
      start_period: 40s

+  # Transport Crawler - API service for MVG disruptions (Internal only - not exposed to host)
+  transport-crawler:
+    build:
+      context: ./transport_crawler
+      dockerfile: Dockerfile
+    container_name: munich-news-transport-crawler
+    restart: unless-stopped
+    depends_on:
+      - mongodb
+      - redis
+    # No ports exposed - only accessible within Docker network
+    environment:
+      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
+      - REDIS_URL=redis://redis:6379
+      - TZ=Europe/Berlin
+    volumes:
+      - ./backend/.env:/app/.env:ro
+    networks:
+      - munich-news-network
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+
  # Newsletter Sender - Runs at 7 AM Berlin time
  sender:
    build:
@@ -141,6 +185,7 @@ services:
      - mongodb
      - backend
      - crawler
+      - transport-crawler
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
      - TZ=Europe/Berlin
@@ -12,12 +12,12 @@ COPY backend/config.py /app/config.py
 # Copy crawler files (includes ollama_client.py)
 COPY news_crawler/ /app/

-# Make the scheduler executable
-RUN chmod +x scheduled_crawler.py
+# Make scripts executable
+RUN chmod +x scheduled_crawler.py start.sh

 # Set timezone to Berlin
 ENV TZ=Europe/Berlin
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

-# Run the scheduled crawler
-CMD ["python", "-u", "scheduled_crawler.py"]
+# Run both scheduler and worker
+CMD ["/app/start.sh"]
@@ -6,3 +6,4 @@ pymongo==4.6.1
 python-dotenv==1.0.0
 schedule==1.2.0
 pytz==2023.3
+redis==5.0.1
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Start both the scheduler and the worker
+
+# Start the worker in the background
+python -u worker.py &
+
+# Start the scheduler in the foreground
+python -u scheduled_crawler.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""
+News Crawler Worker - Listens to Redis queue and processes crawl tasks
+"""
+import redis
+import json
+import os
+import time
+from crawler_service import crawl_all_feeds
+
+REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
+QUEUE_NAME = 'news_crawl_queue'
+
+
+def get_redis_client():
+    """Get Redis client"""
+    return redis.from_url(REDIS_URL, decode_responses=True)
+
+
+def process_crawl_task(message):
+    """Process a crawl task"""
+    try:
+        max_articles = message.get('max_articles', 10)
+        print(f"\n📨 Received task: {message.get('task')}")
+        print(f"   Max articles per feed: {max_articles}")
+        print(f"   Timestamp: {message.get('timestamp')}")
+        
+        # Run the crawler
+        result = crawl_all_feeds(max_articles_per_feed=max_articles)
+        
+        print(f"✅ Task completed: {result.get('total_articles_crawled')} articles crawled")
+        return True
+        
+    except Exception as e:
+        print(f"❌ Task failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main worker loop"""
+    print("="*70)
+    print("📰 News Crawler Worker Starting")
+    print("="*70)
+    print(f"Redis URL: {REDIS_URL}")
+    print(f"Queue: {QUEUE_NAME}")
+    print("Waiting for tasks...")
+    print("="*70)
+    
+    r = get_redis_client()
+    
+    while True:
+        try:
+            # Block and wait for messages (timeout 1 second)
+            result = r.brpop(QUEUE_NAME, timeout=1)
+            
+            if result:
+                queue_name, message_json = result
+                message = json.loads(message_json)
+                process_crawl_task(message)
+            
+        except KeyboardInterrupt:
+            print("\n\n👋 Worker stopped by user")
+            break
+        except Exception as e:
+            print(f"\n❌ Worker error: {e}")
+            time.sleep(5)  # Wait before retrying
+
+
+if __name__ == '__main__':
+    main()
@@ -247,6 +247,70 @@
                        </td>
                    </tr>

+                    {% if transport_disruptions and transport_disruptions|length > 0 %}
+                    <!-- Divider -->
+                    <tr>
+                        <td style="padding: 0 40px;">
+                            <div style="height: 2px; background-color: #e0e0e0;"></div>
+                        </td>
+                    </tr>
+
+                    <!-- Transport Disruptions Section -->
+                    <tr>
+                        <td style="padding: 30px 40px;">
+                            <h2 style="margin: 0 0 20px 0; font-size: 22px; font-weight: 700; color: #1a1a1a;">
+                                🚆 S-Bahn Disruptions Today
+                            </h2>
+                            <p style="margin: 0 0 20px 0; font-size: 14px; color: #666666;">
+                                Current service disruptions affecting Munich S-Bahn:
+                            </p>
+                            
+                            {% for disruption in transport_disruptions %}
+                            <!-- Disruption Card -->
+                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="margin-bottom: 15px; background-color: #fff8f0; border-left: 4px solid #ff9800; border-radius: 4px;">
+                                <tr>
+                                    <td style="padding: 15px 20px;">
+                                        <!-- Severity and Lines -->
+                                        <p style="margin: 0 0 8px 0; font-size: 13px; color: #666666;">
+                                            {{ disruption.severity_icon }} <strong style="color: #000000;">{{ disruption.lines_str }}</strong>
+                                        </p>
+                                        
+                                        <!-- Title -->
+                                        <p style="margin: 0 0 8px 0; font-size: 15px; font-weight: 700; color: #1a1a1a; line-height: 1.4;">
+                                            {{ disruption.title }}
+                                        </p>
+                                        
+                                        <!-- Description -->
+                                        {% if disruption.description %}
+                                        <p style="margin: 0 0 8px 0; font-size: 14px; color: #333333; line-height: 1.5;">
+                                            {{ disruption.description }}
+                                        </p>
+                                        {% endif %}
+                                        
+                                        <!-- Time -->
+                                        {% if disruption.start_time_str or disruption.end_time_str %}
+                                        <p style="margin: 0; font-size: 13px; color: #666666;">
+                                            ⏰ 
+                                            {% if disruption.start_time_str %}
+                                            From {{ disruption.start_time_str }}
+                                            {% endif %}
+                                            {% if disruption.end_time_str %}
+                                            until {{ disruption.end_time_str }}
+                                            {% endif %}
+                                        </p>
+                                        {% endif %}
+                                    </td>
+                                </tr>
+                            </table>
+                            {% endfor %}
+                            
+                            <p style="margin: 15px 0 0 0; font-size: 12px; color: #999999; font-style: italic;">
+                                💡 Plan your commute accordingly. Check <a href="https://www.mvg.de" style="color: #667eea; text-decoration: none;">MVG.de</a> for real-time updates.
+                            </p>
+                        </td>
+                    </tr>
+                    {% endif %}
+
                    <!-- Footer -->
                    <tr>
                        <td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
@@ -77,6 +77,72 @@ client = MongoClient(Config.MONGODB_URI)
 db = client[Config.DB_NAME]
 articles_collection = db['articles']
 subscribers_collection = db['subscribers']
+transport_alerts_collection = db['transport_alerts']
+
+
+def get_today_transport_disruptions():
+    """
+    Get active S-Bahn disruptions for today
+    Fetches from MongoDB transport_alerts collection
+    
+    Returns:
+        list: Active disruptions with details
+    """
+    try:
+        from datetime import datetime
+        
+        # Get active disruptions
+        disruptions = list(transport_alerts_collection.find(
+            {'is_active': True},
+            {'_id': 0}
+        ).sort('severity', -1).sort('updated_at', -1))
+        
+        # Filter for disruptions happening today
+        today = datetime.utcnow().date()
+        today_disruptions = []
+        
+        for d in disruptions:
+            # Check if disruption is active today
+            start_time = d.get('start_time')
+            end_time = d.get('end_time')
+            
+            is_today = False
+            if start_time and end_time:
+                start_date = start_time.date() if hasattr(start_time, 'date') else today
+                end_date = end_time.date() if hasattr(end_time, 'date') else today
+                is_today = start_date <= today <= end_date
+            elif start_time:
+                start_date = start_time.date() if hasattr(start_time, 'date') else today
+                is_today = start_date <= today
+            else:
+                is_today = True  # No time info, assume it's relevant
+            
+            if is_today:
+                # Format times for display
+                if start_time:
+                    d['start_time_str'] = start_time.strftime('%H:%M') if hasattr(start_time, 'strftime') else str(start_time)
+                if end_time:
+                    d['end_time_str'] = end_time.strftime('%H:%M') if hasattr(end_time, 'strftime') else str(end_time)
+                
+                # Format lines as comma-separated string
+                d['lines_str'] = ', '.join(d.get('lines', []))
+                
+                # Get severity icon
+                severity_icons = {
+                    'high': '🔴',
+                    'medium': '🟡',
+                    'low': '🟢'
+                }
+                d['severity_icon'] = severity_icons.get(d.get('severity', 'medium'), '🟡')
+                
+                today_disruptions.append(d)
+        
+        print(f"✓ Found {len(today_disruptions)} transport disruptions for today")
+        return today_disruptions
+        
+    except Exception as e:
+        print(f"✗ Error fetching transport disruptions: {e}")
+        return []


 def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
@@ -296,6 +362,9 @@ def render_newsletter_html(articles, subscriber_categories=None, tracking_enable
    from weather_service import get_munich_weather
    weather = get_munich_weather()
    
+    # Get transport disruptions for today
+    transport_disruptions = get_today_transport_disruptions()
+    
    # Prepare template data
    now = datetime.now()
    total_articles = sum(len(section['articles']) for section in category_sections)
@@ -308,7 +377,8 @@ def render_newsletter_html(articles, subscriber_categories=None, tracking_enable
        'preferences_link': f'{Config.WEBSITE_URL}/preferences.html',
        'website_link': Config.WEBSITE_URL,
        'tracking_enabled': tracking_enabled,
-        'weather': weather
+        'weather': weather,
+        'transport_disruptions': transport_disruptions
    }
    
    # Render HTML
@@ -17,7 +17,7 @@ if command -v nvidia-smi &> /dev/null; then
        echo "✓ NVIDIA Docker runtime is available"
        echo ""
        echo "Starting services with GPU support..."
-        docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
+        docker compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
        echo ""
        echo "✓ Services started with GPU acceleration!"
        echo ""
@@ -0,0 +1,30 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install Chromium and dependencies for Selenium (works on both ARM64 and AMD64)
+RUN apt-get update && apt-get install -y \
+    chromium \
+    chromium-driver \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set environment variable for Chromium
+ENV CHROME_BIN=/usr/bin/chromium
+ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy crawler files
+COPY *.py /app/
+COPY *.sh /app/
+
+# Make start script executable
+RUN chmod +x /app/start.sh
+
+# Expose port for API
+EXPOSE 5002
+
+# Run both API and worker
+CMD ["/app/start.sh"]
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""
+Transport Crawler API - Flask service for triggering crawls
+"""
+from flask import Flask, jsonify
+from crawler_service import run_crawler
+import os
+
+app = Flask(__name__)
+
+@app.route('/health', methods=['GET'])
+def health():
+    """Health check endpoint"""
+    return jsonify({'status': 'ok'}), 200
+
+
+@app.route('/api/transport/crawl', methods=['POST'])
+def trigger_crawl():
+    """Trigger a transport disruption crawl"""
+    try:
+        result = run_crawler()
+        return jsonify(result), 200
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+@app.route('/api/transport/disruptions', methods=['GET'])
+def get_disruptions():
+    """Get current active disruptions from MongoDB"""
+    try:
+        from pymongo import MongoClient
+        
+        mongo_uri = os.getenv('MONGODB_URI', 'mongodb://admin:changeme@mongodb:27017/')
+        client = MongoClient(mongo_uri)
+        db = client['munich_news']
+        collection = db['transport_alerts']
+        
+        # Get active disruptions
+        disruptions = list(collection.find(
+            {'is_active': True},
+            {'_id': 0}
+        ))
+        
+        # Convert datetime to ISO format
+        for d in disruptions:
+            if d.get('start_time'):
+                d['start_time'] = d['start_time'].isoformat()
+            if d.get('end_time'):
+                d['end_time'] = d['end_time'].isoformat()
+            if d.get('updated_at'):
+                d['updated_at'] = d['updated_at'].isoformat()
+        
+        return jsonify({
+            'total': len(disruptions),
+            'disruptions': disruptions
+        }), 200
+        
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+if __name__ == '__main__':
+    port = int(os.getenv('FLASK_PORT', 5002))
+    app.run(host='0.0.0.0', port=port, debug=False)
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+Transport Crawler Service - Main orchestrator
+Fetches disruptions from multiple sources and displays them
+"""
+from datetime import datetime
+from mvg_api_client import MVGClient
+from db_api_client import DBClient
+
+def print_header():
+    """Print header"""
+    print("\n" + "="*70)
+    print("🚇 Munich Transport Disruption Crawler")
+    print("="*70)
+    print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print("="*70)
+
+def print_disruption_summary(all_disruptions):
+    """Print summary of all disruptions"""
+    if not all_disruptions:
+        print("\n✅ No disruptions found - All lines operating normally!")
+        return
+    
+    print(f"\n📊 SUMMARY: {len(all_disruptions)} Active Disruptions")
+    print("="*70)
+    
+    # Group by type
+    by_type = {}
+    for d in all_disruptions:
+        dtype = d.get('type', 'unknown')
+        by_type[dtype] = by_type.get(dtype, 0) + 1
+    
+    print("\nBy Type:")
+    for dtype, count in sorted(by_type.items()):
+        icon = {
+            'maintenance': '🔧',
+            'disruption': '⚠️',
+            'delay': '⏱️',
+            'info': 'ℹ️'
+        }.get(dtype, '❓')
+        print(f"  {icon} {dtype.title()}: {count}")
+    
+    # Group by source
+    by_source = {}
+    for d in all_disruptions:
+        source = d.get('source', 'unknown')
+        by_source[source] = by_source.get(source, 0) + 1
+    
+    print("\nBy Source:")
+    for source, count in sorted(by_source.items()):
+        print(f"  • {source}: {count}")
+
+def print_disruptions(disruptions, title):
+    """Print disruptions in a formatted way"""
+    if not disruptions:
+        return
+    
+    print(f"\n{title}")
+    print("-"*70)
+    
+    for i, d in enumerate(disruptions, 1):
+        # Icon based on type
+        icon = {
+            'maintenance': '🔧',
+            'disruption': '⚠️',
+            'delay': '⏱️',
+            'info': 'ℹ️'
+        }.get(d.get('type', 'info'), '❓')
+        
+        print(f"\n{icon} [{i}] {d.get('title', 'No title')}")
+        
+        # Lines affected
+        lines = d.get('lines', [])
+        if lines:
+            line_str = ', '.join(lines)
+            print(f"    🚇 Lines: {line_str}")
+        
+        # Time range
+        start = d.get('start_time')
+        end = d.get('end_time')
+        if start or end:
+            time_str = ""
+            if start:
+                time_str += f"From: {start.strftime('%d.%m %H:%M')}"
+            if end:
+                if time_str:
+                    time_str += " → "
+                time_str += f"Until: {end.strftime('%d.%m %H:%M')}"
+            print(f"    ⏰ {time_str}")
+        
+        # Description
+        desc = d.get('description', '')
+        if desc:
+            # Truncate long descriptions
+            if len(desc) > 150:
+                desc = desc[:150] + "..."
+            print(f"    📝 {desc}")
+        
+        # Severity
+        severity = d.get('severity', 'medium')
+        severity_icon = {
+            'high': '🔴',
+            'medium': '🟡',
+            'low': '🟢'
+        }.get(severity, '⚪')
+        print(f"    {severity_icon} Severity: {severity}")
+
+def save_to_mongodb(disruptions):
+    """Save disruptions to MongoDB"""
+    try:
+        from pymongo import MongoClient
+        import os
+        
+        mongo_uri = os.getenv('MONGODB_URI', 'mongodb://admin:changeme@mongodb:27017/')
+        client = MongoClient(mongo_uri)
+        db = client['munich_news']
+        collection = db['transport_alerts']
+        
+        print("\n💾 Saving to MongoDB...")
+        
+        # Mark all existing alerts as inactive
+        collection.update_many({}, {'$set': {'is_active': False}})
+        
+        # Insert or update current disruptions
+        saved_count = 0
+        for d in disruptions:
+            # Use disruption ID as unique identifier
+            collection.update_one(
+                {'alert_id': d['id']},
+                {
+                    '$set': {
+                        'alert_id': d['id'],
+                        'title': d['title'],
+                        'description': d['description'],
+                        'lines': d['lines'],
+                        'type': d['type'],
+                        'severity': d['severity'],
+                        'start_time': d['start_time'],
+                        'end_time': d['end_time'],
+                        'source': d['source'],
+                        'is_active': True,
+                        'updated_at': datetime.utcnow()
+                    }
+                },
+                upsert=True
+            )
+            saved_count += 1
+        
+        print(f"✓ Saved {saved_count} disruptions to MongoDB")
+        return True
+        
+    except Exception as e:
+        print(f"✗ MongoDB error: {e}")
+        return False
+
+
+def run_crawler():
+    """Main crawler function"""
+    print_header()
+    
+    all_disruptions = []
+    
+    # 1. Fetch MVG disruptions (U-Bahn, Tram, Bus)
+    print("\n📡 Fetching data from sources...")
+    print("-"*70)
+    
+    mvg_client = MVGClient()
+    mvg_disruptions = mvg_client.get_disruptions()
+    all_disruptions.extend(mvg_disruptions)
+    
+    # 2. Fetch S-Bahn disruptions
+    db_client = DBClient()
+    sbahn_disruptions = db_client.get_sbahn_disruptions()
+    all_disruptions.extend(sbahn_disruptions)
+    
+    # 3. Print summary
+    print_disruption_summary(all_disruptions)
+    
+    # 4. Print detailed disruptions
+    if mvg_disruptions:
+        print_disruptions(mvg_disruptions, "\n🚇 MVG DISRUPTIONS (U-Bahn, Tram, Bus)")
+    
+    if sbahn_disruptions:
+        print_disruptions(sbahn_disruptions, "\n🚆 S-BAHN DISRUPTIONS")
+    
+    # 5. Output JSON
+    print("\n" + "="*70)
+    print("📄 JSON OUTPUT")
+    print("="*70)
+    
+    import json
+    output = {
+        'timestamp': datetime.now().isoformat(),
+        'total_disruptions': len(all_disruptions),
+        'mvg_disruptions': len(mvg_disruptions),
+        'sbahn_disruptions': len(sbahn_disruptions),
+        'disruptions': []
+    }
+    
+    for d in all_disruptions:
+        output['disruptions'].append({
+            'id': d.get('id'),
+            'title': d.get('title'),
+            'description': d.get('description'),
+            'lines': d.get('lines', []),
+            'type': d.get('type'),
+            'severity': d.get('severity'),
+            'start_time': d.get('start_time').isoformat() if d.get('start_time') else None,
+            'end_time': d.get('end_time').isoformat() if d.get('end_time') else None,
+            'source': d.get('source')
+        })
+    
+    print(json.dumps(output, indent=2, ensure_ascii=False))
+    
+    # 6. Save to MongoDB
+    save_to_mongodb(all_disruptions)
+    
+    # Footer
+    print("\n" + "="*70)
+    print("✓ Crawler finished")
+    print("="*70 + "\n")
+    
+    return output
+
+if __name__ == '__main__':
+    try:
+        disruptions = run_crawler()
+    except KeyboardInterrupt:
+        print("\n\n👋 Crawler stopped by user")
+    except Exception as e:
+        print(f"\n\n❌ Crawler error: {e}")
+        import traceback
+        traceback.print_exc()
@@ -0,0 +1,789 @@
+#!/usr/bin/env python3
+"""
+Deutsche Bahn API Client - Fetch S-Bahn disruptions using Selenium
+"""
+import requests
+from datetime import datetime
+import time
+
+class DBClient:
+    """Client for Deutsche Bahn (S-Bahn) disruptions"""
+    
+    # DB S-Bahn München map page
+    MAP_URL = "https://karte.bahn.de/en/region/DB_SBahn_Muenchen"
+    
+    def __init__(self):
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
+        })
+    
+    def get_sbahn_disruptions(self):
+        """
+        Fetch S-Bahn disruptions for Munich from DB Karte using Selenium
+        
+        Returns:
+            list: Disruption data
+        """
+        print("\n🔍 Fetching S-Bahn disruptions from DB Karte (using Selenium)...")
+        
+        driver = None
+        try:
+            from selenium import webdriver
+            from selenium.webdriver.chrome.options import Options
+            from selenium.webdriver.chrome.service import Service
+            from selenium.webdriver.common.by import By
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            import os
+            
+            # Setup Chrome options for Chromium
+            chrome_options = Options()
+            chrome_options.add_argument('--headless')
+            chrome_options.add_argument('--no-sandbox')
+            chrome_options.add_argument('--disable-dev-shm-usage')
+            chrome_options.add_argument('--disable-blink-features=AutomationControlled')
+            chrome_options.add_argument('--window-size=1920,1080')
+            chrome_options.add_argument('--disable-gpu')
+            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+            chrome_options.add_experimental_option('useAutomationExtension', False)
+            
+            # Set realistic user agent
+            chrome_options.add_argument('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
+            
+            # Use system Chromium if available (Docker container)
+            chrome_bin = os.getenv('CHROME_BIN', '/usr/bin/chromium')
+            chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
+            
+            if os.path.exists(chrome_bin):
+                chrome_options.binary_location = chrome_bin
+                print(f"   Using system Chromium: {chrome_bin}")
+            
+            print("   Starting Chromium browser...")
+            
+            # Try to use system chromedriver
+            try:
+                if os.path.exists(chromedriver_path):
+                    service = Service(chromedriver_path)
+                    driver = webdriver.Chrome(service=service, options=chrome_options)
+                else:
+                    driver = webdriver.Chrome(options=chrome_options)
+            except Exception as e:
+                print(f"   ✗ Failed to start Chromium: {e}")
+                print(f"   ℹ️  Falling back to webdriver-manager...")
+                try:
+                    from webdriver_manager.chrome import ChromeDriverManager
+                    service = Service(ChromeDriverManager().install())
+                    driver = webdriver.Chrome(service=service, options=chrome_options)
+                except Exception as e2:
+                    print(f"   ✗ webdriver-manager also failed: {e2}")
+                    raise
+            
+            print(f"   Loading: {self.MAP_URL}")
+            driver.get(self.MAP_URL)
+            
+            # Wait for page to load
+            print("   Waiting for page to load...")
+            
+            # Wait for disruption boxes to appear
+            try:
+                print("   Waiting for disruption boxes...")
+                WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-cy='disruptionbox']"))
+                )
+                # Give extra time for all boxes to load
+                time.sleep(3)
+                print("   ✓ Disruption boxes should be loaded")
+            except Exception as e:
+                print(f"   ⚠ Timeout waiting for disruption boxes: {e}")
+                time.sleep(5)
+            
+            print(f"   ✓ Page loaded (title: {driver.title[:50]}...)")
+            
+            # Debug: Save screenshot and page source
+            try:
+                screenshot_path = "/tmp/db_karte_screenshot.png"
+                driver.save_screenshot(screenshot_path)
+                print(f"   📸 Screenshot saved to: {screenshot_path}")
+            except:
+                pass
+            
+            # Debug: Print page structure
+            print("   Analyzing page structure...")
+            page_source = driver.page_source
+            
+            # Save page source for inspection
+            try:
+                with open("/tmp/db_karte_source.html", "w", encoding="utf-8") as f:
+                    f.write(page_source)
+                print(f"   📄 Page source saved to: /tmp/db_karte_source.html")
+            except:
+                pass
+            
+            # Look for disruption markers/icons on the map
+            disruptions = self._find_and_click_disruptions(driver)
+            
+            # If no disruptions found via clicking, parse the page source
+            if not disruptions:
+                print("   No clickable disruptions found, parsing page source...")
+                
+                # Debug: Show what elements are on the page
+                from bs4 import BeautifulSoup
+                soup = BeautifulSoup(page_source, 'html.parser')
+                
+                # Count different element types
+                print(f"   Page stats: {len(soup.find_all('div'))} divs, {len(soup.find_all('button'))} buttons")
+                
+                # Look for any text mentioning disruptions
+                text = soup.get_text().lower()
+                if 'disruption' in text or 'störung' in text or 'incident' in text:
+                    print(f"   ℹ️  Page contains disruption-related text")
+                
+                # Check for common map libraries
+                if 'leaflet' in page_source.lower():
+                    print(f"   ℹ️  Page uses Leaflet maps")
+                if 'mapbox' in page_source.lower():
+                    print(f"   ℹ️  Page uses Mapbox")
+                if 'google.maps' in page_source.lower():
+                    print(f"   ℹ️  Page uses Google Maps")
+                
+                disruptions = self._parse_selenium_page(page_source, driver)
+            
+            if disruptions:
+                print(f"✓ Found {len(disruptions)} S-Bahn disruptions")
+            else:
+                print(f"   ℹ️  No S-Bahn disruptions found (all lines operating normally)")
+            
+            return disruptions
+            
+        except ImportError as e:
+            print(f"   ✗ Selenium not available: {e}")
+            print(f"   ℹ️  Install with: pip install selenium webdriver-manager")
+            return []
+        except Exception as e:
+            print(f"   ✗ Error: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+        finally:
+            if driver:
+                driver.quit()
+    
+    def _find_and_click_disruptions(self, driver):
+        """Find disruption boxes in the sidebar"""
+        try:
+            from selenium.webdriver.common.by import By
+            
+            disruptions = []
+            
+            print("   Looking for disruption boxes...")
+            
+            # Find all disruption boxes in the sidebar
+            disruption_boxes = driver.find_elements(By.CSS_SELECTOR, "div[data-cy='disruptionbox']")
+            
+            if not disruption_boxes:
+                print("   No disruption boxes found")
+                return []
+            
+            print(f"   Found {len(disruption_boxes)} disruption boxes")
+            
+            # First pass: collect all basic info without clicking
+            basic_info = []
+            for i, box in enumerate(disruption_boxes):
+                try:
+                    
+                    # Extract disruption ID
+                    disruption_id = box.get_attribute('id')
+                    
+                    # Extract title
+                    title_elem = box.find_element(By.CSS_SELECTOR, "span[data-cy='disruptionboxTitle']")
+                    title = title_elem.text.strip()
+                    
+                    # Extract subtitle (type)
+                    subtitle_elem = box.find_element(By.CSS_SELECTOR, "span[data-cy='disruptionboxSubtitle']")
+                    subtitle = subtitle_elem.text.strip()
+                    
+                    # Extract affected lines
+                    lines = []
+                    badge_list = box.find_element(By.CSS_SELECTOR, "div[data-cy='disruptionBadgeList']")
+                    badges = badge_list.find_elements(By.CSS_SELECTOR, "span[data-cy='disruptionBadge']")
+                    for badge in badges:
+                        line_text = badge.text.strip()
+                        if line_text and line_text.startswith('S'):
+                            lines.append(line_text)
+                    
+                    # Determine severity from icon
+                    severity = 'medium'
+                    try:
+                        icon = box.find_element(By.CSS_SELECTOR, "img[data-cy='disruptionboxIcon']")
+                        icon_src = icon.get_attribute('src')
+                        if 'red' in icon_src:
+                            severity = 'high'
+                        elif 'orange' in icon_src:
+                            severity = 'medium'
+                        elif 'yellow' in icon_src:
+                            severity = 'low'
+                    except:
+                        pass
+                    
+                    # Store basic info
+                    basic_info.append({
+                        'id': disruption_id or f"sbahn_{i}",
+                        'title': title,
+                        'subtitle': subtitle,
+                        'lines': lines,
+                        'severity': severity,
+                        'index': i
+                    })
+                    
+                    print(f"   ✓ [{i}] {title[:60]}... (Lines: {', '.join(lines)})")
+                    
+                except Exception as e:
+                    print(f"   ✗ Error extracting disruption {i}: {e}")
+                    continue
+            
+            # Second pass: click each one to get time details
+            print(f"\n   Extracting time details for {len(basic_info)} disruptions...")
+            for info in basic_info:
+                print(f"   Processing disruption {info['index']}...")
+                try:
+                    # Make sure we're back at the list view
+                    driver.execute_script("window.scrollTo(0, 0);")
+                    time.sleep(0.5)
+                    
+                    # Wait for boxes to be present again
+                    try:
+                        WebDriverWait(driver, 3).until(
+                            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-cy='disruptionbox']"))
+                        )
+                    except:
+                        pass
+                    
+                    # Refetch boxes each time
+                    boxes = driver.find_elements(By.CSS_SELECTOR, "div[data-cy='disruptionbox']")
+                    print(f"   Found {len(boxes)} boxes after refetch")
+                    
+                    if info['index'] >= len(boxes):
+                        print(f"   ⚠ Box {info['index']} not found (only {len(boxes)} boxes available)")
+                        continue
+                    
+                    # Get fresh reference to the box and button
+                    box = boxes[info['index']]
+                    button = box.find_element(By.TAG_NAME, "button")
+                    
+                    # Click to open details
+                    driver.execute_script("arguments[0].scrollIntoView(true);", button)
+                    time.sleep(0.3)
+                    driver.execute_script("arguments[0].click();", button)  # Use JS click
+                    time.sleep(1.5)  # Wait for detail panel to fully open
+                    
+                    # Extract time from page text
+                    detail_text = driver.find_element(By.TAG_NAME, "body").text
+                    
+                    # Debug: show a snippet of the detail text
+                    if "From:" in detail_text and "To:" in detail_text:
+                        snippet_start = detail_text.find("From:")
+                        snippet_end = detail_text.find("To:", snippet_start) + 50
+                        snippet = detail_text[snippet_start:snippet_end]
+                        print(f"   Time snippet: {snippet.replace(chr(10), ' ')}")
+                    
+                    start_time, end_time = self._extract_time_range(detail_text)
+                    
+                    # Go back to original page to reset the view
+                    driver.get(self.MAP_URL)
+                    time.sleep(3)  # Wait for page to reload and boxes to appear
+                    
+                    # Create disruption object
+                    disruption_type = self._classify_type(info['title'] + ' ' + info['subtitle'])
+                    
+                    disruption = {
+                        'id': info['id'],
+                        'title': info['title'],
+                        'description': info['subtitle'],
+                        'lines': info['lines'],
+                        'type': disruption_type,
+                        'start_time': start_time,
+                        'end_time': end_time,
+                        'severity': info['severity'],
+                        'source': 'db_karte_sidebar',
+                        'created_at': datetime.utcnow()
+                    }
+                    
+                    disruptions.append(disruption)
+                    
+                    time_info = ""
+                    if start_time:
+                        time_info += f" From: {start_time.strftime('%d.%m %H:%M')}"
+                    if end_time:
+                        time_info += f" To: {end_time.strftime('%d.%m %H:%M')}"
+                    
+                    if time_info:
+                        print(f"   ✓ [{info['index']}]{time_info}")
+                    
+                except Exception as e:
+                    print(f"   ⚠ Could not get time for disruption {info['index']}: {e}")
+                    # Still add the disruption without time info
+                    disruption = {
+                        'id': info['id'],
+                        'title': info['title'],
+                        'description': info['subtitle'],
+                        'lines': info['lines'],
+                        'type': self._classify_type(info['title']),
+                        'start_time': None,
+                        'end_time': None,
+                        'severity': info['severity'],
+                        'source': 'db_karte_sidebar',
+                        'created_at': datetime.utcnow()
+                    }
+                    disruptions.append(disruption)
+            
+            return disruptions
+            
+        except Exception as e:
+            print(f"   ✗ Error finding disruption boxes: {e}")
+            return []
+    
+    def _extract_disruption_details(self, driver):
+        """Extract disruption details from popup/modal"""
+        try:
+            from selenium.webdriver.common.by import By
+            
+            # Look for popup/modal/tooltip containers
+            popup_selectors = [
+                "div[class*='popup']",
+                "div[class*='modal']",
+                "div[class*='tooltip']",
+                "div[class*='detail']",
+                "div[class*='info']",
+                "[role='dialog']",
+                "[role='tooltip']",
+            ]
+            
+            popup = None
+            for selector in popup_selectors:
+                try:
+                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
+                    for elem in elements:
+                        if elem.is_displayed() and len(elem.text) > 20:
+                            popup = elem
+                            break
+                    if popup:
+                        break
+                except:
+                    continue
+            
+            if not popup:
+                # Try to get any recently appeared text
+                body = driver.find_element(By.TAG_NAME, "body")
+                popup_text = body.text
+            else:
+                popup_text = popup.text
+            
+            # Check if it's S-Bahn related
+            if not self._contains_sbahn_reference(popup_text):
+                return None
+            
+            # Extract title (usually first line or heading)
+            title = popup_text.split('\n')[0][:100] if '\n' in popup_text else popup_text[:100]
+            
+            # Extract time information
+            start_time, end_time = self._extract_time_range(popup_text)
+            
+            # Extract affected lines
+            lines = self._extract_lines_from_text(popup_text)
+            
+            return {
+                'id': f"sbahn_detail_{hash(popup_text) % 10000}",
+                'title': title,
+                'description': popup_text[:500],
+                'lines': lines,
+                'type': self._classify_type(title),
+                'start_time': start_time,
+                'end_time': end_time,
+                'severity': self._determine_severity(popup_text),
+                'source': 'db_karte_detail',
+                'created_at': datetime.utcnow()
+            }
+            
+        except Exception as e:
+            return None
+    
+    def _extract_time_range(self, text):
+        """Extract start and end time from text"""
+        import re
+        from datetime import datetime
+        
+        start_time = None
+        end_time = None
+        
+        # Look for the specific format with possible newlines
+        # Pattern: From:XX.YYYY-MM-DD, HH:MMTo:XX.YYYY-MM-DD, HH:MM
+        # Remove newlines first to make matching easier
+        text_clean = text.replace('\n', ' ').replace('\r', ' ')
+        
+        pattern = r'From:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})\s*To:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})'
+        match = re.search(pattern, text_clean)
+        
+        if match:
+            try:
+                start_date = match.group(1)  # 2025-11-13
+                start_time_str = match.group(2)  # 10:02
+                end_date = match.group(3)  # 2025-11-13
+                end_time_str = match.group(4)  # 14:30
+                
+                start_time = datetime.strptime(f"{start_date} {start_time_str}", "%Y-%m-%d %H:%M")
+                end_time = datetime.strptime(f"{end_date} {end_time_str}", "%Y-%m-%d %H:%M")
+            except Exception as e:
+                print(f"   ⚠ Error parsing time: {e}")
+        
+        # Fallback: Try other German formats
+        if not start_time:
+            # Look for "ab DD.MM.YYYY HH:MM" or "bis DD.MM.YYYY HH:MM"
+            ab_pattern = r'ab\s+(\d{1,2}\.\d{1,2}\.\d{4})[,\s]+(\d{1,2}:\d{2})'
+            bis_pattern = r'bis\s+(\d{1,2}\.\d{1,2}\.\d{4})[,\s]+(\d{1,2}:\d{2})'
+            
+            ab_match = re.search(ab_pattern, text, re.IGNORECASE)
+            if ab_match:
+                try:
+                    start_time = datetime.strptime(f"{ab_match.group(1)} {ab_match.group(2)}", "%d.%m.%Y %H:%M")
+                except:
+                    pass
+            
+            bis_match = re.search(bis_pattern, text, re.IGNORECASE)
+            if bis_match:
+                try:
+                    end_time = datetime.strptime(f"{bis_match.group(1)} {bis_match.group(2)}", "%d.%m.%Y %H:%M")
+                except:
+                    pass
+        
+        return start_time, end_time
+    
+    def _determine_severity(self, text):
+        """Determine severity based on keywords"""
+        text_lower = text.lower()
+        
+        if any(word in text_lower for word in ['ausfall', 'gesperrt', 'eingestellt', 'komplett']):
+            return 'high'
+        elif any(word in text_lower for word in ['verspätung', 'verzögerung', 'teilweise']):
+            return 'medium'
+        else:
+            return 'low'
+    
+    def _parse_selenium_page(self, page_source, driver):
+        """Parse page loaded by Selenium"""
+        try:
+            from bs4 import BeautifulSoup
+            from selenium.webdriver.common.by import By
+            
+            print("   Analyzing rendered page...")
+            soup = BeautifulSoup(page_source, 'html.parser')
+            disruptions = []
+            
+            # Method 1: Try to find disruption elements directly via Selenium
+            try:
+                # Look for common disruption indicators
+                selectors = [
+                    "div[class*='disruption']",
+                    "div[class*='stoerung']",
+                    "div[class*='incident']",
+                    "div[class*='message']",
+                    "div[class*='alert']",
+                    "[data-disruption]",
+                    "[data-incident]"
+                ]
+                
+                for selector in selectors:
+                    try:
+                        elements = driver.find_elements(By.CSS_SELECTOR, selector)
+                        if elements:
+                            print(f"   Found {len(elements)} elements with selector: {selector}")
+                            for elem in elements:
+                                text = elem.text.strip()
+                                if len(text) > 20 and self._contains_sbahn_reference(text):
+                                    disruptions.append(self._create_disruption_from_text(text))
+                    except:
+                        continue
+            except Exception as e:
+                print(f"   ✗ Selenium element search error: {e}")
+            
+            # Method 2: Parse the page source with BeautifulSoup
+            if not disruptions:
+                print("   Trying BeautifulSoup parsing...")
+                disruptions = self._parse_map_page(page_source.encode(), page_source)
+            
+            # Method 3: Check for any text mentioning S-Bahn lines with disruptions
+            if not disruptions:
+                print("   Checking page text for S-Bahn mentions...")
+                page_text = soup.get_text()
+                if self._contains_sbahn_reference(page_text):
+                    # Extract paragraphs or sections mentioning S-Bahn
+                    for elem in soup.find_all(['p', 'div', 'span']):
+                        text = elem.get_text(strip=True)
+                        if len(text) > 30 and self._contains_sbahn_reference(text):
+                            lines = self._extract_lines_from_text(text)
+                            if lines:
+                                disruptions.append(self._create_disruption_from_text(text))
+            
+            # Remove duplicates
+            seen = set()
+            unique = []
+            for d in disruptions:
+                key = d['title'][:50]
+                if key not in seen:
+                    seen.add(key)
+                    unique.append(d)
+            
+            return unique
+            
+        except Exception as e:
+            print(f"   ✗ Parse error: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+    
+    def _contains_sbahn_reference(self, text):
+        """Check if text contains S-Bahn line references"""
+        import re
+        return bool(re.search(r'S[\s-]?[1-8]', text, re.IGNORECASE))
+    
+    def _create_disruption_from_text(self, text):
+        """Create disruption object from text"""
+        # Extract first sentence or first 100 chars as title
+        sentences = text.split('.')
+        title = sentences[0][:100] if sentences else text[:100]
+        
+        return {
+            'id': f"sbahn_{hash(text) % 10000}",
+            'title': title,
+            'description': text[:500],
+            'lines': self._extract_lines_from_text(text),
+            'type': self._classify_type(title),
+            'start_time': None,
+            'end_time': None,
+            'severity': 'medium',
+            'source': 'db_karte_selenium',
+            'created_at': datetime.utcnow()
+        }
+    
+    def _parse_map_page(self, html_content, html_text):
+        """Parse DB Karte map page for S-Bahn disruptions"""
+        try:
+            from bs4 import BeautifulSoup
+            import re
+            import json
+            
+            disruptions = []
+            
+            # Method 1: Look for embedded JSON data in script tags
+            print("   Analyzing page for disruption data...")
+            
+            # The map page likely has JSON data embedded in <script> tags
+            soup = BeautifulSoup(html_content, 'html.parser')
+            scripts = soup.find_all('script')
+            
+            for script in scripts:
+                if script.string:
+                    # Look for JSON data containing disruption/störung information
+                    script_text = script.string
+                    
+                    # Try to find JSON objects
+                    json_pattern = r'\{[^{}]*(?:"disruption"|"störung"|"incident"|"message")[^{}]*\}'
+                    matches = re.finditer(json_pattern, script_text, re.IGNORECASE)
+                    
+                    for match in matches:
+                        try:
+                            data = json.loads(match.group())
+                            # Process found JSON data
+                            if self._is_disruption_data(data):
+                                disruption = self._parse_disruption_json(data)
+                                if disruption:
+                                    disruptions.append(disruption)
+                        except json.JSONDecodeError:
+                            continue
+            
+            # Method 2: Look for API endpoint URLs in the page
+            api_pattern = r'https?://[^\s"\']+(?:api|disruption|stoerung)[^\s"\']+'
+            api_urls = re.findall(api_pattern, html_text, re.IGNORECASE)
+            
+            if api_urls:
+                print(f"   Found {len(api_urls)} potential API endpoints")
+                for api_url in set(api_urls[:3]):  # Try first 3 unique URLs
+                    try:
+                        print(f"   Trying API: {api_url[:60]}...")
+                        api_response = self.session.get(api_url, timeout=10)
+                        if api_response.status_code == 200:
+                            api_data = api_response.json()
+                            api_disruptions = self._parse_api_response(api_data)
+                            disruptions.extend(api_disruptions)
+                    except:
+                        continue
+            
+            # Method 3: Look for visible disruption messages on the page
+            if not disruptions:
+                print("   Checking for visible disruption messages...")
+                disruptions = self._scrape_visible_disruptions(soup)
+            
+            # Remove duplicates based on title
+            seen_titles = set()
+            unique_disruptions = []
+            for d in disruptions:
+                if d['title'] not in seen_titles:
+                    seen_titles.add(d['title'])
+                    unique_disruptions.append(d)
+            
+            return unique_disruptions
+            
+        except Exception as e:
+            print(f"   ✗ Parse error: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+    
+    def _is_disruption_data(self, data):
+        """Check if JSON data contains disruption information"""
+        if not isinstance(data, dict):
+            return False
+        
+        disruption_keys = ['disruption', 'störung', 'incident', 'message', 'title', 'description']
+        return any(key in str(data).lower() for key in disruption_keys)
+    
+    def _parse_disruption_json(self, data):
+        """Parse disruption from JSON data"""
+        try:
+            title = data.get('title') or data.get('headline') or data.get('message', '')
+            if not title or len(title) < 5:
+                return None
+            
+            return {
+                'id': data.get('id', f"json_{hash(title)}"),
+                'title': title,
+                'description': data.get('description') or data.get('text') or data.get('content', ''),
+                'lines': self._extract_lines_from_text(title),
+                'type': self._classify_type(title),
+                'start_time': None,
+                'end_time': None,
+                'severity': data.get('severity', 'medium'),
+                'source': 'db_karte_json',
+                'created_at': datetime.utcnow()
+            }
+        except:
+            return None
+    
+    def _parse_api_response(self, data):
+        """Parse API response for disruptions"""
+        disruptions = []
+        
+        try:
+            # Handle different response formats
+            if isinstance(data, dict):
+                if 'disruptions' in data:
+                    data = data['disruptions']
+                elif 'items' in data:
+                    data = data['items']
+                elif 'data' in data:
+                    data = data['data']
+                else:
+                    data = [data]
+            
+            if isinstance(data, list):
+                for item in data:
+                    disruption = self._parse_disruption_json(item)
+                    if disruption:
+                        disruptions.append(disruption)
+        except:
+            pass
+        
+        return disruptions
+    
+    def _scrape_visible_disruptions(self, soup):
+        """Scrape visible disruption messages from the page"""
+        disruptions = []
+        
+        try:
+            # Look for common disruption container classes
+            selectors = [
+                'div[class*="disruption"]',
+                'div[class*="stoerung"]',
+                'div[class*="incident"]',
+                'div[class*="message"]',
+                'div[class*="alert"]',
+                'article[class*="disruption"]',
+            ]
+            
+            for selector in selectors:
+                elements = soup.select(selector)
+                for elem in elements:
+                    text = elem.get_text(strip=True)
+                    if len(text) > 20 and any(word in text.lower() for word in ['s-bahn', 's1', 's2', 's3', 's4', 's6', 's7', 's8']):
+                        # Extract title (first line or heading)
+                        title_elem = elem.find(['h1', 'h2', 'h3', 'h4', 'strong'])
+                        title = title_elem.get_text(strip=True) if title_elem else text[:100]
+                        
+                        disruptions.append({
+                            'id': f"visible_{len(disruptions)}",
+                            'title': title,
+                            'description': text[:500],
+                            'lines': self._extract_lines_from_text(text),
+                            'type': self._classify_type(title),
+                            'start_time': None,
+                            'end_time': None,
+                            'severity': 'medium',
+                            'source': 'db_karte_page',
+                            'created_at': datetime.utcnow()
+                        })
+        except:
+            pass
+        
+        return disruptions
+    
+    def _extract_lines_from_text(self, text):
+        """Extract S-Bahn line numbers from text"""
+        import re
+        # Match S1, S2, S 3, S-4, etc.
+        pattern = r'S[\s-]?[1-8]'
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        # Normalize to format like "S1", "S2"
+        lines = [re.sub(r'[^\dS]', '', m.upper()) for m in matches]
+        return list(set(lines))  # Remove duplicates
+    
+    def _classify_type(self, title):
+        """Classify disruption type based on title"""
+        title_lower = title.lower()
+        if 'bauarbeit' in title_lower or 'wartung' in title_lower:
+            return 'maintenance'
+        elif 'ausfall' in title_lower or 'störung' in title_lower:
+            return 'disruption'
+        elif 'verspätung' in title_lower:
+            return 'delay'
+        else:
+            return 'info'
+
+
+def test_db_client():
+    """Test the DB client and print results"""
+    print("="*70)
+    print("🚆 Deutsche Bahn S-Bahn Client Test")
+    print("="*70)
+    
+    client = DBClient()
+    disruptions = client.get_sbahn_disruptions()
+    
+    if not disruptions:
+        print("\n⚠ No S-Bahn disruptions found (or not yet implemented)")
+        return
+    
+    print(f"\n📊 Total S-Bahn Disruptions: {len(disruptions)}")
+    print("="*70)
+    
+    for i, d in enumerate(disruptions, 1):
+        print(f"\n[{i}] {d['title']}")
+        print(f"    Lines: {', '.join(d['lines'])}")
+        print(f"    Type: {d['type']}")
+    
+    print("\n" + "="*70)
+
+
+if __name__ == '__main__':
+    test_db_client()
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+"""
+MVG Client - Placeholder for U-Bahn disruptions
+"""
+
+class MVGClient:
+    """Client for MVG (Munich Transport) U-Bahn disruptions"""
+    
+    def get_disruptions(self):
+        """
+        Fetch U-Bahn disruptions
+        
+        Returns:
+            list: Empty for now (U-Bahn scraping not implemented)
+        """
+        print("\n🔍 MVG U-Bahn disruptions...")
+        print("   ℹ️  U-Bahn scraping not yet implemented")
+        return []
+
+
+def test_mvg_client():
+    """Test the MVG client"""
+    print("="*70)
+    print("🚇 MVG U-Bahn Client Test")
+    print("="*70)
+    
+    client = MVGClient()
+    disruptions = client.get_disruptions()
+    
+    print("\n⚠ U-Bahn scraping not yet implemented")
+    print("="*70)
+
+
+if __name__ == '__main__':
+    test_mvg_client()
@@ -0,0 +1,10 @@
+requests==2.31.0
+beautifulsoup4==4.12.2
+feedparser==6.0.10
+pymongo==4.6.1
+python-dotenv==1.0.0
+pytz==2023.3
+selenium==4.15.2
+webdriver-manager==4.0.1
+flask==3.0.0
+redis==5.0.1
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Start both the API server and the worker
+
+# Start the worker in the background
+python -u worker.py &
+
+# Start the API server in the foreground
+python -u api_service.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""
+Transport Crawler Worker - Listens to Redis queue and processes crawl tasks
+"""
+import redis
+import json
+import os
+import time
+from crawler_service import run_crawler
+
+REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
+QUEUE_NAME = 'transport_crawl_queue'
+
+
+def get_redis_client():
+    """Get Redis client"""
+    return redis.from_url(REDIS_URL, decode_responses=True)
+
+
+def process_crawl_task(message):
+    """Process a crawl task"""
+    try:
+        print(f"\n📨 Received task: {message.get('task')}")
+        print(f"   Timestamp: {message.get('timestamp')}")
+        
+        # Run the crawler
+        result = run_crawler()
+        
+        print(f"✅ Task completed: {result.get('total_disruptions')} disruptions found")
+        return True
+        
+    except Exception as e:
+        print(f"❌ Task failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main worker loop"""
+    print("="*70)
+    print("🚇 Transport Crawler Worker Starting")
+    print("="*70)
+    print(f"Redis URL: {REDIS_URL}")
+    print(f"Queue: {QUEUE_NAME}")
+    print("Waiting for tasks...")
+    print("="*70)
+    
+    r = get_redis_client()
+    
+    while True:
+        try:
+            # Block and wait for messages (timeout 1 second)
+            result = r.brpop(QUEUE_NAME, timeout=1)
+            
+            if result:
+                queue_name, message_json = result
+                message = json.loads(message_json)
+                process_crawl_task(message)
+            
+        except KeyboardInterrupt:
+            print("\n\n👋 Worker stopped by user")
+            break
+        except Exception as e:
+            print(f"\n❌ Worker error: {e}")
+            time.sleep(5)  # Wait before retrying
+
+
+if __name__ == '__main__':
+    main()