Files
Munich-news/transport_crawler/crawler_service.py
2025-11-14 12:51:18 +01:00

234 lines
7.0 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Transport Crawler Service - Main orchestrator
Fetches disruptions from multiple sources and displays them
"""
from datetime import datetime
from mvg_api_client import MVGClient
from db_api_client import DBClient
def print_header():
    """Print the crawler banner together with the current local timestamp."""
    separator = "=" * 70
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + separator)
    print("🚇 Munich Transport Disruption Crawler")
    print(separator)
    print(f"Time: {started_at}")
    print(separator)
def print_disruption_summary(all_disruptions):
    """Print aggregate counts of the disruptions, grouped by type and by source.

    Args:
        all_disruptions: list of disruption dicts; only the 'type' and
            'source' keys are read (missing keys count as 'unknown').
    """
    if not all_disruptions:
        print("\n✅ No disruptions found - All lines operating normally!")
        return

    print(f"\n📊 SUMMARY: {len(all_disruptions)} Active Disruptions")
    print("=" * 70)

    # Icon shown next to each disruption type; unknown types get no icon.
    type_icons = {
        'maintenance': '🔧',
        'disruption': '⚠️',
        'delay': '⏱️',
        'info': ''
    }

    def _tally(key):
        # Count occurrences of each value stored under `key`.
        counts = {}
        for entry in all_disruptions:
            value = entry.get(key, 'unknown')
            counts[value] = counts.get(value, 0) + 1
        return counts

    print("\nBy Type:")
    for dtype, count in sorted(_tally('type').items()):
        print(f" {type_icons.get(dtype, '')} {dtype.title()}: {count}")

    print("\nBy Source:")
    for source, count in sorted(_tally('source').items()):
        print(f"{source}: {count}")
def print_disruptions(disruptions, title):
    """Print each disruption as a formatted, multi-line console block.

    Args:
        disruptions: list of disruption dicts; reads the keys 'type',
            'title', 'lines', 'start_time', 'end_time' (datetime or None),
            'description', and 'severity'.
        title: section heading printed before the list. Prints nothing when
            the list is empty.
    """
    if not disruptions:
        return

    print(f"\n{title}")
    print("-" * 70)

    # Lookup tables hoisted out of the loop; unknown keys fall back to ''.
    type_icons = {
        'maintenance': '🔧',
        'disruption': '⚠️',
        'delay': '⏱️',
        'info': ''
    }
    severity_icons = {
        'high': '🔴',
        'medium': '🟡',
        'low': '🟢'
    }

    for i, d in enumerate(disruptions, 1):
        icon = type_icons.get(d.get('type', 'info'), '')
        print(f"\n{icon} [{i}] {d.get('title', 'No title')}")

        lines = d.get('lines', [])
        if lines:
            print(f" 🚇 Lines: {', '.join(lines)}")

        # Time range: show whichever of start/end is present.
        start = d.get('start_time')
        end = d.get('end_time')
        if start or end:
            time_str = ""
            if start:
                time_str += f"From: {start.strftime('%d.%m %H:%M')}"
            if end:
                if time_str:
                    # BUG FIX: the original appended "" here (a no-op), so
                    # "From: ..." and "Until: ..." ran together with no
                    # separator. Insert a space between the two parts.
                    time_str += " "
                time_str += f"Until: {end.strftime('%d.%m %H:%M')}"
            print(f"{time_str}")

        desc = d.get('description', '')
        if desc:
            # Truncate long descriptions to keep console output readable.
            if len(desc) > 150:
                desc = desc[:150] + "..."
            print(f" 📝 {desc}")

        severity = d.get('severity', 'medium')
        severity_icon = severity_icons.get(severity, '')
        print(f" {severity_icon} Severity: {severity}")
def save_to_mongodb(disruptions):
    """Persist the current disruption list to MongoDB.

    Flags every previously stored alert as inactive, then upserts each
    disruption into 'munich_news.transport_alerts' keyed by its 'id'.

    Args:
        disruptions: list of disruption dicts with the keys 'id', 'title',
            'description', 'lines', 'type', 'severity', 'start_time',
            'end_time', and 'source'.

    Returns:
        True on success, False if the import, connection, or any write fails.
    """
    try:
        from pymongo import MongoClient
        import os

        uri = os.getenv('MONGODB_URI', 'mongodb://admin:changeme@mongodb:27017/')
        collection = MongoClient(uri)['munich_news']['transport_alerts']

        print("\n💾 Saving to MongoDB...")

        # Deactivate everything first; anything not re-upserted below stays
        # flagged inactive.
        collection.update_many({}, {'$set': {'is_active': False}})

        saved_count = 0
        for alert in disruptions:
            # The disruption 'id' doubles as the unique alert identifier.
            document = {
                'alert_id': alert['id'],
                'title': alert['title'],
                'description': alert['description'],
                'lines': alert['lines'],
                'type': alert['type'],
                'severity': alert['severity'],
                'start_time': alert['start_time'],
                'end_time': alert['end_time'],
                'source': alert['source'],
                'is_active': True,
                'updated_at': datetime.utcnow(),
            }
            collection.update_one(
                {'alert_id': alert['id']},
                {'$set': document},
                upsert=True,
            )
            saved_count += 1

        print(f"✓ Saved {saved_count} disruptions to MongoDB")
        return True
    except Exception as e:
        print(f"✗ MongoDB error: {e}")
        return False
def run_crawler():
    """Fetch disruptions from all sources, print reports, and persist them.

    Pulls MVG (U-Bahn/Tram/Bus) and Deutsche Bahn S-Bahn disruptions,
    prints a summary, per-source details, and a JSON dump, then saves
    everything to MongoDB.

    Returns:
        The JSON-serializable summary dict that was printed.
    """
    print_header()

    print("\n📡 Fetching data from sources...")
    print("-" * 70)

    # Fetch from both sources; combined list drives summary, JSON, and DB.
    mvg_disruptions = MVGClient().get_disruptions()
    sbahn_disruptions = DBClient().get_sbahn_disruptions()
    all_disruptions = mvg_disruptions + sbahn_disruptions

    print_disruption_summary(all_disruptions)

    if mvg_disruptions:
        print_disruptions(mvg_disruptions, "\n🚇 MVG DISRUPTIONS (U-Bahn, Tram, Bus)")
    if sbahn_disruptions:
        print_disruptions(sbahn_disruptions, "\n🚆 S-BAHN DISRUPTIONS")

    print("\n" + "=" * 70)
    print("📄 JSON OUTPUT")
    print("=" * 70)

    import json

    def _iso(value):
        # Serialize datetimes to ISO-8601; pass falsy values through as None.
        return value.isoformat() if value else None

    output = {
        'timestamp': datetime.now().isoformat(),
        'total_disruptions': len(all_disruptions),
        'mvg_disruptions': len(mvg_disruptions),
        'sbahn_disruptions': len(sbahn_disruptions),
        'disruptions': [
            {
                'id': d.get('id'),
                'title': d.get('title'),
                'description': d.get('description'),
                'lines': d.get('lines', []),
                'type': d.get('type'),
                'severity': d.get('severity'),
                'start_time': _iso(d.get('start_time')),
                'end_time': _iso(d.get('end_time')),
                'source': d.get('source'),
            }
            for d in all_disruptions
        ],
    }
    print(json.dumps(output, indent=2, ensure_ascii=False))

    save_to_mongodb(all_disruptions)

    print("\n" + "=" * 70)
    print("✓ Crawler finished")
    print("=" * 70 + "\n")
    return output
if __name__ == '__main__':
    try:
        # FIX: the result was previously bound to an unused module-level
        # name ('disruptions'); run_crawler() prints everything itself.
        run_crawler()
    except KeyboardInterrupt:
        print("\n\n👋 Crawler stopped by user")
    except Exception as e:
        # Top-level boundary: report and dump the traceback instead of
        # letting the scheduler see an unexplained crash.
        print(f"\n\n❌ Crawler error: {e}")
        import traceback
        traceback.print_exc()