#!/usr/bin/env python3
"""
Deutsche Bahn API Client - Fetch S-Bahn disruptions using Selenium
"""
import requests
from datetime import datetime
import time


class DBClient:
    """Client for Deutsche Bahn (S-Bahn) disruptions"""

    # DB S-Bahn München map page
    MAP_URL = "https://karte.bahn.de/en/region/DB_SBahn_Muenchen"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,de;q=0.8',
        })

    def get_sbahn_disruptions(self):
        """
        Fetch S-Bahn disruptions for Munich from DB Karte using Selenium

        Returns:
            list: Disruption data
        """
        print("\n🔍 Fetching S-Bahn disruptions from DB Karte (using Selenium)...")

        driver = None
        try:
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            from selenium.webdriver.chrome.service import Service
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
            import os

            # Setup Chrome options for Chromium
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-blink-features=AutomationControlled')
            chrome_options.add_argument('--window-size=1920,1080')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            # Set realistic user agent
            chrome_options.add_argument('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

            # Use system Chromium if available (Docker container)
            chrome_bin = os.getenv('CHROME_BIN', '/usr/bin/chromium')
            chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')

            if os.path.exists(chrome_bin):
                chrome_options.binary_location = chrome_bin
                print(f"  Using system Chromium: {chrome_bin}")

            print("  Starting Chromium browser...")

            # Try to use system chromedriver first
            try:
                if os.path.exists(chromedriver_path):
                    service = Service(chromedriver_path)
                    driver = webdriver.Chrome(service=service, options=chrome_options)
                else:
                    driver = webdriver.Chrome(options=chrome_options)
            except Exception as e:
                print(f"  ✗ Failed to start Chromium: {e}")
                print("  ℹ️ Falling back to webdriver-manager...")
                try:
                    from webdriver_manager.chrome import ChromeDriverManager
                    service = Service(ChromeDriverManager().install())
                    driver = webdriver.Chrome(service=service, options=chrome_options)
                except Exception as e2:
                    print(f"  ✗ webdriver-manager also failed: {e2}")
                    raise

            print(f"  Loading: {self.MAP_URL}")
            driver.get(self.MAP_URL)

            # Wait for page to load
            print("  Waiting for page to load...")

            # Wait for disruption boxes to appear
            try:
                print("  Waiting for disruption boxes...")
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-cy='disruptionbox']"))
                )
                # Give extra time for all boxes to load
                time.sleep(3)
                print("  ✓ Disruption boxes should be loaded")
            except Exception as e:
                print(f"  ⚠ Timeout waiting for disruption boxes: {e}")
                time.sleep(5)

            print(f"  ✓ Page loaded (title: {driver.title[:50]}...)")
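            # Note (assumption): the data-cy selectors used here and below
            # (disruptionbox, disruptionboxTitle, disruptionBadgeList, ...)
            # mirror the markup observed on karte.bahn.de at the time of
            # writing; if DB changes the page structure they will need
            # updating.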
            # Debug: Save screenshot and page source
            try:
                screenshot_path = "/tmp/db_karte_screenshot.png"
                driver.save_screenshot(screenshot_path)
                print(f"  📸 Screenshot saved to: {screenshot_path}")
            except Exception:
                pass

            # Debug: Print page structure
            print("  Analyzing page structure...")
            page_source = driver.page_source

            # Save page source for inspection
            try:
                with open("/tmp/db_karte_source.html", "w", encoding="utf-8") as f:
                    f.write(page_source)
                print("  📄 Page source saved to: /tmp/db_karte_source.html")
            except Exception:
                pass

            # Look for disruption markers/icons on the map
            disruptions = self._find_and_click_disruptions(driver)

            # If no disruptions found via clicking, parse the page source
            if not disruptions:
                print("  No clickable disruptions found, parsing page source...")

                # Debug: Show what elements are on the page
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(page_source, 'html.parser')

                # Count different element types
                print(f"  Page stats: {len(soup.find_all('div'))} divs, {len(soup.find_all('button'))} buttons")

                # Look for any text mentioning disruptions
                text = soup.get_text().lower()
                if 'disruption' in text or 'störung' in text or 'incident' in text:
                    print("  ℹ️ Page contains disruption-related text")

                # Check for common map libraries
                if 'leaflet' in page_source.lower():
                    print("  ℹ️ Page uses Leaflet maps")
                if 'mapbox' in page_source.lower():
                    print("  ℹ️ Page uses Mapbox")
                if 'google.maps' in page_source.lower():
                    print("  ℹ️ Page uses Google Maps")

                disruptions = self._parse_selenium_page(page_source, driver)

            if disruptions:
                print(f"✓ Found {len(disruptions)} S-Bahn disruptions")
            else:
                print("  ℹ️ No S-Bahn disruptions found (all lines operating normally)")

            return disruptions

        except ImportError as e:
            print(f"  ✗ Selenium not available: {e}")
            print("  ℹ️ Install with: pip install selenium webdriver-manager")
            return []
        except Exception as e:
            print(f"  ✗ Error: {e}")
            import traceback
            traceback.print_exc()
            return []
        finally:
            if driver:
                driver.quit()
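    # Minimal usage sketch (hypothetical; assumes Chromium and a matching
    # chromedriver are installed, or that webdriver-manager can download one):
    #
    #   client = DBClient()
    #   for d in client.get_sbahn_disruptions():
    #       print(d['title'], d['lines'], d['start_time'])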
    def _find_and_click_disruptions(self, driver):
        """
        Find disruption boxes in the sidebar.

        Works in two passes: a read-only pass collects title, lines and
        severity from each box, then a click pass opens each box to read the
        From/To time range. The page is reloaded after each click because the
        detail view replaces the list.
        """
        try:
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC

            disruptions = []

            print("  Looking for disruption boxes...")

            # Find all disruption boxes in the sidebar
            disruption_boxes = driver.find_elements(By.CSS_SELECTOR, "div[data-cy='disruptionbox']")

            if not disruption_boxes:
                print("  No disruption boxes found")
                return []

            print(f"  Found {len(disruption_boxes)} disruption boxes")

            # First pass: collect all basic info without clicking
            basic_info = []
            for i, box in enumerate(disruption_boxes):
                try:
                    # Extract disruption ID
                    disruption_id = box.get_attribute('id')

                    # Extract title
                    title_elem = box.find_element(By.CSS_SELECTOR, "span[data-cy='disruptionboxTitle']")
                    title = title_elem.text.strip()

                    # Extract subtitle (type)
                    subtitle_elem = box.find_element(By.CSS_SELECTOR, "span[data-cy='disruptionboxSubtitle']")
                    subtitle = subtitle_elem.text.strip()

                    # Extract affected lines
                    lines = []
                    badge_list = box.find_element(By.CSS_SELECTOR, "div[data-cy='disruptionBadgeList']")
                    badges = badge_list.find_elements(By.CSS_SELECTOR, "span[data-cy='disruptionBadge']")
                    for badge in badges:
                        line_text = badge.text.strip()
                        if line_text and line_text.startswith('S'):
                            lines.append(line_text)

                    # Determine severity from icon
                    severity = 'medium'
                    try:
                        icon = box.find_element(By.CSS_SELECTOR, "img[data-cy='disruptionboxIcon']")
                        icon_src = icon.get_attribute('src')
                        if 'red' in icon_src:
                            severity = 'high'
                        elif 'orange' in icon_src:
                            severity = 'medium'
                        elif 'yellow' in icon_src:
                            severity = 'low'
                    except Exception:
                        pass

                    # Store basic info
                    basic_info.append({
                        'id': disruption_id or f"sbahn_{i}",
                        'title': title,
                        'subtitle': subtitle,
                        'lines': lines,
                        'severity': severity,
                        'index': i
                    })

                    print(f"  ✓ [{i}] {title[:60]}... (Lines: {', '.join(lines)})")

                except Exception as e:
                    print(f"  ✗ Error extracting disruption {i}: {e}")
                    continue

            # Second pass: click each one to get time details
            print(f"\n  Extracting time details for {len(basic_info)} disruptions...")
            for info in basic_info:
                print(f"  Processing disruption {info['index']}...")
                try:
                    # Make sure we're back at the list view
                    driver.execute_script("window.scrollTo(0, 0);")
                    time.sleep(0.5)

                    # Wait for boxes to be present again
                    try:
                        WebDriverWait(driver, 3).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-cy='disruptionbox']"))
                        )
                    except Exception:
                        pass

                    # Refetch boxes each time (old references go stale after reload)
                    boxes = driver.find_elements(By.CSS_SELECTOR, "div[data-cy='disruptionbox']")
                    print(f"    Found {len(boxes)} boxes after refetch")

                    if info['index'] >= len(boxes):
                        print(f"  ⚠ Box {info['index']} not found (only {len(boxes)} boxes available)")
                        continue

                    # Get fresh reference to the box and button
                    box = boxes[info['index']]
                    button = box.find_element(By.TAG_NAME, "button")

                    # Click to open details
                    driver.execute_script("arguments[0].scrollIntoView(true);", button)
                    time.sleep(0.3)
                    driver.execute_script("arguments[0].click();", button)  # Use JS click
                    time.sleep(1.5)  # Wait for detail panel to fully open

                    # Extract time from page text
                    detail_text = driver.find_element(By.TAG_NAME, "body").text

                    # Debug: show a snippet of the detail text
                    if "From:" in detail_text and "To:" in detail_text:
                        snippet_start = detail_text.find("From:")
                        snippet_end = detail_text.find("To:", snippet_start) + 50
                        snippet = detail_text[snippet_start:snippet_end]
                        print(f"    Time snippet: {snippet.replace(chr(10), ' ')}")

                    start_time, end_time = self._extract_time_range(detail_text)

                    # Go back to original page to reset the view
                    driver.get(self.MAP_URL)
                    time.sleep(3)  # Wait for page to reload and boxes to appear

                    # Create disruption object
                    disruption_type = self._classify_type(info['title'] + ' ' + info['subtitle'])
                    disruption = {
                        'id': info['id'],
                        'title': info['title'],
                        'description': info['subtitle'],
                        'lines': info['lines'],
                        'type': disruption_type,
                        'start_time': start_time,
                        'end_time': end_time,
                        'severity': info['severity'],
                        'source': 'db_karte_sidebar',
                        'created_at': datetime.utcnow()
                    }
                    disruptions.append(disruption)

                    time_info = ""
                    if start_time:
                        time_info += f" From: {start_time.strftime('%d.%m %H:%M')}"
                    if end_time:
                        time_info += f" To: {end_time.strftime('%d.%m %H:%M')}"
                    if time_info:
                        print(f"  ✓ [{info['index']}]{time_info}")

                except Exception as e:
                    print(f"  ⚠ Could not get time for disruption {info['index']}: {e}")
                    # Still add the disruption without time info
                    disruption = {
                        'id': info['id'],
                        'title': info['title'],
                        'description': info['subtitle'],
                        'lines': info['lines'],
                        'type': self._classify_type(info['title']),
                        'start_time': None,
                        'end_time': None,
                        'severity': info['severity'],
                        'source': 'db_karte_sidebar',
                        'created_at': datetime.utcnow()
                    }
                    disruptions.append(disruption)

            return disruptions

        except Exception as e:
            print(f"  ✗ Error finding disruption boxes: {e}")
            return []
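    # Shape of the disruption dicts produced by this class (every source
    # emits the same keys; the values shown are illustrative):
    #
    #   {'id': 'sbahn_0', 'title': '...', 'description': '...',
    #    'lines': ['S3', 'S4'], 'type': '...',
    #    'start_time': datetime | None, 'end_time': datetime | None,
    #    'severity': 'low' | 'medium' | 'high',
    #    'source': 'db_karte_sidebar', 'created_at': datetime}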
    def _extract_disruption_details(self, driver):
        """Extract disruption details from popup/modal"""
        try:
            from selenium.webdriver.common.by import By

            # Look for popup/modal/tooltip containers
            popup_selectors = [
                "div[class*='popup']",
                "div[class*='modal']",
                "div[class*='tooltip']",
                "div[class*='detail']",
                "div[class*='info']",
                "[role='dialog']",
                "[role='tooltip']",
            ]

            popup = None
            for selector in popup_selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for elem in elements:
                        if elem.is_displayed() and len(elem.text) > 20:
                            popup = elem
                            break
                    if popup:
                        break
                except Exception:
                    continue

            if not popup:
                # Try to get any recently appeared text
                body = driver.find_element(By.TAG_NAME, "body")
                popup_text = body.text
            else:
                popup_text = popup.text

            # Check if it's S-Bahn related
            if not self._contains_sbahn_reference(popup_text):
                return None

            # Extract title (usually first line or heading)
            title = popup_text.split('\n')[0][:100] if '\n' in popup_text else popup_text[:100]

            # Extract time information
            start_time, end_time = self._extract_time_range(popup_text)

            # Extract affected lines
            lines = self._extract_lines_from_text(popup_text)

            return {
                'id': f"sbahn_detail_{hash(popup_text) % 10000}",
                'title': title,
                'description': popup_text[:500],
                'lines': lines,
                'type': self._classify_type(title),
                'start_time': start_time,
                'end_time': end_time,
                'severity': self._determine_severity(popup_text),
                'source': 'db_karte_detail',
                'created_at': datetime.utcnow()
            }

        except Exception:
            return None

    def _extract_time_range(self, text):
        """Extract start and end time from text"""
        import re

        start_time = None
        end_time = None

        # Look for the specific format with possible newlines:
        #   From:XX. YYYY-MM-DD, HH:MM To:XX. YYYY-MM-DD, HH:MM
        # Remove newlines first to make matching easier
        text_clean = text.replace('\n', ' ').replace('\r', ' ')

        pattern = r'From:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})\s*To:\s*[A-Za-z]{2}\.\s*(\d{4}-\d{2}-\d{2}),\s*(\d{2}:\d{2})'
        match = re.search(pattern, text_clean)

        if match:
            try:
                start_date = match.group(1)      # e.g. 2025-11-13
                start_time_str = match.group(2)  # e.g. 10:02
                end_date = match.group(3)        # e.g. 2025-11-13
                end_time_str = match.group(4)    # e.g. 14:30

                start_time = datetime.strptime(f"{start_date} {start_time_str}", "%Y-%m-%d %H:%M")
                end_time = datetime.strptime(f"{end_date} {end_time_str}", "%Y-%m-%d %H:%M")
            except Exception as e:
                print(f"  ⚠ Error parsing time: {e}")

        # Fallback: try German formats, i.e. "ab DD.MM.YYYY HH:MM" or "bis DD.MM.YYYY HH:MM"
        if not start_time:
            ab_pattern = r'ab\s+(\d{1,2}\.\d{1,2}\.\d{4})[,\s]+(\d{1,2}:\d{2})'
            bis_pattern = r'bis\s+(\d{1,2}\.\d{1,2}\.\d{4})[,\s]+(\d{1,2}:\d{2})'

            ab_match = re.search(ab_pattern, text, re.IGNORECASE)
            if ab_match:
                try:
                    start_time = datetime.strptime(f"{ab_match.group(1)} {ab_match.group(2)}", "%d.%m.%Y %H:%M")
                except ValueError:
                    pass

            bis_match = re.search(bis_pattern, text, re.IGNORECASE)
            if bis_match:
                try:
                    end_time = datetime.strptime(f"{bis_match.group(1)} {bis_match.group(2)}", "%d.%m.%Y %H:%M")
                except ValueError:
                    pass

        return start_time, end_time
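    # Example of the detail text _extract_time_range handles (format as
    # observed on the English map page; the exact wording is an assumption
    # and may change):
    #
    #   "From:Th. 2025-11-13, 10:02 To:Th. 2025-11-13, 14:30"
    #
    # which parses to datetime(2025, 11, 13, 10, 2) and
    # datetime(2025, 11, 13, 14, 30).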
    def _determine_severity(self, text):
        """Determine severity based on keywords"""
        text_lower = text.lower()

        if any(word in text_lower for word in ['ausfall', 'gesperrt', 'eingestellt', 'komplett']):
            return 'high'
        elif any(word in text_lower for word in ['verspätung', 'verzögerung', 'teilweise']):
            return 'medium'
        else:
            return 'low'

    def _parse_selenium_page(self, page_source, driver):
        """Parse page loaded by Selenium"""
        try:
            from bs4 import BeautifulSoup
            from selenium.webdriver.common.by import By

            print("  Analyzing rendered page...")
            soup = BeautifulSoup(page_source, 'html.parser')
            disruptions = []

            # Method 1: Try to find disruption elements directly via Selenium
            try:
                # Look for common disruption indicators
                selectors = [
                    "div[class*='disruption']",
                    "div[class*='stoerung']",
                    "div[class*='incident']",
                    "div[class*='message']",
                    "div[class*='alert']",
                    "[data-disruption]",
                    "[data-incident]"
                ]

                for selector in selectors:
                    try:
                        elements = driver.find_elements(By.CSS_SELECTOR, selector)
                        if elements:
                            print(f"  Found {len(elements)} elements with selector: {selector}")
                            for elem in elements:
                                text = elem.text.strip()
                                if len(text) > 20 and self._contains_sbahn_reference(text):
                                    disruptions.append(self._create_disruption_from_text(text))
                    except Exception:
                        continue

            except Exception as e:
                print(f"  ✗ Selenium element search error: {e}")

            # Method 2: Parse the page source with BeautifulSoup
            if not disruptions:
                print("  Trying BeautifulSoup parsing...")
                disruptions = self._parse_map_page(page_source.encode(), page_source)

            # Method 3: Check for any text mentioning S-Bahn lines with disruptions
            if not disruptions:
                print("  Checking page text for S-Bahn mentions...")
                page_text = soup.get_text()

                if self._contains_sbahn_reference(page_text):
                    # Extract paragraphs or sections mentioning S-Bahn
                    for elem in soup.find_all(['p', 'div', 'span']):
                        text = elem.get_text(strip=True)
                        if len(text) > 30 and self._contains_sbahn_reference(text):
                            lines = self._extract_lines_from_text(text)
                            if lines:
                                disruptions.append(self._create_disruption_from_text(text))

            # Remove duplicates (keyed on the first 50 chars of the title)
            seen = set()
            unique = []
            for d in disruptions:
                key = d['title'][:50]
                if key not in seen:
                    seen.add(key)
                    unique.append(d)

            return unique

        except Exception as e:
            print(f"  ✗ Parse error: {e}")
            import traceback
            traceback.print_exc()
            return []

    def _contains_sbahn_reference(self, text):
        """Check if text contains S-Bahn line references"""
        import re
        return bool(re.search(r'S[\s-]?[1-8]', text, re.IGNORECASE))

    def _create_disruption_from_text(self, text):
        """Create disruption object from text"""
        # Extract first sentence or first 100 chars as title
        sentences = text.split('.')
        title = sentences[0][:100] if sentences else text[:100]

        return {
            'id': f"sbahn_{hash(text) % 10000}",
            'title': title,
            'description': text[:500],
            'lines': self._extract_lines_from_text(text),
            'type': self._classify_type(title),
            'start_time': None,
            'end_time': None,
            'severity': 'medium',
            'source': 'db_karte_selenium',
            'created_at': datetime.utcnow()
        }

    def _parse_map_page(self, html_content, html_text):
        """Parse DB Karte map page for S-Bahn disruptions"""
        try:
            from bs4 import BeautifulSoup
            import re
            import json

            disruptions = []

            # Method 1: Look for embedded JSON data in script tags
            print("  Analyzing page for disruption data...")

            # The map page likely has JSON data embedded in