#!/usr/bin/env python
"""
News Sender Service - Standalone microservice for sending newsletters
Fetches articles from MongoDB and sends to subscribers via email
"""

import os
import smtplib
import sys
from collections import defaultdict
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formatdate, make_msgid
from pathlib import Path

from dotenv import load_dotenv
from jinja2 import Template
from pymongo import MongoClient

# Add backend directory to path for importing tracking service.
# Check if running in Docker (backend is at /app/backend) or locally (../backend).
if Path('/app/backend').exists():
    backend_dir = Path('/app/backend')
else:
    backend_dir = Path(__file__).parent.parent / 'backend'
sys.path.insert(0, str(backend_dir))

# Import tracking modules (must come after the sys.path tweak above).
from services import tracking_service  # noqa: E402
from tracking_integration import (  # noqa: E402
    generate_tracking_urls,
    inject_tracking_pixel,
    replace_article_links,
)

# Load environment variables from backend/.env.
# Try multiple locations (Docker vs local); the first existing file wins.
# Note: override=False ensures Docker environment variables take precedence.
env_locations = [
    Path('/app/.env'),  # Docker location
    Path(__file__).parent.parent / 'backend' / '.env',  # Local location
    Path(__file__).parent / '.env',  # Current directory
]

env_loaded = False
for env_path in env_locations:
    if env_path.exists():
        load_dotenv(dotenv_path=env_path, override=False)  # Don't override existing env vars
        print(f"✓ Loaded configuration from: {env_path}")
        env_loaded = True
        break

if not env_loaded:
    print("⚠ Warning: .env file not found in any of these locations:")
    for loc in env_locations:
        print(f" - {loc}")


class Config:
    """Configuration for news sender, read once from the environment at import time."""

    # MongoDB - a real environment variable wins over .env because the file
    # is loaded with override=False.
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'

    # Email
    SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
    SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
    EMAIL_USER = os.getenv('EMAIL_USER', '')
    EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')

    # Newsletter
    MAX_ARTICLES = int(os.getenv('NEWSLETTER_MAX_ARTICLES', '10'))
    HOURS_LOOKBACK = int(os.getenv('NEWSLETTER_HOURS_LOOKBACK', '24'))
    WEBSITE_URL = os.getenv('WEBSITE_URL', 'http://localhost:3000')

    # Tracking
    TRACKING_ENABLED = os.getenv('TRACKING_ENABLED', 'true').lower() == 'true'
    TRACKING_API_URL = os.getenv('TRACKING_API_URL', 'http://localhost:5001')
    TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))


# MongoDB connection
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
subscribers_collection = db['subscribers']
transport_alerts_collection = db['transport_alerts']

# Display icon per severity; unknown severities fall back to the medium icon.
_SEVERITY_ICONS = {'high': '🔴', 'medium': '🟡', 'low': '🟢'}
# Sort rank per severity (lower = shown first); unknown severities rank as medium.
_SEVERITY_RANK = {'high': 0, 'medium': 1, 'low': 2}

# Maximum number of articles rendered per category section.
_MAX_ARTICLES_PER_SECTION = 3


def _as_date(value, fallback):
    """Return ``value.date()`` when available, otherwise ``fallback``."""
    return value.date() if hasattr(value, 'date') else fallback


def _format_clock(value):
    """Format a time-like value as HH:MM, or ``str(value)`` if it cannot be formatted."""
    return value.strftime('%H:%M') if hasattr(value, 'strftime') else str(value)


def _is_active_on(start_time, end_time, day):
    """Tell whether a disruption with the given bounds is relevant on ``day``."""
    if start_time and end_time:
        return _as_date(start_time, day) <= day <= _as_date(end_time, day)
    if start_time:
        return _as_date(start_time, day) <= day
    return True  # No usable start time, assume it's relevant


def get_today_transport_disruptions():
    """
    Get active S-Bahn disruptions for today

    Fetches from MongoDB transport_alerts collection, keeps the ones whose
    time window includes today's UTC date, and decorates each document with
    display fields (start_time_str, end_time_str, lines_str, severity_icon).
    Results are ordered by severity (high first), newest update first within
    a severity.

    Returns:
        list: Active disruptions with details; empty list on any error
    """
    try:
        # Chained cursor.sort() calls do not combine (only the last applies),
        # and severity is a string, so sort by update time in MongoDB and by
        # severity rank here. list.sort is stable, preserving recency order.
        disruptions = list(
            transport_alerts_collection
            .find({'is_active': True}, {'_id': 0})
            .sort('updated_at', -1)
        )
        medium_rank = _SEVERITY_RANK['medium']
        disruptions.sort(key=lambda d: _SEVERITY_RANK.get(d.get('severity'), medium_rank))

        today = datetime.utcnow().date()
        today_disruptions = []

        for d in disruptions:
            start_time = d.get('start_time')
            end_time = d.get('end_time')

            if not _is_active_on(start_time, end_time, today):
                continue

            # Format times for display
            if start_time:
                d['start_time_str'] = _format_clock(start_time)
            if end_time:
                d['end_time_str'] = _format_clock(end_time)

            # Format lines as comma-separated string; tolerate a missing/None
            # list or non-string entries instead of failing the whole fetch.
            d['lines_str'] = ', '.join(str(line) for line in (d.get('lines') or []))

            d['severity_icon'] = _SEVERITY_ICONS.get(d.get('severity', 'medium'), '🟡')

            today_disruptions.append(d)

        print(f"✓ Found {len(today_disruptions)} transport disruptions for today")
        return today_disruptions

    except Exception as e:
        # Best effort: the newsletter is still sent without transport info.
        print(f"✗ Error fetching transport disruptions: {e}")
        return []


def _article_payload(doc, summary, sources=None):
    """Build the newsletter dict for one article document.

    When ``sources`` is given the article is marked as clustered and carries
    the source list plus its length as ``article_count``.
    """
    payload = {
        'title': doc.get('title', ''),
        'title_en': doc.get('title_en'),
        'translated_at': doc.get('translated_at'),
        'author': doc.get('author'),
        'link': doc.get('link', ''),
        'summary': summary,
        'source': doc.get('source', ''),
        'category': doc.get('category', 'general'),
        'published_at': doc.get('published_at', ''),
        'is_clustered': sources is not None,
    }
    if sources is not None:
        payload['sources'] = sources
        payload['article_count'] = len(sources)
    return payload


def _prioritize_articles(articles):
    """Sort articles in place: clustered first, more sources first; return the list.

    The sort is stable, so ties keep their incoming (per-category, newest
    first) order.
    """
    articles.sort(key=lambda a: (
        not a.get('is_clustered'),    # Clustered first
        -a.get('article_count', 1),   # More sources = higher priority
    ))
    return articles


def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
    """
    Get latest articles with AI summaries from database, fetched per category
    Includes cluster information for articles with multiple sources

    Args:
        categories: List of categories to fetch (None = the built-in default list)
        articles_per_category: Maximum number of articles per category (default 3)
        hours: Accepted for backward compatibility but currently unused; the
            window is always "since 00:00 UTC today"

    Returns:
        list: Articles with summaries from today, clustered (multi-source)
            stories first
    """
    # Get start of today (00:00:00 UTC)
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    cluster_summaries_collection = db['cluster_summaries']

    if categories is None:
        categories = ['general', 'local', 'sports', 'science']

    articles = []
    processed_clusters = set()

    # Fetch articles for each category separately
    for category in categories:
        cursor = articles_collection.find({
            'summary': {'$exists': True, '$ne': None},
            'category': category,
            '$or': [
                {'published_at': {'$gte': today_start}},
                {'created_at': {'$gte': today_start}},
            ],
        }).sort('created_at', -1).limit(articles_per_category)

        category_articles = []

        for doc in cursor:
            published_at = doc.get('published_at')
            created_at = doc.get('created_at')

            # Extra date check on top of the query: an old published_at
            # excludes the article even if created_at is recent; created_at
            # is only consulted when published_at is not a usable datetime.
            if published_at and isinstance(published_at, datetime):
                if published_at < today_start:
                    continue
            elif created_at and isinstance(created_at, datetime):
                if created_at < today_start:
                    continue

            cluster_id = doc.get('cluster_id')

            if not cluster_id:
                # No cluster - single article
                category_articles.append(_article_payload(doc, doc.get('summary', '')))
                continue

            if cluster_id in processed_clusters:
                # This story was already emitted via another member article.
                continue

            cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})

            if cluster and cluster.get('article_count', 0) > 1:
                # Clustered story - emit once with links to every source
                processed_clusters.add(cluster_id)

                sources = [
                    {
                        'name': art.get('source', ''),
                        'link': art.get('link', ''),
                        'title': art.get('title', ''),
                    }
                    for art in articles_collection.find({'cluster_id': cluster_id})
                ]
                summary = cluster.get('neutral_summary', doc.get('summary', ''))
                category_articles.append(_article_payload(doc, summary, sources))
            else:
                # Cluster missing or holding a single article
                category_articles.append(_article_payload(doc, doc.get('summary', '')))

        # Add this category's articles to the main list
        articles.extend(category_articles)

    # Prioritize stories covered by multiple sources
    return _prioritize_articles(articles)


def get_active_subscribers():
    """
    Get all active subscribers from database with their category preferences

    Returns:
        list: Subscriber dictionaries with email and categories
            (categories is None when the subscriber wants all categories)

    Raises:
        KeyError: If an active subscriber document has no 'email' field
    """
    return [
        {'email': doc['email'], 'categories': doc.get('categories', None)}
        for doc in subscribers_collection.find({'status': 'active'})
    ]


def render_newsletter_html(articles, subscriber_categories=None, tracking_enabled=False,
                           pixel_tracking_id=None, link_tracking_map=None, api_url=None):
    """
    Render newsletter HTML from template with optional tracking integration

    Args:
        articles: List of article dictionaries
        subscriber_categories: List of categories the subscriber wants (None/empty = all)
        tracking_enabled: Whether to inject tracking pixel and replace links
        pixel_tracking_id: Tracking ID for the email open pixel
        link_tracking_map: Dictionary mapping original URLs to tracking IDs
        api_url: Base URL for the tracking API

    Returns:
        str: Rendered HTML content with tracking injected if enabled
    """
    # Load template
    template_path = Path(__file__).parent / 'newsletter_template.html'
    with open(template_path, 'r', encoding='utf-8') as f:
        template_content = f.read()

    # NOTE(review): Template() is created without autoescaping and article
    # text originates from crawled sources - confirm the template escapes it.
    template = Template(template_content)

    # Filter articles by subscriber's category preferences
    if subscriber_categories:
        print(f" Filtering for categories: {subscriber_categories}")
        filtered_articles = [
            a for a in articles
            if a.get('category', 'general') in subscriber_categories
        ]
        print(f" Filtered {len(articles)} articles down to {len(filtered_articles)} articles")
    else:
        print(f" No category filter - using all {len(articles)} articles")
        filtered_articles = articles

    # Group articles by category, capping each section
    articles_by_category = defaultdict(list)
    for article in filtered_articles:
        bucket = articles_by_category[article.get('category', 'general')]
        if len(bucket) < _MAX_ARTICLES_PER_SECTION:
            bucket.append(article)

    # Convert to list of category sections (alphabetical by category id)
    category_names = {
        'general': {'name': 'Top Trending', 'icon': '🔥'},
        'local': {'name': 'Local Events', 'icon': '🏛️'},
        'sports': {'name': 'Sports', 'icon': '⚽'},
        'science': {'name': 'Science & Tech', 'icon': '🔬'},
    }

    category_sections = []
    for category, category_articles in sorted(articles_by_category.items()):
        if not category_articles:
            continue
        cat_info = category_names.get(category, {'name': category.title(), 'icon': '📄'})
        category_sections.append({
            'id': category,
            'name': cat_info['name'],
            'icon': cat_info['icon'],
            'articles': category_articles,
        })

    # Get weather data (imported lazily, as in the original module)
    from weather_service import get_munich_weather
    weather = get_munich_weather()

    # Get transport disruptions for today
    transport_disruptions = get_today_transport_disruptions()

    # Prepare template data
    now = datetime.now()
    total_articles = sum(len(section['articles']) for section in category_sections)

    template_data = {
        'date': now.strftime('%A, %B %d, %Y'),
        'year': now.year,
        'article_count': total_articles,
        'category_sections': category_sections,
        'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe',
        'preferences_link': f'{Config.WEBSITE_URL}/preferences.html',
        'website_link': Config.WEBSITE_URL,
        'tracking_enabled': tracking_enabled,
        'weather': weather,
        'transport_disruptions': transport_disruptions,
    }

    # Render HTML
    html = template.render(**template_data)

    # Inject tracking only when all required pieces are present
    if tracking_enabled and pixel_tracking_id and api_url:
        html = inject_tracking_pixel(html, pixel_tracking_id, api_url)

        # Replace article links with tracking URLs
        if link_tracking_map:
            html = replace_article_links(html, link_tracking_map, api_url)

    return html


def send_email(to_email, subject, html_content):
    """
    Send email to a single recipient

    Args:
        to_email: Recipient email address
        subject: Email subject
        html_content: HTML content of email

    Returns:
        tuple: (success: bool, error: str or None); failures are reported
            through the tuple, never raised
    """
    try:
        msg = MIMEMultipart('alternative')
        msg['Subject'] = subject
        msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
        msg['To'] = to_email
        # formatdate yields an RFC 5322 date including the UTC offset;
        # strftime('%z') on a naive datetime produced no offset at all.
        msg['Date'] = formatdate(localtime=True)
        # make_msgid builds a unique, well-formed id; embedding the
        # recipient address put a second '@' into the Message-ID.
        msg['Message-ID'] = make_msgid(domain='dongho.kim')
        msg['X-Mailer'] = 'Munich News Daily Sender'

        # Add plain text version as fallback
        plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
        msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))

        # Add HTML version
        msg.attach(MIMEText(html_content, 'html', 'utf-8'))

        # The context manager closes the connection even if TLS/login fails.
        with smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT) as server:
            server.starttls()
            server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
            server.send_message(msg)

        return True, None
    except Exception as e:
        return False, str(e)


def send_newsletter(max_articles=None, test_email=None):
    """
    Send newsletter to all active subscribers

    Args:
        max_articles: Accepted for backward compatibility but currently
            unused; 3 articles per category are always fetched
        test_email: If provided, send only to this email (for testing)

    Returns:
        dict: Statistics about sending; 'success' is False (with 'error')
            when nothing could be sent at all
    """
    print("\n" + "="*70)
    print("📧 Munich News Daily - Newsletter Sender")
    print("="*70)

    # Validate email configuration
    if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
        print("❌ Email credentials not configured")
        print(" Set EMAIL_USER and EMAIL_PASSWORD in .env file")
        return {
            'success': False,
            'error': 'Email credentials not configured'
        }

    # Get articles from today only
    today_date = datetime.now().strftime('%B %d, %Y')
    print(f"\nFetching articles published TODAY ({today_date})...")
    print(" Articles per category: 3")

    # Fetch all categories - filtering per subscriber happens later
    articles = get_latest_articles_by_categories(
        categories=None,
        articles_per_category=3,
        hours=Config.HOURS_LOOKBACK,
    )

    if not articles:
        print("❌ No articles from today with summaries found")
        print(f" No articles published today ({today_date})")
        print(" Run the crawler with Ollama enabled to get fresh content")
        return {
            'success': False,
            'error': 'No articles published today'
        }

    print(f"✓ Found {len(articles)} recent article(s)")

    # Get subscribers
    if test_email:
        # For test mode, fetch the actual subscriber's preferences from database
        print(f"\n🧪 Test mode: Sending to {test_email} only")
        subscriber_doc = subscribers_collection.find_one({'email': test_email})
        if subscriber_doc:
            subscribers = [{
                'email': test_email,
                'categories': subscriber_doc.get('categories', None)
            }]
            print(f"✓ Found subscriber with categories: {subscriber_doc.get('categories', 'all')}")
        else:
            # If not in database, send with all categories
            subscribers = [{'email': test_email, 'categories': None}]
            print("⚠ Email not in database, sending with all categories")
    else:
        print("\nFetching active subscribers...")
        subscribers = get_active_subscribers()
        print(f"✓ Found {len(subscribers)} active subscriber(s)")

    if not subscribers:
        print("❌ No active subscribers found")
        return {
            'success': False,
            'error': 'No active subscribers'
        }

    # Generate newsletter ID (date-based)
    newsletter_id = f"newsletter-{datetime.now().strftime('%Y-%m-%d')}"

    # Send to subscribers
    subject = f"Munich News Daily - {datetime.now().strftime('%B %d, %Y')}"
    print(f"\nSending newsletter: '{subject}'")
    print(f"Newsletter ID: {newsletter_id}")
    print(f"Tracking enabled: {Config.TRACKING_ENABLED}")
    print("-" * 70)

    sent_count = 0
    failed_count = 0
    errors = []

    for i, subscriber in enumerate(subscribers, 1):
        email = subscriber['email']
        categories = subscriber['categories']
        print(f"[{i}/{len(subscribers)}] Sending to {email}...", end=' ')

        if Config.TRACKING_ENABLED:
            try:
                # Generate tracking data for this subscriber
                tracking_data = generate_tracking_urls(
                    articles=articles,
                    newsletter_id=newsletter_id,
                    subscriber_email=email,
                    tracking_service=tracking_service
                )

                # Render newsletter with tracking and subscriber's category preferences
                html_content = render_newsletter_html(
                    articles=articles,
                    subscriber_categories=categories,
                    tracking_enabled=True,
                    pixel_tracking_id=tracking_data['pixel_tracking_id'],
                    link_tracking_map=tracking_data['link_tracking_map'],
                    api_url=Config.TRACKING_API_URL
                )
            except Exception as e:
                # Deliberate best effort: tracking must never block delivery.
                print(f"⚠ Tracking error: {e}, sending without tracking...", end=' ')
                html_content = render_newsletter_html(articles, subscriber_categories=categories)
        else:
            # Render newsletter without tracking but with subscriber's preferences
            html_content = render_newsletter_html(articles, subscriber_categories=categories)

        # Send email
        success, error = send_email(email, subject, html_content)

        if success:
            print("✓")
            sent_count += 1
        else:
            print(f"✗ {error}")
            failed_count += 1
            errors.append({'email': email, 'error': error})

    # Summary
    print("\n" + "="*70)
    print("📊 Sending Complete")
    print("="*70)
    print(f"✓ Successfully sent: {sent_count}")
    print(f"✗ Failed: {failed_count}")
    print(f"📰 Articles included: {len(articles)}")
    print("="*70 + "\n")

    return {
        'success': True,
        'sent_count': sent_count,
        'failed_count': failed_count,
        'total_subscribers': len(subscribers),
        'article_count': len(articles),
        'errors': errors
    }


def preview_newsletter(max_articles=None, hours=None):
    """
    Generate newsletter HTML for preview (doesn't send)

    Args:
        max_articles: Maximum number of articles to include (ignored, uses 3 per category)
        hours: Hours to look back (default from config); forwarded to the
            article fetch

    Returns:
        str: HTML content, or a short placeholder page when there are no articles
    """
    hours = hours or Config.HOURS_LOOKBACK
    articles = get_latest_articles_by_categories(
        categories=None,
        articles_per_category=3,
        hours=hours,
    )

    if not articles:
        today_date = datetime.now().strftime('%B %d, %Y')
        return (
            "<h1>No articles from today found</h1>"
            f"<p>No articles published today ({today_date}). "
            "Run the crawler with Ollama enabled to get fresh content.</p>"
        )

    # Preview without tracking
    return render_newsletter_html(articles, tracking_enabled=False)


def main():
    """Dispatch the command line: preview | test <email> | send [count] (default: send)."""
    if len(sys.argv) <= 1:
        # Default: send newsletter
        send_newsletter()
        return

    command = sys.argv[1]

    if command == 'preview':
        # Generate preview HTML
        html = preview_newsletter()
        output_file = 'newsletter_preview.html'
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Preview saved to {output_file}")
        print(" Open it in your browser to see the newsletter")

    elif command == 'test':
        # Send test email
        if len(sys.argv) < 3:
            print("Usage: python sender_service.py test <email>")
            sys.exit(1)
        send_newsletter(test_email=sys.argv[2])

    elif command == 'send':
        # Send to all subscribers
        max_articles = int(sys.argv[2]) if len(sys.argv) > 2 else None
        send_newsletter(max_articles=max_articles)

    else:
        print("Unknown command. Usage:")
        print(" python sender_service.py preview - Generate HTML preview")
        print(" python sender_service.py test <email> - Send test email")
        print(" python sender_service.py send [count] - Send to all subscribers")


if __name__ == '__main__':
    main()