Files
Munich-news/news_sender/sender_service.py
2025-11-12 23:28:51 +01:00

571 lines
22 KiB
Python

#!/usr/bin/env python
"""
News Sender Service - Standalone microservice for sending newsletters
Fetches articles from MongoDB and sends to subscribers via email
"""
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime
from pathlib import Path
from jinja2 import Template
from pymongo import MongoClient
import os
import sys
from dotenv import load_dotenv
# Make the backend package importable so the tracking modules below resolve.
# Inside Docker the backend lives at /app/backend; in a local checkout it is
# the sibling directory ../backend relative to this service.
backend_dir = Path('/app/backend')
if not backend_dir.exists():
    backend_dir = Path(__file__).parent.parent / 'backend'
# Prepend (index 0) so the backend modules are found before anything else on the path.
sys.path.insert(0, str(backend_dir))
# Import tracking modules
from services import tracking_service
from tracking_integration import inject_tracking_pixel, replace_article_links, generate_tracking_urls
# Load environment variables from a .env file.
# Candidate locations are probed in order and only the first existing file is
# loaded. override=False means variables already present in the process
# environment (e.g. injected by Docker) are not overwritten by the file.
env_locations = [
    Path('/app/.env'),  # Docker location
    Path(__file__).parent.parent / 'backend' / '.env',  # Local backend checkout
    Path(__file__).parent / '.env',  # Directory containing this script
]
env_loaded = False
for env_path in env_locations:
    if not env_path.exists():
        continue
    load_dotenv(dotenv_path=env_path, override=False)
    print(f"✓ Loaded configuration from: {env_path}")
    env_loaded = True
    break
else:
    # The loop finished without hitting `break`: none of the candidates exist.
    print("⚠ Warning: .env file not found in any of these locations:")
    for loc in env_locations:
        print(f" - {loc}")
class Config:
    """
    Settings for the news sender.

    Every value is read from the process environment once, when the class body
    is executed; numeric settings are converted with int() at that point.
    """

    # --- MongoDB ---
    # A single lookup with a default: an unset variable yields the localhost
    # URI, a set variable (even an empty one) is used as-is.
    MONGODB_URI = os.environ.get('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'

    # --- Email (SMTP) ---
    SMTP_SERVER = os.environ.get('SMTP_SERVER', 'smtp.gmail.com')
    SMTP_PORT = int(os.environ.get('SMTP_PORT', '587'))
    EMAIL_USER = os.environ.get('EMAIL_USER', '')
    EMAIL_PASSWORD = os.environ.get('EMAIL_PASSWORD', '')

    # --- Newsletter content ---
    MAX_ARTICLES = int(os.environ.get('NEWSLETTER_MAX_ARTICLES', '10'))
    HOURS_LOOKBACK = int(os.environ.get('NEWSLETTER_HOURS_LOOKBACK', '24'))
    WEBSITE_URL = os.environ.get('WEBSITE_URL', 'http://localhost:3000')

    # --- Tracking ---
    # Enabled only when the variable, lower-cased, is exactly 'true'.
    TRACKING_ENABLED = os.environ.get('TRACKING_ENABLED', 'true').lower() == 'true'
    TRACKING_API_URL = os.environ.get('TRACKING_API_URL', 'http://localhost:5001')
    TRACKING_DATA_RETENTION_DAYS = int(os.environ.get('TRACKING_DATA_RETENTION_DAYS', '90'))
# Module-level MongoDB handles shared by the query helpers in this file.
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection, subscribers_collection = db['articles'], db['subscribers']
def _build_article_entry(doc, summary, sources=None):
    """Convert a raw article document into the dict shape used by the newsletter.

    Passing ``sources`` marks the entry as clustered and adds the
    ``sources`` / ``article_count`` keys.
    """
    entry = {
        'title': doc.get('title', ''),
        'title_en': doc.get('title_en'),
        'translated_at': doc.get('translated_at'),
        'author': doc.get('author'),
        'link': doc.get('link', ''),
        'summary': summary,
        'source': doc.get('source', ''),
        'category': doc.get('category', 'general'),
        'published_at': doc.get('published_at', ''),
        'is_clustered': sources is not None,
    }
    if sources is not None:
        entry['sources'] = sources
        entry['article_count'] = len(sources)
    return entry


def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
    """
    Get today's articles that have a summary, fetched per category.

    Articles belonging to a multi-source cluster are collapsed into a single
    entry (the first article seen for that cluster) that carries the cluster's
    neutral summary and the links of every article in the cluster.

    Args:
        categories: List of categories to fetch (None = the four built-in
            categories: general, local, sports, science)
        articles_per_category: Maximum number of documents queried per category
            (default 3)
        hours: Accepted for backward compatibility but currently unused; the
            lookback window is always "since 00:00 UTC today"

    Returns:
        list: Article dicts. Multi-source (clustered) stories come first,
        ordered by descending number of sources; entries with equal priority
        keep their fetch order (category order, newest first).
    """
    # Start of the current day as a naive UTC datetime.
    # NOTE(review): assumes published_at/created_at are stored as naive UTC
    # datetimes; comparing against timezone-aware values would raise — confirm.
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    cluster_summaries_collection = db['cluster_summaries']

    if categories is None:
        categories = ['general', 'local', 'sports', 'science']

    articles = []
    processed_clusters = set()

    for category in categories:
        cursor = articles_collection.find({
            'summary': {'$exists': True, '$ne': None},
            'category': category,
            '$or': [
                {'published_at': {'$gte': today_start}},
                {'created_at': {'$gte': today_start}}
            ]
        }).sort('created_at', -1).limit(articles_per_category)

        for doc in cursor:
            # Extra date check on top of the query: published_at decides when it
            # is a datetime; created_at is consulted only when it is not.
            # NOTE(review): an article created today but published earlier is
            # dropped here even though the query (and its limit) counted it.
            published_at = doc.get('published_at')
            created_at = doc.get('created_at')
            if published_at and isinstance(published_at, datetime):
                if published_at < today_start:
                    continue
            elif created_at and isinstance(created_at, datetime):
                if created_at < today_start:
                    continue

            cluster_id = doc.get('cluster_id')
            if not cluster_id:
                # Not part of any cluster - plain single article
                articles.append(_build_article_entry(doc, doc.get('summary', '')))
                continue
            if cluster_id in processed_clusters:
                # This story is already represented by an earlier entry
                continue

            cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
            if not (cluster and cluster.get('article_count', 0) > 1):
                # No cluster summary, or a cluster of one: treat as single article
                articles.append(_build_article_entry(doc, doc.get('summary', '')))
                continue

            # Multi-source story: collect a link for every article in the cluster
            processed_clusters.add(cluster_id)
            sources = [
                {
                    'name': art.get('source', ''),
                    'link': art.get('link', ''),
                    'title': art.get('title', '')
                }
                for art in articles_collection.find({'cluster_id': cluster_id})
            ]
            # Fall back to the article's own summary when the cluster summary
            # is missing, None or empty.
            summary = cluster.get('neutral_summary') or doc.get('summary', '')
            articles.append(_build_article_entry(doc, summary, sources))

    # Prioritise stories covered by multiple sources: clustered entries first,
    # more sources before fewer. list.sort is stable even with reverse=True, so
    # entries with the same priority keep their fetch order.
    articles.sort(
        key=lambda entry: (bool(entry.get('is_clustered')), entry.get('article_count', 1)),
        reverse=True,
    )
    return articles
def get_active_subscribers():
    """
    Fetch every subscriber whose status is 'active'.

    Returns:
        list: One dict per subscriber with keys 'email' and 'categories';
        'categories' is None when the document has no preference stored,
        which callers treat as "all categories".
    """
    active_docs = subscribers_collection.find({'status': 'active'})
    return [
        {
            'email': record['email'],
            'categories': record.get('categories', None),
        }
        for record in active_docs
    ]
def render_newsletter_html(articles, subscriber_categories=None, tracking_enabled=False,
                           pixel_tracking_id=None, link_tracking_map=None, api_url=None):
    """
    Build the newsletter HTML for one recipient.

    Args:
        articles: List of article dictionaries
        subscriber_categories: Categories the subscriber wants; a falsy value
            (None or empty list) keeps every article
        tracking_enabled: Whether to inject the open-tracking pixel and
            rewrite article links
        pixel_tracking_id: Tracking ID for the email open pixel
        link_tracking_map: Dictionary mapping original URLs to tracking IDs
        api_url: Base URL for the tracking API

    Returns:
        str: Rendered HTML; tracking is injected only when tracking_enabled,
        pixel_tracking_id and api_url are all truthy
    """
    # Compile the template that sits next to this module.
    # NOTE(review): Template() is built with Jinja2 defaults, which do not
    # autoescape; confirm the template escapes article fields itself.
    template_file = Path(__file__).parent / 'newsletter_template.html'
    template = Template(template_file.read_text(encoding='utf-8'))

    # Narrow the article list to the subscriber's preferences
    if not subscriber_categories:
        print(f" No category filter - using all {len(articles)} articles")
        selected = articles
    else:
        print(f" Filtering for categories: {subscriber_categories}")
        selected = [
            item for item in articles
            if item.get('category', 'general') in subscriber_categories
        ]
        print(f" Filtered {len(articles)} articles down to {len(selected)} articles")

    # Bucket by category, keeping at most three articles in each bucket
    grouped = {}
    for item in selected:
        bucket = grouped.setdefault(item.get('category', 'general'), [])
        if len(bucket) < 3:
            bucket.append(item)

    # Display metadata for the known categories; unknown ones get a generic label.
    # NOTE(review): the sports icon is an empty string — possibly a lost emoji; confirm.
    category_names = {
        'general': {'name': 'Top Trending', 'icon': '🔥'},
        'local': {'name': 'Local Events', 'icon': '🏛️'},
        'sports': {'name': 'Sports', 'icon': ''},
        'science': {'name': 'Science & Tech', 'icon': '🔬'}
    }

    # One section per non-empty category, ordered alphabetically by category id
    category_sections = []
    for key, members in sorted(grouped.items()):
        if not members:
            continue
        label = category_names.get(key, {'name': key.title(), 'icon': '📄'})
        category_sections.append({
            'id': key,
            'name': label['name'],
            'icon': label['icon'],
            'articles': members
        })

    # Weather data comes from the sibling weather_service module
    from weather_service import get_munich_weather
    weather = get_munich_weather()

    now = datetime.now()
    html = template.render(
        date=now.strftime('%A, %B %d, %Y'),
        year=now.year,
        article_count=sum(len(section['articles']) for section in category_sections),
        category_sections=category_sections,
        unsubscribe_link=f'{Config.WEBSITE_URL}/unsubscribe',
        preferences_link=f'{Config.WEBSITE_URL}/preferences.html',
        website_link=Config.WEBSITE_URL,
        tracking_enabled=tracking_enabled,
        weather=weather,
    )

    # Without a pixel ID and API URL there is nothing to inject
    if not (tracking_enabled and pixel_tracking_id and api_url):
        return html

    html = inject_tracking_pixel(html, pixel_tracking_id, api_url)
    # Link rewriting is optional on top of the pixel
    if link_tracking_map:
        html = replace_article_links(html, link_tracking_map, api_url)
    return html
def send_email(to_email, subject, html_content):
    """
    Send one multipart (plain text + HTML) email to a single recipient.

    A new SMTP connection is opened for each call, upgraded with STARTTLS and
    authenticated with the configured credentials.

    Args:
        to_email: Recipient email address
        subject: Email subject
        html_content: HTML content of email

    Returns:
        tuple: (success: bool, error: str or None). This function does not
        raise; any exception is reported as (False, str(exception)).
    """
    # Imported locally so the standard-library helpers used for the headers
    # are in scope regardless of the file's top-level import block.
    from email.utils import formatdate, make_msgid

    try:
        msg = MIMEMultipart('alternative')
        msg['Subject'] = subject
        msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
        msg['To'] = to_email
        # RFC 5322 date with a real UTC offset and locale-independent names;
        # strftime('%z') on a naive datetime produced an empty offset.
        msg['Date'] = formatdate(localtime=True)
        # Unique, well-formed ID; embedding the recipient address put a second
        # '@' inside the message ID.
        msg['Message-ID'] = make_msgid(domain='dongho.kim')
        msg['X-Mailer'] = 'Munich News Daily Sender'

        # Plain-text part first: in multipart/alternative the last part is
        # the preferred one, so HTML wins where supported.
        plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
        msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
        msg.attach(MIMEText(html_content, 'html', 'utf-8'))

        # The context manager closes the connection even when STARTTLS, login
        # or sending fails; the timeout keeps one unresponsive server from
        # blocking the whole run indefinitely.
        with smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT, timeout=30) as server:
            server.starttls()
            server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
            server.send_message(msg)
        return True, None
    except Exception as e:
        # Deliberate catch-all: callers rely on the (success, error) tuple.
        return False, str(e)
def send_newsletter(max_articles=None, test_email=None):
    """
    Send today's newsletter to all active subscribers (or one test address).

    Args:
        max_articles: Optional cap on the total number of articles included.
            Applied only when a positive number is given; None (the default)
            keeps every fetched article (3 per category).
        test_email: If provided, send only to this email (for testing). The
            address's stored category preferences are used when it exists in
            the subscribers collection; otherwise all categories are sent.

    Returns:
        dict: On an early exit (missing credentials, no articles, no
        subscribers): {'success': False, 'error': str}. Otherwise
        {'success': True, 'sent_count', 'failed_count', 'total_subscribers',
        'article_count', 'errors'}; 'success' then means the run completed,
        individual delivery failures are listed in 'errors'.
    """
    print("\n" + "="*70)
    print("📧 Munich News Daily - Newsletter Sender")
    print("="*70)

    # Validate email configuration
    if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
        print("❌ Email credentials not configured")
        print(" Set EMAIL_USER and EMAIL_PASSWORD in .env file")
        return {
            'success': False,
            'error': 'Email credentials not configured'
        }

    # Take one timestamp for the whole run so the date shown in the log, the
    # newsletter ID and the subject cannot disagree across midnight.
    now = datetime.now()
    today_date = now.strftime('%B %d, %Y')

    print(f"\nFetching articles published TODAY ({today_date})...")
    print(" Articles per category: 3")
    # Fetch all categories - filtering per subscriber happens at render time
    articles = get_latest_articles_by_categories(
        categories=None, articles_per_category=3, hours=Config.HOURS_LOOKBACK
    )
    if not articles:
        print("❌ No articles from today with summaries found")
        print(f" No articles published today ({today_date})")
        print(" Run the crawler with Ollama enabled to get fresh content")
        return {
            'success': False,
            'error': 'No articles published today'
        }
    print(f"✓ Found {len(articles)} recent article(s)")

    # Honour an explicit cap (e.g. `send <count>` on the command line).
    # Non-positive values are ignored so a bad argument cannot produce an
    # empty newsletter.
    if max_articles is not None and max_articles > 0:
        articles = articles[:max_articles]

    # Get subscribers
    if test_email:
        # Test mode: reuse the real subscriber's preferences when available
        print(f"\n🧪 Test mode: Sending to {test_email} only")
        subscriber_doc = subscribers_collection.find_one({'email': test_email})
        if subscriber_doc:
            subscribers = [{
                'email': test_email,
                'categories': subscriber_doc.get('categories', None)
            }]
            print(f"✓ Found subscriber with categories: {subscriber_doc.get('categories', 'all')}")
        else:
            # Not in the database: send with all categories
            subscribers = [{'email': test_email, 'categories': None}]
            print("⚠ Email not in database, sending with all categories")
    else:
        print("\nFetching active subscribers...")
        subscribers = get_active_subscribers()
        print(f"✓ Found {len(subscribers)} active subscriber(s)")

    if not subscribers:
        print("❌ No active subscribers found")
        return {
            'success': False,
            'error': 'No active subscribers'
        }

    # Date-based newsletter ID and subject, both derived from the same timestamp
    newsletter_id = f"newsletter-{now.strftime('%Y-%m-%d')}"
    subject = f"Munich News Daily - {today_date}"
    print(f"\nSending newsletter: '{subject}'")
    print(f"Newsletter ID: {newsletter_id}")
    print(f"Tracking enabled: {Config.TRACKING_ENABLED}")
    print("-" * 70)

    sent_count = 0
    failed_count = 0
    errors = []

    for i, subscriber in enumerate(subscribers, 1):
        email = subscriber['email']
        categories = subscriber['categories']
        print(f"[{i}/{len(subscribers)}] Sending to {email}...", end=' ')

        if Config.TRACKING_ENABLED:
            try:
                # Per-subscriber tracking IDs for the open pixel and the links
                tracking_data = generate_tracking_urls(
                    articles=articles,
                    newsletter_id=newsletter_id,
                    subscriber_email=email,
                    tracking_service=tracking_service
                )
                html_content = render_newsletter_html(
                    articles=articles,
                    subscriber_categories=categories,
                    tracking_enabled=True,
                    pixel_tracking_id=tracking_data['pixel_tracking_id'],
                    link_tracking_map=tracking_data['link_tracking_map'],
                    api_url=Config.TRACKING_API_URL
                )
            except Exception as e:
                # Best effort: a tracking failure must not block delivery
                print(f"⚠ Tracking error: {e}, sending without tracking...", end=' ')
                html_content = render_newsletter_html(articles, subscriber_categories=categories)
        else:
            # No tracking, but still filtered by the subscriber's preferences
            html_content = render_newsletter_html(articles, subscriber_categories=categories)

        success, error = send_email(email, subject, html_content)
        if success:
            print("")
            sent_count += 1
        else:
            print(f"{error}")
            failed_count += 1
            errors.append({'email': email, 'error': error})

    # Summary
    print("\n" + "="*70)
    print("📊 Sending Complete")
    print("="*70)
    print(f"✓ Successfully sent: {sent_count}")
    print(f"✗ Failed: {failed_count}")
    print(f"📰 Articles included: {len(articles)}")
    print("="*70 + "\n")

    return {
        'success': True,
        'sent_count': sent_count,
        'failed_count': failed_count,
        'total_subscribers': len(subscribers),
        'article_count': len(articles),
        'errors': errors
    }
def preview_newsletter(max_articles=None, hours=None):
    """
    Render the newsletter to HTML without sending it and without tracking.

    Args:
        max_articles: Ignored; the preview always uses 3 articles per category
        hours: Lookback value forwarded to the article query; a falsy value
            falls back to Config.HOURS_LOOKBACK

    Returns:
        str: The rendered newsletter, or a short HTML notice when no articles
        from today are available
    """
    lookback = hours or Config.HOURS_LOOKBACK
    articles = get_latest_articles_by_categories(
        categories=None,
        articles_per_category=3,
        hours=lookback,
    )
    if articles:
        # Preview never carries tracking pixels or rewritten links
        return render_newsletter_html(articles, tracking_enabled=False)

    today_date = datetime.now().strftime('%B %d, %Y')
    return f"<h1>No articles from today found</h1><p>No articles published today ({today_date}). Run the crawler with Ollama enabled to get fresh content.</p>"
if __name__ == '__main__':
    # Command-line entry point:
    #   (no arguments)   send the newsletter to all subscribers
    #   preview          write the rendered HTML to newsletter_preview.html
    #   test <email>     send only to the given address
    #   send [count]     send to all subscribers, passing count as max_articles
    cli_args = sys.argv[1:]
    if not cli_args:
        # Default: send newsletter
        send_newsletter()
    else:
        command = cli_args[0]
        if command == 'preview':
            html = preview_newsletter()
            output_file = 'newsletter_preview.html'
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(html)
            print(f"✓ Preview saved to {output_file}")
            print(" Open it in your browser to see the newsletter")
        elif command == 'test':
            # A recipient address is mandatory in test mode
            if len(cli_args) < 2:
                print("Usage: python sender_service.py test <email>")
                sys.exit(1)
            test_email = cli_args[1]
            send_newsletter(test_email=test_email)
        elif command == 'send':
            max_articles = int(cli_args[1]) if len(cli_args) > 1 else None
            send_newsletter(max_articles=max_articles)
        else:
            print("Unknown command. Usage:")
            print(" python sender_service.py preview - Generate HTML preview")
            print(" python sender_service.py test <email> - Send test email")
            print(" python sender_service.py send [count] - Send to all subscribers")