571 lines
22 KiB
Python
571 lines
22 KiB
Python
#!/usr/bin/env python
|
|
"""
|
|
News Sender Service - Standalone microservice for sending newsletters
|
|
Fetches articles from MongoDB and sends to subscribers via email
|
|
"""
|
|
import smtplib
|
|
from email.mime.text import MIMEText
|
|
from email.mime.multipart import MIMEMultipart
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from jinja2 import Template
|
|
from pymongo import MongoClient
|
|
import os
|
|
import sys
|
|
from dotenv import load_dotenv
|
|
|
|
# Make the backend package importable so the tracking modules can be loaded.
# Inside the Docker image the backend lives at /app/backend; when that path
# does not exist, fall back to the sibling ../backend directory.
backend_dir = Path('/app/backend')
if not backend_dir.exists():
    backend_dir = Path(__file__).parent.parent / 'backend'
sys.path.insert(0, str(backend_dir))
|
|
|
|
# Import tracking modules
|
|
from services import tracking_service
|
|
from tracking_integration import inject_tracking_pixel, replace_article_links, generate_tracking_urls
|
|
|
|
# Locate the .env file. Candidates are tried in order and only the first one
# that exists is loaded. override=False is passed so that variables already
# present in the process environment (e.g. set by Docker) are not replaced.
env_locations = [
    Path('/app/.env'),  # Docker location
    Path(__file__).parent.parent / 'backend' / '.env',  # Local location
    Path(__file__).parent / '.env',  # Current directory
]

for env_path in env_locations:
    if not env_path.exists():
        continue
    load_dotenv(dotenv_path=env_path, override=False)
    print(f"✓ Loaded configuration from: {env_path}")
    env_loaded = True
    break
else:
    # The loop finished without hitting `break`: none of the candidates exist.
    env_loaded = False
    print("⚠ Warning: .env file not found in any of these locations:")
    for loc in env_locations:
        print(f" - {loc}")
|
|
|
|
|
|
class Config:
    """Settings for the news sender, read from the environment at import time."""

    # --- MongoDB ---
    # A single lookup is enough: an unset variable yields the default, a set
    # variable (even an empty one) is returned as-is.
    MONGODB_URI = os.environ.get('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'

    # --- Email (SMTP) ---
    SMTP_SERVER = os.environ.get('SMTP_SERVER', 'smtp.gmail.com')
    SMTP_PORT = int(os.environ.get('SMTP_PORT', '587'))
    EMAIL_USER = os.environ.get('EMAIL_USER', '')
    EMAIL_PASSWORD = os.environ.get('EMAIL_PASSWORD', '')

    # --- Newsletter ---
    MAX_ARTICLES = int(os.environ.get('NEWSLETTER_MAX_ARTICLES', '10'))
    HOURS_LOOKBACK = int(os.environ.get('NEWSLETTER_HOURS_LOOKBACK', '24'))
    WEBSITE_URL = os.environ.get('WEBSITE_URL', 'http://localhost:3000')

    # --- Tracking ---
    # Only the literal string "true" (any letter case) enables tracking.
    TRACKING_ENABLED = os.environ.get('TRACKING_ENABLED', 'true').lower() == 'true'
    TRACKING_API_URL = os.environ.get('TRACKING_API_URL', 'http://localhost:5001')
    TRACKING_DATA_RETENTION_DAYS = int(os.environ.get('TRACKING_DATA_RETENTION_DAYS', '90'))
|
|
|
|
|
|
# Module-level MongoDB handles, created once at import time and reused by the
# functions in this file.
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection, subscribers_collection = db['articles'], db['subscribers']
|
|
|
|
|
|
def _build_article_entry(doc, summary=None, sources=None):
    """Convert a raw article document into the dict consumed by the newsletter.

    Args:
        doc: Article document as returned by the articles collection.
        summary: Text to use instead of the article's own summary. When it is
            None or empty the article's ``summary`` field is used.
        sources: For clustered stories, the list of source dicts
            (``name``/``link``/``title``). None marks a single-source article.

    Returns:
        dict: Article entry. ``sources`` and ``article_count`` are only
        present when ``sources`` is given.
    """
    entry = {
        'title': doc.get('title', ''),
        'title_en': doc.get('title_en'),
        'translated_at': doc.get('translated_at'),
        'author': doc.get('author'),
        'link': doc.get('link', ''),
        'summary': summary or doc.get('summary', ''),
        'source': doc.get('source', ''),
        'category': doc.get('category', 'general'),
        'published_at': doc.get('published_at', ''),
        'is_clustered': sources is not None,
    }
    if sources is not None:
        entry['sources'] = sources
        entry['article_count'] = len(sources)
    return entry


def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
    """
    Get today's summarized articles from the database, fetched per category.

    For every category the newest documents (by ``created_at``) that have a
    summary and a ``published_at`` or ``created_at`` on or after 00:00 UTC
    today are queried. An article that belongs to a cluster with more than
    one article is emitted once per cluster, carrying the cluster's neutral
    summary and the links of all articles in that cluster.

    Args:
        categories: List of categories to fetch (None = general, local,
            sports, science)
        articles_per_category: Maximum number of documents queried per
            category (default 3). Documents dropped afterwards (old date,
            already-emitted cluster) are not replaced.
        hours: Accepted for interface compatibility; currently not used.
            The window always starts at 00:00 UTC of the current day.

    Returns:
        list: Article dicts. Clustered stories come first, ordered by number
        of sources (most first); remaining articles keep their fetch order
        (category order, newest first).
    """
    # Start of today (00:00:00 UTC) as a naive datetime, compared against the
    # stored published_at / created_at values.
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    cluster_summaries_collection = db['cluster_summaries']

    if categories is None:
        categories = ['general', 'local', 'sports', 'science']

    articles = []
    processed_clusters = set()

    for category in categories:
        cursor = articles_collection.find({
            'summary': {'$exists': True, '$ne': None},
            'category': category,
            '$or': [
                {'published_at': {'$gte': today_start}},
                {'created_at': {'$gte': today_start}},
            ],
        }).sort('created_at', -1).limit(articles_per_category)

        for doc in cursor:
            # Extra safety check on top of the query: the first usable
            # timestamp (published_at, else created_at) must be from today.
            stamp = next(
                (value for value in (doc.get('published_at'), doc.get('created_at'))
                 if isinstance(value, datetime)),
                None,
            )
            if stamp is not None and stamp < today_start:
                continue

            cluster_id = doc.get('cluster_id')
            if cluster_id and cluster_id in processed_clusters:
                # This story was already emitted via another article of the
                # same cluster.
                continue

            cluster = None
            if cluster_id:
                cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})

            if cluster and cluster.get('article_count', 0) > 1:
                # Multi-source story: collect the links of every article in it.
                processed_clusters.add(cluster_id)
                sources = [
                    {
                        'name': member.get('source', ''),
                        'link': member.get('link', ''),
                        'title': member.get('title', ''),
                    }
                    for member in articles_collection.find({'cluster_id': cluster_id})
                ]
                articles.append(_build_article_entry(
                    doc,
                    summary=cluster.get('neutral_summary'),
                    sources=sources,
                ))
            else:
                # No cluster, or a cluster with a single article.
                articles.append(_build_article_entry(doc))

    # Clustered stories first, then by descending number of sources. The sort
    # is stable, so ties keep their fetch order (newest first per category).
    articles.sort(key=lambda entry: (
        0 if entry.get('is_clustered') else 1,
        -entry.get('article_count', 1),
    ))

    return articles
|
|
|
|
|
|
def get_active_subscribers():
    """
    Load every subscriber whose status is 'active'.

    Returns:
        list: One dict per subscriber with 'email' and 'categories'.
        'categories' is None when the document has no such field, which
        callers treat as "all categories".
    """
    return [
        {
            'email': record['email'],
            'categories': record.get('categories', None)  # None means all categories
        }
        for record in subscribers_collection.find({'status': 'active'})
    ]
|
|
|
|
|
|
def render_newsletter_html(articles, subscriber_categories=None, tracking_enabled=False,
                           pixel_tracking_id=None, link_tracking_map=None, api_url=None):
    """
    Build the newsletter HTML for one recipient.

    The articles are narrowed to the recipient's categories, grouped into
    per-category sections of at most three articles, and rendered through
    newsletter_template.html (located next to this file). Tracking is
    injected afterwards when requested.

    Args:
        articles: List of article dictionaries
        subscriber_categories: Categories the subscriber wants; None or an
            empty list means no filtering
        tracking_enabled: Whether to inject the tracking pixel and rewrite links
        pixel_tracking_id: Tracking ID for the email open pixel
        link_tracking_map: Dictionary mapping original URLs to tracking IDs
        api_url: Base URL for the tracking API

    Returns:
        str: Rendered HTML. Tracking is only injected when tracking_enabled,
        pixel_tracking_id and api_url are all truthy.
    """
    # NOTE(review): the Template is built without an explicit autoescape
    # setting - confirm that article text is safe to embed in HTML as-is.
    template_file = Path(__file__).parent / 'newsletter_template.html'
    template = Template(template_file.read_text(encoding='utf-8'))

    # Narrow the article list to the subscriber's preferences.
    if not subscriber_categories:
        print(f" No category filter - using all {len(articles)} articles")
        selected = articles
    else:
        print(f" Filtering for categories: {subscriber_categories}")
        selected = [
            item for item in articles
            if item.get('category', 'general') in subscriber_categories
        ]
        print(f" Filtered {len(articles)} articles down to {len(selected)} articles")

    # Bucket by category, keeping at most three articles per bucket.
    grouped = {}
    for item in selected:
        bucket = grouped.setdefault(item.get('category', 'general'), [])
        if len(bucket) < 3:
            bucket.append(item)

    # Display name and icon for the known categories.
    section_labels = {
        'general': {'name': 'Top Trending', 'icon': '🔥'},
        'local': {'name': 'Local Events', 'icon': '🏛️'},
        'sports': {'name': 'Sports', 'icon': '⚽'},
        'science': {'name': 'Science & Tech', 'icon': '🔬'}
    }

    # One section per category, in alphabetical order of the category id.
    category_sections = []
    for category in sorted(grouped):
        members = grouped[category]
        if not members:
            continue
        label = section_labels.get(category, {'name': category.title(), 'icon': '📄'})
        category_sections.append({
            'id': category,
            'name': label['name'],
            'icon': label['icon'],
            'articles': members
        })

    # Weather block shown in the newsletter.
    from weather_service import get_munich_weather
    weather = get_munich_weather()

    now = datetime.now()
    html = template.render(
        date=now.strftime('%A, %B %d, %Y'),
        year=now.year,
        article_count=sum(len(section['articles']) for section in category_sections),
        category_sections=category_sections,
        unsubscribe_link=f'{Config.WEBSITE_URL}/unsubscribe',
        preferences_link=f'{Config.WEBSITE_URL}/preferences.html',
        website_link=Config.WEBSITE_URL,
        tracking_enabled=tracking_enabled,
        weather=weather,
    )

    # Without a pixel id and an API base URL there is nothing to inject.
    if not (tracking_enabled and pixel_tracking_id and api_url):
        return html

    html = inject_tracking_pixel(html, pixel_tracking_id, api_url)
    if link_tracking_map:
        html = replace_article_links(html, link_tracking_map, api_url)
    return html
|
|
|
|
|
|
def send_email(to_email, subject, html_content):
    """
    Send one HTML email (with a plain-text fallback part) over SMTP + STARTTLS.

    Args:
        to_email: Recipient email address
        subject: Email subject
        html_content: HTML content of email

    Returns:
        tuple: (success: bool, error: str or None). Any exception raised
        while building or sending the message is reported through the
        error string instead of propagating.
    """
    # Imported here so the function does not rely on the file's import header.
    from email.utils import formatdate, make_msgid

    try:
        msg = MIMEMultipart('alternative')
        msg['Subject'] = subject
        msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
        msg['To'] = to_email
        # formatdate() emits an RFC-compliant date including the UTC offset;
        # strftime('%z') on a naive datetime produced a date with no zone.
        msg['Date'] = formatdate(localtime=True)
        # A Message-ID must contain exactly one '@'; embedding the recipient
        # address broke that, so let the stdlib generate a unique id.
        msg['Message-ID'] = make_msgid(domain='dongho.kim')
        msg['X-Mailer'] = 'Munich News Daily Sender'

        # Plain-text part first: clients prefer the last alternative (HTML).
        plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
        msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
        msg.attach(MIMEText(html_content, 'html', 'utf-8'))

        # The context manager issues QUIT and closes the socket even when
        # starttls/login/send_message raises, so no connection is leaked.
        with smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT) as server:
            server.starttls()
            server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
            server.send_message(msg)

        return True, None
    except Exception as e:
        # Best-effort contract: callers record the failure and continue.
        return False, str(e)
|
|
|
|
|
|
def _recipients_for_run(test_email):
    """Return the recipient list for this run, printing progress as it goes."""
    if not test_email:
        print("\nFetching active subscribers...")
        recipients = get_active_subscribers()
        print(f"✓ Found {len(recipients)} active subscriber(s)")
        return recipients

    # Test mode: reuse the stored preferences of that address when it exists.
    print(f"\n🧪 Test mode: Sending to {test_email} only")
    stored = subscribers_collection.find_one({'email': test_email})
    if not stored:
        print("⚠ Email not in database, sending with all categories")
        return [{'email': test_email, 'categories': None}]

    recipients = [{'email': test_email, 'categories': stored.get('categories', None)}]
    print(f"✓ Found subscriber with categories: {stored.get('categories', 'all')}")
    return recipients


def _render_for_subscriber(articles, newsletter_id, email, categories):
    """Render one subscriber's HTML, falling back to an untracked render on error."""
    if not Config.TRACKING_ENABLED:
        return render_newsletter_html(articles, subscriber_categories=categories)

    try:
        tracking_data = generate_tracking_urls(
            articles=articles,
            newsletter_id=newsletter_id,
            subscriber_email=email,
            tracking_service=tracking_service
        )
        return render_newsletter_html(
            articles=articles,
            subscriber_categories=categories,
            tracking_enabled=True,
            pixel_tracking_id=tracking_data['pixel_tracking_id'],
            link_tracking_map=tracking_data['link_tracking_map'],
            api_url=Config.TRACKING_API_URL
        )
    except Exception as e:
        print(f"⚠ Tracking error: {e}, sending without tracking...", end=' ')
        return render_newsletter_html(articles, subscriber_categories=categories)


def send_newsletter(max_articles=None, test_email=None):
    """
    Send today's newsletter to all active subscribers, or to one test address.

    Args:
        max_articles: Accepted for interface compatibility; the body does not
            reference it (three articles per category are always requested)
        test_email: If provided, send only to this email (for testing)

    Returns:
        dict: On an early exit {'success': False, 'error': str}. Otherwise
        'success' is True together with 'sent_count', 'failed_count',
        'total_subscribers', 'article_count' and 'errors' (per-recipient
        failures); individual send failures do not flip 'success'.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("📧 Munich News Daily - Newsletter Sender")
    print(banner)

    # Nothing can be sent without SMTP credentials.
    if not (Config.EMAIL_USER and Config.EMAIL_PASSWORD):
        print("❌ Email credentials not configured")
        print(" Set EMAIL_USER and EMAIL_PASSWORD in .env file")
        return {'success': False, 'error': 'Email credentials not configured'}

    today_date = datetime.now().strftime('%B %d, %Y')
    print(f"\nFetching articles published TODAY ({today_date})...")
    print(" Articles per category: 3")
    # All categories are fetched here; per-subscriber filtering happens at render time.
    articles = get_latest_articles_by_categories(
        categories=None,
        articles_per_category=3,
        hours=Config.HOURS_LOOKBACK,
    )

    if not articles:
        print("❌ No articles from today with summaries found")
        print(f" No articles published today ({today_date})")
        print(" Run the crawler with Ollama enabled to get fresh content")
        return {'success': False, 'error': 'No articles published today'}

    print(f"✓ Found {len(articles)} recent article(s)")

    subscribers = _recipients_for_run(test_email)
    if not subscribers:
        print("❌ No active subscribers found")
        return {'success': False, 'error': 'No active subscribers'}

    # Date-based identifier handed to the tracking layer.
    newsletter_id = f"newsletter-{datetime.now().strftime('%Y-%m-%d')}"
    subject = f"Munich News Daily - {datetime.now().strftime('%B %d, %Y')}"

    print(f"\nSending newsletter: '{subject}'")
    print(f"Newsletter ID: {newsletter_id}")
    print(f"Tracking enabled: {Config.TRACKING_ENABLED}")
    print("-" * 70)

    errors = []
    sent_count = 0
    total = len(subscribers)

    for position, subscriber in enumerate(subscribers, start=1):
        email = subscriber['email']
        categories = subscriber['categories']

        print(f"[{position}/{total}] Sending to {email}...", end=' ')

        html_content = _render_for_subscriber(articles, newsletter_id, email, categories)
        success, error = send_email(email, subject, html_content)

        if not success:
            print(f"✗ {error}")
            errors.append({'email': email, 'error': error})
            continue

        print("✓")
        sent_count += 1

    failed_count = len(errors)

    print("\n" + banner)
    print("📊 Sending Complete")
    print(banner)
    print(f"✓ Successfully sent: {sent_count}")
    print(f"✗ Failed: {failed_count}")
    print(f"📰 Articles included: {len(articles)}")
    print(banner + "\n")

    return {
        'success': True,
        'sent_count': sent_count,
        'failed_count': failed_count,
        'total_subscribers': total,
        'article_count': len(articles),
        'errors': errors
    }
|
|
|
|
|
|
def preview_newsletter(max_articles=None, hours=None):
    """
    Render the newsletter as HTML without sending it and without tracking.

    Args:
        max_articles: Ignored; three articles per category are requested
        hours: Hours to look back, forwarded to the article query
            (falls back to Config.HOURS_LOOKBACK when None or 0)

    Returns:
        str: The rendered newsletter, or a short HTML notice when no
        articles were found.
    """
    lookback = hours if hours else Config.HOURS_LOOKBACK
    articles = get_latest_articles_by_categories(
        categories=None,
        articles_per_category=3,
        hours=lookback,
    )

    if articles:
        # Preview without tracking
        return render_newsletter_html(articles, tracking_enabled=False)

    today_date = datetime.now().strftime('%B %d, %Y')
    return f"<h1>No articles from today found</h1><p>No articles published today ({today_date}). Run the crawler with Ollama enabled to get fresh content.</p>"
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point.
    #   (no arguments)  send the newsletter to all active subscribers
    #   preview         write the rendered newsletter to newsletter_preview.html
    #   test <email>    send the newsletter to a single address
    #   send [count]    send to all active subscribers
    # Usage errors exit with status 1.
    if len(sys.argv) > 1:
        command = sys.argv[1]

        if command == 'preview':
            # Generate preview HTML
            html = preview_newsletter()
            output_file = 'newsletter_preview.html'
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(html)
            print(f"✓ Preview saved to {output_file}")
            print(" Open it in your browser to see the newsletter")

        elif command == 'test':
            # Send test email
            if len(sys.argv) < 3:
                print("Usage: python sender_service.py test <email>")
                sys.exit(1)
            test_email = sys.argv[2]
            send_newsletter(test_email=test_email)

        elif command == 'send':
            # Send to all subscribers
            max_articles = None
            if len(sys.argv) > 2:
                try:
                    max_articles = int(sys.argv[2])
                except ValueError:
                    # A non-numeric count is a usage error, not a crash.
                    print("Usage: python sender_service.py send [count]")
                    sys.exit(1)
            send_newsletter(max_articles=max_articles)

        else:
            print("Unknown command. Usage:")
            print(" python sender_service.py preview - Generate HTML preview")
            print(" python sender_service.py test <email> - Send test email")
            print(" python sender_service.py send [count] - Send to all subscribers")
            # Same exit status as the other usage error above.
            sys.exit(1)
    else:
        # Default: send newsletter
        send_newsletter()
|