Munich-news/news_sender/sender_service.py

#!/usr/bin/env python
"""
News Sender Service - Standalone microservice for sending newsletters
Fetches articles from MongoDB and sends to subscribers via email
"""
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime
from pathlib import Path
from jinja2 import Template
from pymongo import MongoClient
import os
import sys
from dotenv import load_dotenv

# Make the backend package importable so the tracking modules can be imported.
# Prefer the Docker layout (/app/backend); when that directory is absent, fall
# back to the "backend" folder two levels up from this file (local checkout).
backend_dir = Path('/app/backend')
if not backend_dir.exists():
    backend_dir = Path(__file__).parent.parent / 'backend'
sys.path.insert(0, str(backend_dir))

# Import tracking modules
from services import tracking_service
from tracking_integration import inject_tracking_pixel, replace_article_links, generate_tracking_urls

# Locate and load the .env configuration file.
# Candidates are probed in order and only the first existing file is loaded.
# override=False keeps variables that are already set in the process
# environment (e.g. injected by Docker), so they win over values in the file.
env_locations = [
    Path('/app/.env'),  # Docker location
    Path(__file__).parent.parent / 'backend' / '.env',  # Local backend checkout
    Path(__file__).parent / '.env',  # Directory containing this file
]

for env_path in env_locations:
    if not env_path.exists():
        continue
    load_dotenv(dotenv_path=env_path, override=False)
    print(f"✓ Loaded configuration from: {env_path}")
    env_loaded = True
    break
else:
    # The loop ran to completion without a break: no candidate file exists.
    env_loaded = False
    print("⚠ Warning: .env file not found in any of these locations:")
    for loc in env_locations:
        print(f"  - {loc}")


class Config:
    """Configuration for the news sender, resolved from environment variables.

    Every attribute is evaluated once, when this class body runs at import
    time. Integer settings go through int(), so a non-numeric value raises
    ValueError during import.
    """
    # MongoDB connection string. os.getenv() is a lookup in os.environ, so a
    # single call is enough; "or" makes an unset *or empty* variable fall back
    # to the local default instead of producing an empty URI.
    MONGODB_URI = os.getenv('MONGODB_URI') or 'mongodb://localhost:27017/'
    DB_NAME = 'munich_news'

    # Email (SMTP) settings; the credentials default to empty strings
    SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
    SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
    EMAIL_USER = os.getenv('EMAIL_USER', '')
    EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')

    # Newsletter content limits and the public website base URL
    MAX_ARTICLES = int(os.getenv('NEWSLETTER_MAX_ARTICLES', '10'))
    HOURS_LOOKBACK = int(os.getenv('NEWSLETTER_HOURS_LOOKBACK', '24'))
    WEBSITE_URL = os.getenv('WEBSITE_URL', 'http://localhost:3000')

    # Tracking: enabled only when the variable equals "true" (case-insensitive)
    TRACKING_ENABLED = os.getenv('TRACKING_ENABLED', 'true').lower() == 'true'
    TRACKING_API_URL = os.getenv('TRACKING_API_URL', 'http://localhost:5001')
    TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))


# MongoDB handles, created once when the module is imported.
# The client is built from Config.MONGODB_URI and the database is selected by
# Config.DB_NAME; the two collections are exposed as module-level names.
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection, subscribers_collection = db['articles'], db['subscribers']


def _is_published_today(doc, today_start):
    """Return False when the document's decisive timestamp lies before today_start."""
    published_at = doc.get('published_at')
    if isinstance(published_at, datetime):
        # published_at wins whenever it is a real datetime; created_at is
        # then not consulted at all.
        return published_at >= today_start
    created_at = doc.get('created_at')
    if isinstance(created_at, datetime):
        return created_at >= today_start
    # Neither field is a datetime: nothing to double-check, keep the document
    return True


def _build_article_entry(doc, summary=None, sources=None):
    """Build the newsletter dict for one article; passing sources marks it as clustered."""
    entry = {
        'title': doc.get('title', ''),
        'title_en': doc.get('title_en'),
        'translated_at': doc.get('translated_at'),
        'author': doc.get('author'),
        'link': doc.get('link', ''),
        'summary': summary if summary is not None else doc.get('summary', ''),
        'source': doc.get('source', ''),
        'category': doc.get('category', 'general'),
        'published_at': doc.get('published_at', ''),
        'is_clustered': sources is not None,
    }
    if sources is not None:
        entry['sources'] = sources
        entry['article_count'] = len(sources)
    return entry


def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
    """
    Get today's articles with AI summaries from the database, fetched per category.
    Articles that belong to a multi-source cluster carry the cluster summary
    and the list of all source links.

    Args:
        categories: List of categories to fetch (None = the four built-in
            categories: general, local, sports, science)
        articles_per_category: Maximum number of documents queried per category
            (default 3). Documents dropped afterwards (stale date, cluster
            already represented) are not replaced, so fewer may be returned.
        hours: Accepted for backward compatibility but not used; the window is
            always "since 00:00:00 UTC of the current day"

    Returns:
        list: Article dicts, multi-source (clustered) stories first and, among
              those, the ones with more sources first. Articles that tie keep
              the order in which they were fetched (category by category,
              newest created_at first).
    """
    from datetime import timezone

    # Start of today (00:00:00 UTC) as a naive datetime, the same kind of
    # value the deprecated datetime.utcnow() produced.
    today_start = datetime.now(timezone.utc).replace(
        tzinfo=None, hour=0, minute=0, second=0, microsecond=0
    )

    # Get cluster summaries collection
    cluster_summaries_collection = db['cluster_summaries']

    # If no categories specified, use the built-in category list
    if categories is None:
        categories = ['general', 'local', 'sports', 'science']

    articles = []
    processed_clusters = set()

    # Fetch articles for each category separately
    for category in categories:
        # Summarised articles of this category whose published_at OR
        # created_at falls on today, newest first
        cursor = articles_collection.find({
            'summary': {'$exists': True, '$ne': None},
            'category': category,
            '$or': [
                {'published_at': {'$gte': today_start}},
                {'created_at': {'$gte': today_start}}
            ]
        }).sort('created_at', -1).limit(articles_per_category)

        category_articles = []

        for doc in cursor:
            # The query matches on either date; this re-check additionally
            # drops documents whose published_at is before today even though
            # they were created (crawled) today.
            if not _is_published_today(doc, today_start):
                continue

            cluster_id = doc.get('cluster_id')

            if not cluster_id:
                # No cluster - single article
                category_articles.append(_build_article_entry(doc))
                continue

            if cluster_id in processed_clusters:
                # Another article of this cluster already represents the story
                continue

            cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})

            if cluster and cluster.get('article_count', 0) > 1:
                # Multi-source story: emit it once, with every source link
                processed_clusters.add(cluster_id)
                sources = [
                    {
                        'name': art.get('source', ''),
                        'link': art.get('link', ''),
                        'title': art.get('title', ''),
                    }
                    for art in articles_collection.find({'cluster_id': cluster_id})
                ]
                # Prefer the cluster's neutral summary; a missing or empty one
                # falls back to the article's own summary.
                summary = cluster.get('neutral_summary') or doc.get('summary', '')
                category_articles.append(
                    _build_article_entry(doc, summary=summary, sources=sources)
                )
            else:
                # No cluster summary, or a cluster holding a single article
                category_articles.append(_build_article_entry(doc))

        # Add this category's articles to the main list
        articles.extend(category_articles)

    # Clustered stories first, then more sources first. The sort is ascending
    # (no reverse=True): combining reverse=True with the negated count would
    # invert the intended priority. list.sort is stable, so ties keep their
    # fetch order.
    articles.sort(key=lambda entry: (
        0 if entry.get('is_clustered') else 1,
        -entry.get('article_count', 1),
    ))

    return articles


def get_active_subscribers():
    """
    Load every subscriber document whose status is 'active'.

    Returns:
        list: One dict per subscriber with the keys 'email' and 'categories'.
              'categories' is None when the document has no such field, which
              means "all categories".
    """
    active_docs = subscribers_collection.find({'status': 'active'})
    return [
        {
            'email': record['email'],
            'categories': record.get('categories'),  # None means all categories
        }
        for record in active_docs
    ]


def render_newsletter_html(articles, subscriber_categories=None, tracking_enabled=False,
                          pixel_tracking_id=None, link_tracking_map=None, api_url=None):
    """
    Build the newsletter HTML for one recipient from newsletter_template.html.

    Args:
        articles: List of article dictionaries
        subscriber_categories: Categories the subscriber wants; None or an
            empty list keeps every article
        tracking_enabled: Passed to the template and, together with
            pixel_tracking_id and api_url, required for tracking injection
        pixel_tracking_id: Tracking ID for the email open pixel
        link_tracking_map: Dictionary mapping original URLs to tracking IDs
        api_url: Base URL for the tracking API

    Returns:
        str: Rendered HTML; the tracking pixel (and, when a link map is given,
             rewritten links) are applied only if tracking_enabled,
             pixel_tracking_id and api_url are all truthy
    """
    # Read and compile the template that lives next to this module
    template_file = Path(__file__).parent / 'newsletter_template.html'
    template = Template(template_file.read_text(encoding='utf-8'))

    # Keep only the categories this subscriber asked for
    if not subscriber_categories:
        print(f"  No category filter - using all {len(articles)} articles")
        selected = articles
    else:
        print(f"  Filtering for categories: {subscriber_categories}")
        selected = [
            item for item in articles
            if item.get('category', 'general') in subscriber_categories
        ]
        print(f"  Filtered {len(articles)} articles down to {len(selected)} articles")

    # Bucket the articles by category, keeping the first 3 of each
    buckets = {}
    for item in selected:
        bucket = buckets.setdefault(item.get('category', 'general'), [])
        if len(bucket) < 3:
            bucket.append(item)

    # Display name and icon for the known categories; anything else gets a
    # title-cased name and a generic icon
    display_info = {
        'general': {'name': 'Top Trending', 'icon': '🔥'},
        'local': {'name': 'Local Events', 'icon': '🏛️'},
        'sports': {'name': 'Sports', 'icon': '⚽'},
        'science': {'name': 'Science & Tech', 'icon': '🔬'}
    }

    # One section per non-empty category, ordered alphabetically by category id
    category_sections = []
    for cat_id, cat_articles in sorted(buckets.items()):
        if not cat_articles:
            continue
        info = display_info.get(cat_id, {'name': cat_id.title(), 'icon': '📄'})
        category_sections.append({
            'id': cat_id,
            'name': info['name'],
            'icon': info['icon'],
            'articles': cat_articles
        })

    # Weather block shown in the newsletter
    from weather_service import get_munich_weather
    weather = get_munich_weather()

    # Values handed to the template
    now = datetime.now()
    template_data = {
        'date': now.strftime('%A, %B %d, %Y'),
        'year': now.year,
        'article_count': sum(len(section['articles']) for section in category_sections),
        'category_sections': category_sections,
        'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe',
        'preferences_link': f'{Config.WEBSITE_URL}/preferences.html',
        'website_link': Config.WEBSITE_URL,
        'tracking_enabled': tracking_enabled,
        'weather': weather
    }

    html = template.render(**template_data)

    # Without a complete tracking setup the rendered HTML is returned untouched
    if not (tracking_enabled and pixel_tracking_id and api_url):
        return html

    # Add the open-tracking pixel, then rewrite article links if a map was given
    html = inject_tracking_pixel(html, pixel_tracking_id, api_url)
    if link_tracking_map:
        html = replace_article_links(html, link_tracking_map, api_url)
    return html


def send_email(to_email, subject, html_content):
    """
    Send one multipart (plain text + HTML) email over SMTP with STARTTLS.

    Args:
        to_email: Recipient email address
        subject: Email subject
        html_content: HTML content of email

    Returns:
        tuple: (success: bool, error: str or None). Any exception raised while
               building or sending the message is caught and returned as the
               error string; nothing is re-raised.
    """
    from email.utils import formatdate, make_msgid

    try:
        msg = MIMEMultipart('alternative')
        msg['Subject'] = subject
        msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
        msg['To'] = to_email
        # formatdate() always emits an RFC 2822 date with a numeric UTC offset
        # and English day/month names; strftime('%z') on a naive datetime
        # yields an empty offset and follows the process locale.
        msg['Date'] = formatdate(localtime=True)
        # make_msgid() produces a unique id with a single '@'. Embedding the
        # recipient address in the id would put a second '@' into it.
        msg['Message-ID'] = make_msgid(domain='dongho.kim')
        msg['X-Mailer'] = 'Munich News Daily Sender'

        # Add plain text version as fallback
        plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
        msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))

        # Add HTML version
        msg.attach(MIMEText(html_content, 'html', 'utf-8'))

        # The context manager issues QUIT and closes the socket even when
        # starttls(), login() or send_message() raises.
        with smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT) as server:
            server.starttls()
            server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
            server.send_message(msg)

        return True, None
    except Exception as e:
        return False, str(e)


def send_newsletter(max_articles=None, test_email=None):
    """
    Deliver today's newsletter to every recipient and report the outcome.

    Args:
        max_articles: Accepted for interface compatibility; this function does
            not read it (three articles per category are always requested)
        test_email: If provided, send only to this address. Category
            preferences are taken from the matching subscriber document when
            one exists, otherwise all categories are used.

    Returns:
        dict: {'success': False, 'error': str} when credentials, articles or
              subscribers are missing; otherwise 'success': True together with
              'sent_count', 'failed_count', 'total_subscribers',
              'article_count' and 'errors' (list of {'email', 'error'})
    """
    rule = "=" * 70
    print("\n" + rule)
    print("📧 Munich News Daily - Newsletter Sender")
    print(rule)

    # Refuse to run without SMTP credentials
    if not (Config.EMAIL_USER and Config.EMAIL_PASSWORD):
        print("❌ Email credentials not configured")
        print("   Set EMAIL_USER and EMAIL_PASSWORD in .env file")
        return {'success': False, 'error': 'Email credentials not configured'}

    # Collect today's articles for all categories; the per-subscriber category
    # filter is applied later, at render time
    today_date = datetime.now().strftime('%B %d, %Y')
    print(f"\nFetching articles published TODAY ({today_date})...")
    print("  Articles per category: 3")
    articles = get_latest_articles_by_categories(
        categories=None,
        articles_per_category=3,
        hours=Config.HOURS_LOOKBACK,
    )

    if not articles:
        print("❌ No articles from today with summaries found")
        print(f"   No articles published today ({today_date})")
        print("   Run the crawler with Ollama enabled to get fresh content")
        return {'success': False, 'error': 'No articles published today'}

    print(f"✓ Found {len(articles)} recent article(s)")

    # Work out who receives the newsletter
    if not test_email:
        print("\nFetching active subscribers...")
        subscribers = get_active_subscribers()
        print(f"✓ Found {len(subscribers)} active subscriber(s)")
    else:
        # Test mode: a single recipient, using stored preferences if available
        print(f"\n🧪 Test mode: Sending to {test_email} only")
        stored = subscribers_collection.find_one({'email': test_email})
        if not stored:
            # Unknown address: send with all categories
            subscribers = [{'email': test_email, 'categories': None}]
            print("⚠ Email not in database, sending with all categories")
        else:
            subscribers = [{
                'email': test_email,
                'categories': stored.get('categories'),
            }]
            print(f"✓ Found subscriber with categories: {stored.get('categories', 'all')}")

    if not subscribers:
        print("❌ No active subscribers found")
        return {'success': False, 'error': 'No active subscribers'}

    # Date-based newsletter ID and subject line
    newsletter_id = "newsletter-" + datetime.now().strftime('%Y-%m-%d')
    subject = "Munich News Daily - " + datetime.now().strftime('%B %d, %Y')

    print(f"\nSending newsletter: '{subject}'")
    print(f"Newsletter ID: {newsletter_id}")
    print(f"Tracking enabled: {Config.TRACKING_ENABLED}")
    print("-" * 70)

    delivered = 0
    failures = 0
    error_log = []

    for position, subscriber in enumerate(subscribers, 1):
        recipient = subscriber['email']
        wanted_categories = subscriber['categories']

        print(f"[{position}/{len(subscribers)}] Sending to {recipient}...", end=' ')

        tracking_on = Config.TRACKING_ENABLED
        render_plain = not tracking_on

        if tracking_on:
            try:
                # Per-subscriber tracking IDs, then a render that embeds them
                tracking_data = generate_tracking_urls(
                    articles=articles,
                    newsletter_id=newsletter_id,
                    subscriber_email=recipient,
                    tracking_service=tracking_service
                )
                html_content = render_newsletter_html(
                    articles=articles,
                    subscriber_categories=wanted_categories,
                    tracking_enabled=True,
                    pixel_tracking_id=tracking_data['pixel_tracking_id'],
                    link_tracking_map=tracking_data['link_tracking_map'],
                    api_url=Config.TRACKING_API_URL
                )
            except Exception as e:
                print(f"⚠ Tracking error: {e}, sending without tracking...", end=' ')
                # Fall back to an untracked render for this subscriber
                render_plain = True

        if render_plain:
            # Untracked render that still honours the subscriber's preferences
            html_content = render_newsletter_html(articles, subscriber_categories=wanted_categories)

        success, error = send_email(recipient, subject, html_content)

        if not success:
            print(f"✗ {error}")
            failures += 1
            error_log.append({'email': recipient, 'error': error})
            continue

        print("✓")
        delivered += 1

    # Summary
    print("\n" + rule)
    print("📊 Sending Complete")
    print(rule)
    print(f"✓ Successfully sent: {delivered}")
    print(f"✗ Failed: {failures}")
    print(f"📰 Articles included: {len(articles)}")
    print(rule + "\n")

    return {
        'success': True,
        'sent_count': delivered,
        'failed_count': failures,
        'total_subscribers': len(subscribers),
        'article_count': len(articles),
        'errors': error_log
    }


def preview_newsletter(max_articles=None, hours=None):
    """
    Render the newsletter HTML without sending anything.

    Args:
        max_articles: Ignored; three articles per category are always requested
        hours: Look-back value forwarded to the article query; a falsy value
            (None or 0) is replaced by Config.HOURS_LOOKBACK

    Returns:
        str: Newsletter HTML rendered without tracking, or a short
             "no articles" HTML notice when no articles are found
    """
    lookback = hours if hours else Config.HOURS_LOOKBACK
    articles = get_latest_articles_by_categories(
        categories=None,
        articles_per_category=3,
        hours=lookback,
    )

    if articles:
        # Preview without tracking
        return render_newsletter_html(articles, tracking_enabled=False)

    today_date = datetime.now().strftime('%B %d, %Y')
    return (
        "<h1>No articles from today found</h1>"
        f"<p>No articles published today ({today_date}). "
        "Run the crawler with Ollama enabled to get fresh content.</p>"
    )


def main(argv=None):
    """
    Command-line entry point for the sender service.

    Supported invocations:
        (no arguments)   send the newsletter to all active subscribers
        preview          write the newsletter HTML to newsletter_preview.html
        test <email>     send the newsletter to a single address
        send [count]     send to all subscribers; count must be an integer

    Args:
        argv: Argument list without the program name (default: sys.argv[1:])

    Returns:
        int: Process exit status - 0 when a command was run, 1 on a usage
             error (unknown command, missing email, non-integer count)
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if not args:
        # Default: send newsletter
        send_newsletter()
        return 0

    command = args[0]

    if command == 'preview':
        # Generate preview HTML
        html = preview_newsletter()
        output_file = 'newsletter_preview.html'
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"✓ Preview saved to {output_file}")
        print("  Open it in your browser to see the newsletter")
        return 0

    if command == 'test':
        # Send test email
        if len(args) < 2:
            print("Usage: python sender_service.py test <email>")
            return 1
        send_newsletter(test_email=args[1])
        return 0

    if command == 'send':
        # Send to all subscribers
        max_articles = None
        if len(args) > 1:
            try:
                max_articles = int(args[1])
            except ValueError:
                # Report a bad count as a usage error instead of a traceback
                print(f"Invalid article count: {args[1]!r}")
                print("Usage: python sender_service.py send [count]")
                return 1
        send_newsletter(max_articles=max_articles)
        return 0

    # Unknown command is a usage error, so it exits non-zero like the
    # missing-email case above
    print("Unknown command. Usage:")
    print("  python sender_service.py preview          - Generate HTML preview")
    print("  python sender_service.py test <email>     - Send test email")
    print("  python sender_service.py send [count]     - Send to all subscribers")
    return 1


if __name__ == '__main__':
    sys.exit(main())