Munich-news/backend/services/personalization_service.py

"""
Newsletter Personalization Service for Munich News Daily.
Ranks and selects articles based on user interest profiles.
"""

from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

from services.interest_profiling_service import get_user_interests


def calculate_article_score(
    article: Dict,
    user_interests: Optional[Dict],
    category_weight: float = 0.4,
    keyword_weight: float = 0.6
) -> float:
    """
    Calculate personalization score for an article based on user interests.

    Score is calculated as:
    - Category match: the profile's value for the article's category
      (0.0 when the category is not in the profile)
    - Keyword match: average of the profile's values over the article
      keywords that appear in the profile (0.0 when none match)
    - Final score: (category_score * category_weight)
      + (keyword_score * keyword_weight), rounded to 3 decimals

    An article 'keywords' field, or a profile 'categories' / 'keywords'
    field, that is missing or explicitly None is treated as empty.

    Args:
        article: Article dictionary with 'category' and 'keywords' fields
        user_interests: User interest profile (None for non-personalized)
        category_weight: Weight for category matching (default: 0.4)
        keyword_weight: Weight for keyword matching (default: 0.6)

    Returns:
        float: 0.5 when there is no (or an empty) interest profile,
        otherwise the weighted score. The result is not clamped; it stays
        within 0.0-1.0 as long as the profile values lie in that range
        and the weights are non-negative and sum to at most 1.
    """
    # If no user interests, return neutral score
    if not user_interests:
        return 0.5

    # `or` (rather than a .get default) also covers keys that are present
    # but hold None, which would otherwise break the iteration/lookups.
    article_category = article.get('category', 'general')
    article_keywords = article.get('keywords') or []
    user_categories = user_interests.get('categories') or {}
    user_keywords = user_interests.get('keywords') or {}

    # Category score: direct lookup, unknown categories count as 0.0
    category_score = user_categories.get(article_category, 0.0)

    # Keyword score: average over the keywords the profile knows about.
    # Unknown keywords are ignored rather than counted as zeros.
    keyword_scores = [
        user_keywords[keyword]
        for keyword in article_keywords
        if keyword in user_keywords
    ]
    keyword_score = sum(keyword_scores) / len(keyword_scores) if keyword_scores else 0.0

    # Weighted final score
    final_score = (category_score * category_weight) + (keyword_score * keyword_weight)

    return round(final_score, 3)


def rank_articles_for_user(
    articles: List[Dict],
    subscriber_email: str,
    personalization_ratio: float = 0.7
) -> List[Dict]:
    """
    Score every article for one subscriber and return them best-first.

    The subscriber's interest profile is looked up once and each article is
    scored against it with calculate_article_score. The input dictionaries
    are left untouched: each result is a shallow copy carrying an extra
    'personalization_score' key.

    Args:
        articles: List of article dictionaries
        subscriber_email: Email address of the user
        personalization_ratio: Accepted for signature compatibility; this
            function does not read it (default: 0.7)

    Returns:
        list: Scored copies of the articles, highest score first. The sort
        is stable, so articles with equal scores keep their input order.
    """
    profile = get_user_interests(subscriber_email)

    def _with_score(item: Dict) -> Dict:
        # Score first, then attach the score to a shallow copy so the
        # caller's dictionary is never mutated.
        value = calculate_article_score(item, profile)
        annotated = item.copy()
        annotated['personalization_score'] = value
        return annotated

    return sorted(
        [_with_score(item) for item in articles],
        key=lambda entry: entry['personalization_score'],
        reverse=True,
    )


def select_personalized_articles(
    articles: List[Dict],
    subscriber_email: str,
    max_articles: int = 10,
    personalization_ratio: float = 0.7,
    min_score_threshold: float = 0.1
) -> List[Dict]:
    """
    Select and rank articles for a personalized newsletter.

    Strategy:
    - Rank all articles for the subscriber (highest score first).
    - Personalized slots: up to int(max_articles * personalization_ratio)
      of the top-ranked articles whose score is >= min_score_threshold.
    - Remaining slots: the next articles in ranked order that were not
      already picked, regardless of threshold. No publication date is
      consulted here; ordering among equal scores follows the order of
      `articles`.

    The remaining slots are capped at max_articles minus the personalized
    slot count, so the result can be shorter than max_articles when few
    articles reach the threshold.

    Args:
        articles: List of available articles
        subscriber_email: Email address of the user
        max_articles: Maximum number of articles to include (default: 10).
            Zero or negative yields an empty selection.
        personalization_ratio: Ratio of personalized content (default: 0.7).
            Values outside 0.0-1.0 are clamped into that range.
        min_score_threshold: Minimum score to consider personalized (default: 0.1)

    Returns:
        list: Selected articles (scored copies) with personalized picks
        first, followed by the fill-in articles.
    """
    # Nothing to pick from, or no room to pick anything. Guarding here also
    # keeps negative counts away from the slices below, where they would
    # mean "all but the last N" instead of "none".
    if not articles or max_articles <= 0:
        return []

    # Keep the split inside [0, max_articles]; an out-of-range ratio would
    # otherwise make one of the slot counts negative.
    ratio = min(max(personalization_ratio, 0.0), 1.0)

    # Rank all articles
    ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)

    # Calculate split
    num_personalized = int(max_articles * ratio)
    num_trending = max_articles - num_personalized

    # Personalized picks: remember their positions so they can be excluded
    # later even when the articles carry no '_id'.
    personalized_positions = [
        position
        for position, candidate in enumerate(ranked_articles)
        if candidate['personalization_score'] >= min_score_threshold
    ][:num_personalized]
    personalized = [ranked_articles[position] for position in personalized_positions]

    taken_positions = set(personalized_positions)
    # Real ids only: a missing '_id' (None) must not act as a shared id,
    # or every id-less article would be treated as already selected.
    taken_ids = {
        picked.get('_id') for picked in personalized
        if picked.get('_id') is not None
    }

    # Fill-in picks: next in ranked order, not already selected
    trending = [
        candidate
        for position, candidate in enumerate(ranked_articles)
        if position not in taken_positions
        and candidate.get('_id') not in taken_ids
    ][:num_trending]

    # Combine: personalized first, then fill-ins; never exceed max_articles
    return (personalized + trending)[:max_articles]


def get_personalization_explanation(
    article: Dict,
    user_interests: Optional[Dict]
) -> Dict[str, Any]:
    """
    Generate explanation for why an article was recommended.

    Useful for transparency and debugging.

    Args:
        article: Article dictionary
        user_interests: User interest profile (None or empty when the
            subscriber has no personalization data)

    Returns:
        dict: Explanation containing:
            - score: Overall personalization score, as computed by
              calculate_article_score (0.5 without a profile)
            - category_match: Profile value for the article's category
              (0.0 when absent)
            - keyword_matches: List of {'keyword', 'score'} entries for the
              article keywords found in the profile, in article order
            - reason: Human-readable explanation
    """
    if not user_interests:
        return {
            'score': 0.5,
            'category_match': 0.0,
            'keyword_matches': [],
            'reason': 'No personalization data available'
        }

    # `or` also covers fields that are present but None, so collecting the
    # matches below does not fail on them.
    article_category = article.get('category', 'general')
    article_keywords = article.get('keywords') or []

    user_categories = user_interests.get('categories') or {}
    user_keywords = user_interests.get('keywords') or {}

    # Category match
    category_score = user_categories.get(article_category, 0.0)

    # Keyword matches, kept in the order the article lists its keywords
    keyword_matches = [
        {'keyword': keyword, 'score': user_keywords[keyword]}
        for keyword in article_keywords
        if keyword in user_keywords
    ]

    # Overall score comes from the shared scoring function so the
    # explanation always agrees with the ranking.
    overall_score = calculate_article_score(article, user_interests)

    # Generate reason (thresholds: >= 0.5 high, >= 0.3 moderate)
    if overall_score >= 0.5:
        reason = f"High match with your interests in {article_category}"
        if keyword_matches:
            # Mention at most the first two matched keywords
            top_keywords = [m['keyword'] for m in keyword_matches[:2]]
            reason += f" and topics like {', '.join(top_keywords)}"
    elif overall_score >= 0.3:
        reason = "Moderate match with your interests"
    else:
        reason = "Trending article for diverse content"

    return {
        'score': overall_score,
        'category_match': category_score,
        'keyword_matches': keyword_matches,
        'reason': reason
    }


def get_personalization_stats(
    selected_articles: List[Dict],
    subscriber_email: str
) -> Dict[str, Any]:
    """
    Get statistics about personalization for a newsletter.

    Articles without a 'personalization_score' key count as 0.0.

    Args:
        selected_articles: Articles selected for the newsletter
        subscriber_email: Email address of the user (not read by this
            function; kept for signature compatibility)

    Returns:
        dict: Statistics containing:
            - total_articles: Number of articles
            - avg_score: Average personalization score, rounded to 3 decimals
            - highly_personalized: Number of articles with score >= 0.5
            - moderately_personalized: Number with 0.3 <= score < 0.5
            - trending: Number with score < 0.3
    """
    # Empty selection: return zeros instead of dividing by zero below
    if not selected_articles:
        return {
            'total_articles': 0,
            'avg_score': 0.0,
            'highly_personalized': 0,
            'moderately_personalized': 0,
            'trending': 0
        }

    scores = [a.get('personalization_score', 0.0) for a in selected_articles]
    avg_score = sum(scores) / len(scores)

    # Bucket boundaries match those used for the explanation wording:
    # >= 0.5 high, 0.3 up to (not including) 0.5 moderate, below 0.3 trending.
    highly_personalized = sum(1 for s in scores if s >= 0.5)
    moderately_personalized = sum(1 for s in scores if 0.3 <= s < 0.5)
    trending = sum(1 for s in scores if s < 0.3)

    return {
        'total_articles': len(selected_articles),
        'avg_score': round(avg_score, 3),
        'highly_personalized': highly_personalized,
        'moderately_personalized': moderately_personalized,
        'trending': trending
    }


def batch_personalize_newsletters(
    articles: List[Dict],
    subscribers: List[str],
    max_articles_per_user: int = 10
) -> Dict[str, List[Dict]]:
    """
    Build a personalized article selection for each subscriber in a batch.

    Every email in `subscribers` is passed, together with the same article
    pool, to select_personalized_articles; all other selection settings use
    that function's defaults.

    Args:
        articles: List of available articles
        subscribers: List of subscriber email addresses
        max_articles_per_user: Max articles per newsletter (default: 10)

    Returns:
        dict: Mapping of email -> personalized article list. An email that
        appears more than once is processed each time and keeps the result
        of its last occurrence.
    """
    return {
        email: select_personalized_articles(
            articles,
            email,
            max_articles=max_articles_per_user
        )
        for email in subscribers
    }