update

2025-11-18 14:45:41 +01:00
parent 2e80d64ff6
commit 84fce9a82c
19 changed files with 2437 additions and 3 deletions
--- a/backend/services/personalization_service.py
+++ b/backend/services/personalization_service.py
@@ -0,0 +1,295 @@
+"""
+Newsletter Personalization Service for Munich News Daily.
+Ranks and selects articles based on user interest profiles.
+"""
+
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional
+
+from services.interest_profiling_service import get_user_interests
+
+
+def calculate_article_score(
+    article: Dict,
+    user_interests: Optional[Dict],
+    category_weight: float = 0.4,
+    keyword_weight: float = 0.6
+) -> float:
+    """
+    Calculate personalization score for an article based on user interests.
+
+    Score is calculated as:
+    - Category match: the value stored in the profile for the article's
+      category (0.0 when the profile has no entry for it)
+    - Keyword match: average of the profile values of the article keywords
+      that appear in the profile (0.0 when none of them appear)
+    - Final score: (category_score * category_weight)
+      + (keyword_score * keyword_weight), rounded to 3 decimals
+
+    A missing, None or empty 'category' is treated as 'general'; a missing
+    or None 'keywords' list, and missing or None 'categories' / 'keywords'
+    maps in the profile, are treated as empty instead of raising.
+
+    Args:
+        article: Article dictionary with 'category' and 'keywords' fields
+        user_interests: User interest profile (None for non-personalized)
+        category_weight: Weight for category matching (default: 0.4)
+        keyword_weight: Weight for keyword matching (default: 0.6)
+
+    Returns:
+        float: 0.5 (neutral) when there is no profile, otherwise the
+        weighted score. The value is not clamped: it stays within 0.0-1.0
+        only if the profile values lie in that range and the two weights
+        sum to at most 1.0.
+    """
+    # If no user interests, return neutral score
+    if not user_interests:
+        return 0.5
+
+    # `or` fallbacks also cover keys that exist but hold None, which
+    # .get(key, default) would hand through unchanged and crash on below.
+    article_category = article.get('category') or 'general'
+    article_keywords = article.get('keywords') or []
+    user_categories = user_interests.get('categories') or {}
+    user_keywords = user_interests.get('keywords') or {}
+
+    # Category score: direct lookup, 0.0 when the user has no such category
+    category_score = user_categories.get(article_category, 0.0)
+
+    # Keyword score: average over the keywords the profile knows about;
+    # keywords absent from the profile do not pull the average down.
+    keyword_scores = [
+        user_keywords[keyword]
+        for keyword in article_keywords
+        if keyword in user_keywords
+    ]
+    keyword_score = sum(keyword_scores) / len(keyword_scores) if keyword_scores else 0.0
+
+    # Weighted final score
+    final_score = (category_score * category_weight) + (keyword_score * keyword_weight)
+
+    return round(final_score, 3)
+
+
+def rank_articles_for_user(
+    articles: List[Dict],
+    subscriber_email: str,
+    personalization_ratio: float = 0.7
+) -> List[Dict]:
+    """
+    Score every article for one subscriber and order them best-first.
+
+    The subscriber's interest profile is fetched once via
+    get_user_interests; each article is then scored with
+    calculate_article_score. No personalized/trending mixing happens in
+    this function; it only scores and sorts.
+
+    Args:
+        articles: List of article dictionaries
+        subscriber_email: Email address of the user
+        personalization_ratio: Accepted for interface compatibility; this
+            function does not read it
+
+    Returns:
+        list: Shallow copies of the input articles, each with a
+        'personalization_score' key added, sorted by that score in
+        descending order. The input dictionaries are left unmodified.
+    """
+    profile = get_user_interests(subscriber_email)
+
+    def _annotate(item: Dict) -> Dict:
+        # Work on a shallow copy so the caller's dictionary stays untouched.
+        value = calculate_article_score(item, profile)
+        annotated = item.copy()
+        annotated['personalization_score'] = value
+        return annotated
+
+    # sorted() is stable, so equally scored articles keep their input order.
+    return sorted(
+        (_annotate(item) for item in articles),
+        key=lambda entry: entry['personalization_score'],
+        reverse=True,
+    )
+
+
+def select_personalized_articles(
+    articles: List[Dict],
+    subscriber_email: str,
+    max_articles: int = 10,
+    personalization_ratio: float = 0.7,
+    min_score_threshold: float = 0.1
+) -> List[Dict]:
+    """
+    Select and rank articles for a personalized newsletter.
+
+    Strategy:
+    - Personalized part: up to int(max_articles * personalization_ratio)
+      of the highest scoring articles whose score reaches
+      min_score_threshold
+    - Trending part: up to (max_articles - that quota) of the articles not
+      already picked, taken in ranking order
+    - Result is the personalized part followed by the trending part
+
+    Args:
+        articles: List of available articles
+        subscriber_email: Email address of the user
+        max_articles: Maximum number of articles to include (default: 10);
+            zero or negative yields an empty list
+        personalization_ratio: Ratio of personalized content (default: 0.7);
+            values outside 0.0-1.0 are clamped to that range
+        min_score_threshold: Minimum score to consider personalized (default: 0.1)
+
+    Returns:
+        list: Selected articles with personalization scores, at most
+        max_articles long
+    """
+    # Nothing to choose from, or no room: skip ranking altogether.
+    if not articles or max_articles <= 0:
+        return []
+
+    # Rank all articles
+    ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)
+
+    # Clamp the ratio so neither quota can go negative; a negative slice
+    # bound would cut from the end of the list instead of limiting it.
+    ratio = min(max(personalization_ratio, 0.0), 1.0)
+    num_personalized = int(max_articles * ratio)
+    num_trending = max_articles - num_personalized
+
+    # Get personalized articles (high scoring)
+    personalized = [
+        a for a in ranked_articles
+        if a['personalization_score'] >= min_score_threshold
+    ][:num_personalized]
+
+    # Exclude what was already picked. Object identity covers articles that
+    # carry no '_id' (comparing None against None would otherwise discard
+    # every id-less article); the '_id' check still removes duplicates of a
+    # picked article that do carry the same id.
+    picked_objects = {id(a) for a in personalized}
+    picked_ids = {a['_id'] for a in personalized if a.get('_id') is not None}
+
+    # NOTE(review): the remainder is taken in score order. No timestamp
+    # field is visible in this file, so "most recent" ordering is not
+    # applied here - confirm the intended field before changing this.
+    trending = [
+        a for a in ranked_articles
+        if id(a) not in picked_objects
+        and (a.get('_id') is None or a['_id'] not in picked_ids)
+    ][:num_trending]
+
+    # Combine: personalized first, then trending. The two quotas add up to
+    # max_articles, so the result never exceeds it.
+    return personalized + trending
+
+
+def get_personalization_explanation(
+    article: Dict,
+    user_interests: Optional[Dict]
+) -> Dict[str, Any]:
+    """
+    Generate explanation for why an article was recommended.
+
+    Useful for transparency and debugging.
+
+    Args:
+        article: Article dictionary
+        user_interests: User interest profile
+
+    Returns:
+        dict: Explanation containing:
+            - score: Overall personalization score (0.5 without a profile)
+            - category_match: Category score
+            - keyword_matches: List of matching keywords with scores, in
+              the order the keywords appear on the article
+            - reason: Human-readable explanation; for scores >= 0.5 it
+              names the category only when the category itself matched,
+              and up to two of the highest scoring matching keywords
+    """
+    if not user_interests:
+        return {
+            'score': 0.5,
+            'category_match': 0.0,
+            'keyword_matches': [],
+            'reason': 'No personalization data available'
+        }
+
+    # Same fields calculate_article_score reads; `or` also covers keys that
+    # are present but None.
+    article_category = article.get('category') or 'general'
+    article_keywords = article.get('keywords') or []
+
+    user_categories = user_interests.get('categories') or {}
+    user_keywords = user_interests.get('keywords') or {}
+
+    # Category match
+    category_score = user_categories.get(article_category, 0.0)
+
+    # Keyword matches
+    keyword_matches = [
+        {'keyword': keyword, 'score': user_keywords[keyword]}
+        for keyword in article_keywords
+        if keyword in user_keywords
+    ]
+
+    # Calculate overall score
+    overall_score = calculate_article_score(article, user_interests)
+
+    # Generate reason
+    if overall_score >= 0.5:
+        # Strongest matches first; the sort is stable, so ties keep the
+        # article's keyword order.
+        strongest = sorted(keyword_matches, key=lambda m: m['score'], reverse=True)[:2]
+        top_keywords = [m['keyword'] for m in strongest]
+
+        if category_score > 0:
+            reason = f"High match with your interests in {article_category}"
+            if top_keywords:
+                reason += f" and topics like {', '.join(top_keywords)}"
+        elif top_keywords:
+            # The score came from keywords alone: do not claim an interest
+            # in a category the profile has no positive value for.
+            reason = f"High match with your interests in topics like {', '.join(top_keywords)}"
+        else:
+            reason = "High match with your interests"
+    elif overall_score >= 0.3:
+        reason = "Moderate match with your interests"
+    else:
+        reason = "Trending article for diverse content"
+
+    return {
+        'score': overall_score,
+        'category_match': category_score,
+        'keyword_matches': keyword_matches,
+        'reason': reason
+    }
+
+
+def get_personalization_stats(
+    selected_articles: List[Dict],
+    subscriber_email: str
+) -> Dict[str, Any]:
+    """
+    Get statistics about personalization for a newsletter.
+
+    Args:
+        selected_articles: Articles selected for the newsletter; each is
+            read for its 'personalization_score' (missing or None counts
+            as 0.0)
+        subscriber_email: Email address of the user (not read by this
+            function)
+
+    Returns:
+        dict: Statistics containing:
+            - total_articles: Number of articles
+            - avg_score: Average personalization score, rounded to 3 decimals
+            - highly_personalized: Number of articles with score >= 0.5
+            - moderately_personalized: Number with 0.3 <= score < 0.5
+            - trending: Number with score < 0.3
+    """
+    if not selected_articles:
+        return {
+            'total_articles': 0,
+            'avg_score': 0.0,
+            'highly_personalized': 0,
+            'moderately_personalized': 0,
+            'trending': 0
+        }
+
+    # `or 0.0` also maps an explicit None score to 0.0, so the sum and the
+    # comparisons below cannot fail on a present-but-empty field.
+    scores = [a.get('personalization_score') or 0.0 for a in selected_articles]
+    avg_score = sum(scores) / len(scores)
+
+    # The three buckets are disjoint and cover every score.
+    highly_personalized = sum(1 for s in scores if s >= 0.5)
+    moderately_personalized = sum(1 for s in scores if 0.3 <= s < 0.5)
+    trending = sum(1 for s in scores if s < 0.3)
+
+    return {
+        'total_articles': len(selected_articles),
+        'avg_score': round(avg_score, 3),
+        'highly_personalized': highly_personalized,
+        'moderately_personalized': moderately_personalized,
+        'trending': trending
+    }
+
+
+def batch_personalize_newsletters(
+    articles: List[Dict],
+    subscribers: List[str],
+    max_articles_per_user: int = 10
+) -> Dict[str, List[Dict]]:
+    """
+    Build one personalized article selection per subscriber.
+
+    Each address is passed to select_personalized_articles together with
+    the shared article pool; the other selection parameters keep their
+    defaults.
+
+    Args:
+        articles: List of available articles
+        subscribers: List of subscriber email addresses
+        max_articles_per_user: Max articles per newsletter (default: 10)
+
+    Returns:
+        dict: Mapping of email -> personalized article list. An address
+        listed more than once ends up with a single entry holding the
+        result of its last occurrence.
+    """
+    return {
+        email: select_personalized_articles(
+            articles,
+            email,
+            max_articles=max_articles_per_user,
+        )
+        for email in subscribers
+    }