update

2025-11-18 14:45:41 +01:00
parent 2e80d64ff6
commit 84fce9a82c
19 changed files with 2437 additions and 3 deletions
--- a/backend/services/interest_profiling_service.py
+++ b/backend/services/interest_profiling_service.py
@@ -0,0 +1,323 @@
+"""
+User Interest Profiling Service for Munich News Daily.
+Builds and maintains user interest profiles based on article click behavior.
+"""
+
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional
+from database import link_clicks_collection
+from pymongo import MongoClient
+from config import Config
+
+# Connect to MongoDB.
+# NOTE: these statements run at import time, so importing this module
+# constructs a MongoClient from Config.MONGODB_URI as a side effect.
+# NOTE(review): `database` (imported above) already provides
+# link_clicks_collection; presumably it owns its own connection - confirm
+# whether a second client here is intentional.
+client = MongoClient(Config.MONGODB_URI)
+db = client[Config.DB_NAME]
+# Interest-profile documents; the functions in this module look them up by 'email'.
+user_interests_collection = db['user_interests']
+
+
+def update_user_interests(subscriber_email: str, keywords: List[str], category: str) -> Dict:
+    """
+    Update user interest profile based on a clicked article.
+
+    Loads the subscriber's profile (or starts a fresh one), raises the score
+    of the article's category and of each keyword by 0.1 (capped at 1.0 and
+    rounded to 3 decimals), increments the click counter and upserts the
+    result into user_interests_collection.
+
+    Args:
+        subscriber_email: Email address of the user
+        keywords: Keywords from the clicked article. None is treated as an
+            empty list and empty entries are skipped.
+        category: Category of the clicked article. An empty/None category is
+            skipped, mirroring the handling of empty keywords.
+
+    Returns:
+        dict: Updated user interest profile (the document read from MongoDB
+            when one already existed, otherwise the newly created dict)
+    """
+    score_step = 0.1
+    score_cap = 1.0
+    current_time = datetime.utcnow()
+
+    # NOTE: this is a read-modify-write sequence (find_one followed by
+    # update_one), so two concurrent clicks for the same subscriber can
+    # overwrite each other's increments.
+    profile = user_interests_collection.find_one({'email': subscriber_email})
+
+    if not profile:
+        # First click for this subscriber: start from an empty profile
+        profile = {
+            'email': subscriber_email,
+            'categories': {},
+            'keywords': {},
+            'total_clicks': 0,
+            'last_updated': current_time,
+            'created_at': current_time
+        }
+
+    # setdefault keeps a stored profile that lacks one of the maps usable
+    category_scores = profile.setdefault('categories', {})
+    keyword_scores = profile.setdefault('keywords', {})
+
+    def _bump(scores: Dict, key: str) -> None:
+        # Rounding avoids persisting float drift such as 0.30000000000000004
+        scores[key] = round(min(scores.get(key, 0.0) + score_step, score_cap), 3)
+
+    # A None key cannot be stored as a document field name, so skip it
+    if category:
+        _bump(category_scores, category)
+
+    for keyword in keywords or []:
+        if keyword:  # Skip empty keywords
+            _bump(keyword_scores, keyword)
+
+    # Update metadata
+    profile['total_clicks'] = profile.get('total_clicks', 0) + 1
+    profile['last_updated'] = current_time
+
+    # Upsert profile
+    user_interests_collection.update_one(
+        {'email': subscriber_email},
+        {'$set': profile},
+        upsert=True
+    )
+
+    return profile
+
+
+def get_user_interests(subscriber_email: str) -> Optional[Dict]:
+    """
+    Look up the stored interest profile of one subscriber.
+
+    Args:
+        subscriber_email: Email address of the user
+
+    Returns:
+        dict: The profile document whose 'email' equals subscriber_email,
+            or None if no such document exists
+    """
+    lookup = {'email': subscriber_email}
+    return user_interests_collection.find_one(lookup)
+
+
+def decay_user_interests(decay_factor: float = 0.95, days_threshold: int = 7) -> Dict[str, int]:
+    """
+    Shrink the interest scores of profiles that have gone stale.
+
+    Every profile whose 'last_updated' is older than days_threshold days has
+    each category and keyword score multiplied by decay_factor. Entries whose
+    decayed score falls below 0.05 are dropped; the rest are rounded to three
+    decimals. 'last_decayed' is stamped on each processed profile while
+    'last_updated' is left untouched.
+
+    Args:
+        decay_factor: Multiplier for interest scores (default: 0.95 = 5% decay)
+        days_threshold: Only decay profiles older than this many days (default: 7)
+
+    Returns:
+        dict: Statistics about the decay operation
+            - profiles_decayed: Number of profiles that were decayed
+            - profiles_checked: Total number of profiles checked
+    """
+    stale_before = datetime.utcnow() - timedelta(days=days_threshold)
+    stale_profiles = user_interests_collection.find({
+        'last_updated': {'$lt': stale_before}
+    })
+
+    def _shrink(scores):
+        # Keep only entries that are still >= 0.05 after decaying
+        return {
+            name: round(value * decay_factor, 3)
+            for name, value in scores.items()
+            if value * decay_factor >= 0.05
+        }
+
+    checked = 0
+    decayed = 0
+
+    for doc in stale_profiles:
+        checked += 1
+
+        shrunk_categories = _shrink(doc.get('categories', {}))
+        shrunk_keywords = _shrink(doc.get('keywords', {}))
+
+        # Persist the decayed maps for this subscriber
+        changes = {
+            'categories': shrunk_categories,
+            'keywords': shrunk_keywords,
+            'last_decayed': datetime.utcnow()
+        }
+        user_interests_collection.update_one({'email': doc['email']}, {'$set': changes})
+
+        decayed += 1
+
+    return {
+        'profiles_decayed': decayed,
+        'profiles_checked': checked
+    }
+
+
+def get_top_interests(subscriber_email: str, top_n: int = 10) -> Dict[str, List[tuple]]:
+    """
+    Return a subscriber's strongest categories and keywords.
+
+    Args:
+        subscriber_email: Email address of the user
+        top_n: Number of top interests to return (default: 10)
+
+    Returns:
+        dict: Top interests containing:
+            - top_categories: List of (category, score) tuples, highest first
+            - top_keywords: List of (keyword, score) tuples, highest first
+            Both lists are empty when the subscriber has no profile.
+    """
+    profile = get_user_interests(subscriber_email)
+
+    if not profile:
+        return {'top_categories': [], 'top_keywords': []}
+
+    def _leaders(scores):
+        # Highest score first; equal scores keep their stored order
+        ordered = list(scores.items())
+        ordered.sort(key=lambda pair: pair[1], reverse=True)
+        return ordered[:top_n]
+
+    best_categories = _leaders(profile.get('categories', {}))
+    best_keywords = _leaders(profile.get('keywords', {}))
+
+    return {
+        'top_categories': best_categories,
+        'top_keywords': best_keywords
+    }
+
+
+def build_interests_from_history(subscriber_email: str, days_lookback: int = 30) -> Dict:
+    """
+    Build or rebuild user interest profile from click history.
+
+    Useful for:
+    - Initializing profiles for existing users
+    - Rebuilding profiles after algorithm changes
+    - Backfilling data
+
+    Every clicked link of the subscriber inside the lookback window adds 0.1
+    to its category and to each of its keywords; final scores are capped at
+    1.0 and rounded to 3 decimals. The profile is only written when at least
+    one click was found, so an existing profile is left untouched otherwise.
+
+    Args:
+        subscriber_email: Email address of the user
+        days_lookback: Number of days of history to analyze (default: 30)
+
+    Returns:
+        dict: Newly built interest profile. Its 'created_at' is the build
+            time; in the database 'created_at' is only written when the
+            profile document is first inserted, so a rebuild keeps the
+            original creation time.
+    """
+    # One timestamp for the whole rebuild keeps cutoff/last_updated/created_at consistent
+    now = datetime.utcnow()
+    cutoff_date = now - timedelta(days=days_lookback)
+
+    # Get all clicks from this user in the lookback period
+    clicks = link_clicks_collection.find({
+        'subscriber_email': subscriber_email,
+        'clicked': True,
+        'clicked_at': {'$gte': cutoff_date}
+    })
+
+    category_totals = {}
+    keyword_totals = {}
+    total_clicks = 0
+
+    for click in clicks:
+        # `or` also covers records where the field is present but None/empty
+        category = click.get('category') or 'general'
+        category_totals[category] = category_totals.get(category, 0.0) + 0.1
+
+        for keyword in click.get('keywords') or []:
+            if keyword:
+                keyword_totals[keyword] = keyword_totals.get(keyword, 0.0) + 0.1
+
+        total_clicks += 1
+
+    def _capped(totals: Dict) -> Dict:
+        # Cap at 1.0 and round away float drift from the repeated additions
+        return {name: round(min(score, 1.0), 3) for name, score in totals.items()}
+
+    profile = {
+        'email': subscriber_email,
+        'categories': _capped(category_totals),
+        'keywords': _capped(keyword_totals),
+        'total_clicks': total_clicks,
+        'last_updated': now,
+        'created_at': now
+    }
+
+    # Save profile
+    if total_clicks > 0:
+        # created_at goes through $setOnInsert so rebuilding an existing
+        # profile does not reset its creation time; a field may not appear in
+        # both $set and $setOnInsert, hence the filtered copy.
+        refreshed_fields = {
+            field: value for field, value in profile.items() if field != 'created_at'
+        }
+        user_interests_collection.update_one(
+            {'email': subscriber_email},
+            {
+                '$set': refreshed_fields,
+                '$setOnInsert': {'created_at': now}
+            },
+            upsert=True
+        )
+
+    return profile
+
+
+def get_interest_statistics() -> Dict:
+    """
+    Summarise interest profiles across all users.
+
+    Returns:
+        dict: Statistics containing:
+            - total_users: Total number of users with profiles
+            - avg_clicks_per_user: Average number of clicks per user (2 decimals)
+            - most_popular_categories: Up to 10 (category, summed score) tuples
+            - most_popular_keywords: Up to 10 (keyword, summed score) tuples
+    """
+    user_count = user_interests_collection.count_documents({})
+
+    if user_count == 0:
+        return {
+            'total_users': 0,
+            'avg_clicks_per_user': 0,
+            'most_popular_categories': [],
+            'most_popular_keywords': []
+        }
+
+    # Sum total_clicks over every profile in a single aggregation group
+    click_sum_stage = {
+        '$group': {
+            '_id': None,
+            'total_clicks': {'$sum': '$total_clicks'}
+        }
+    }
+    grouped = list(user_interests_collection.aggregate([click_sum_stage]))
+    click_sum = grouped[0]['total_clicks'] if grouped else 0
+    # user_count is non-zero here because of the early return above
+    mean_clicks = click_sum / user_count
+
+    def _accumulate(totals, scores):
+        for name, value in scores.items():
+            totals[name] = totals.get(name, 0) + value
+
+    def _top_ten(totals):
+        return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)[:10]
+
+    # Add up each category/keyword score over all profiles
+    category_totals = {}
+    keyword_totals = {}
+    for doc in user_interests_collection.find({}):
+        _accumulate(category_totals, doc.get('categories', {}))
+        _accumulate(keyword_totals, doc.get('keywords', {}))
+
+    return {
+        'total_users': user_count,
+        'avg_clicks_per_user': round(mean_clicks, 2),
+        'most_popular_categories': _top_ten(category_totals),
+        'most_popular_keywords': _top_ten(keyword_totals)
+    }
+
+
+def delete_user_interests(subscriber_email: str) -> bool:
+    """
+    Remove a subscriber's interest profile (for GDPR compliance).
+
+    Args:
+        subscriber_email: Email address of the user
+
+    Returns:
+        bool: True if a profile document was deleted, False if none matched
+    """
+    outcome = user_interests_collection.delete_one({'email': subscriber_email})
+    was_removed = outcome.deleted_count > 0
+    return was_removed
--- a/backend/services/personalization_service.py
+++ b/backend/services/personalization_service.py
@@ -0,0 +1,295 @@
+"""
+Newsletter Personalization Service for Munich News Daily.
+Ranks and selects articles based on user interest profiles.
+"""
+
+from typing import Dict, List, Optional
+from datetime import datetime, timedelta
+from services.interest_profiling_service import get_user_interests
+
+
+def calculate_article_score(
+    article: Dict,
+    user_interests: Optional[Dict],
+    category_weight: float = 0.4,
+    keyword_weight: float = 0.6
+) -> float:
+    """
+    Calculate personalization score for an article based on user interests.
+
+    Score is calculated as:
+    - Category match: the user's stored score for the article's category
+      (0.0 when the user has no score for it)
+    - Keyword match: average of the user's scores for those article keywords
+      that appear in the profile (0.0 when none match)
+    - Final score: (category_score * category_weight) + (keyword_score * keyword_weight)
+
+    Args:
+        article: Article dictionary with 'category' and 'keywords' fields.
+            A missing category counts as 'general'; missing or None keywords
+            count as no keywords.
+        user_interests: User interest profile (None for non-personalized)
+        category_weight: Weight for category matching (default: 0.4)
+        keyword_weight: Weight for keyword matching (default: 0.6)
+
+    Returns:
+        float: Personalization score rounded to 3 decimals. 0.5 (neutral) when
+            there is no profile. With the default weights and profile scores
+            in [0, 1] the result lies between 0.0 and 1.0.
+    """
+    # If no user interests, return neutral score
+    if not user_interests:
+        return 0.5
+
+    article_category = article.get('category', 'general')
+    # `or` also covers fields that are present but None
+    article_keywords = article.get('keywords') or []
+    user_categories = user_interests.get('categories') or {}
+    user_keywords = user_interests.get('keywords') or {}
+
+    category_score = user_categories.get(article_category, 0.0)
+
+    # Only keywords the user has a score for take part in the average
+    matched_scores = [
+        user_keywords[keyword]
+        for keyword in article_keywords
+        if keyword in user_keywords
+    ]
+    keyword_score = sum(matched_scores) / len(matched_scores) if matched_scores else 0.0
+
+    # Weighted final score
+    final_score = (category_score * category_weight) + (keyword_score * keyword_weight)
+
+    return round(final_score, 3)
+
+
+def rank_articles_for_user(
+    articles: List[Dict],
+    subscriber_email: str,
+    personalization_ratio: float = 0.7
+) -> List[Dict]:
+    """
+    Score every article against a subscriber's interest profile and order them.
+
+    Args:
+        articles: List of article dictionaries
+        subscriber_email: Email address of the user
+        personalization_ratio: Ratio of personalized vs trending (default: 0.7).
+            Accepted for interface compatibility; this function does not use it.
+
+    Returns:
+        list: Shallow copies of the articles, each with a
+            'personalization_score' key added, sorted by that score with the
+            highest first. The input dictionaries are not modified.
+    """
+    interests = get_user_interests(subscriber_email)
+
+    def _scored_copy(article):
+        score = calculate_article_score(article, interests)
+        # Work on a copy so the caller's article stays untouched
+        annotated = article.copy()
+        annotated['personalization_score'] = score
+        return annotated
+
+    return sorted(
+        [_scored_copy(article) for article in articles],
+        key=lambda item: item['personalization_score'],
+        reverse=True
+    )
+
+
+def select_personalized_articles(
+    articles: List[Dict],
+    subscriber_email: str,
+    max_articles: int = 10,
+    personalization_ratio: float = 0.7,
+    min_score_threshold: float = 0.1
+) -> List[Dict]:
+    """
+    Select and rank articles for a personalized newsletter.
+
+    Strategy:
+    - Up to max_articles * personalization_ratio slots go to the highest
+      scoring articles whose score reaches min_score_threshold (personalized)
+    - All remaining slots are filled with the next articles in ranking order,
+      regardless of threshold (diverse content)
+    - If fewer articles qualify as personalized, the freed slots are also
+      filled from the remaining articles, so the result only falls short of
+      max_articles when there are not enough articles overall
+
+    Args:
+        articles: List of available articles
+        subscriber_email: Email address of the user
+        max_articles: Maximum number of articles to include (default: 10)
+        personalization_ratio: Ratio of personalized content (default: 0.7);
+            values outside 0.0-1.0 are clamped to that range
+        min_score_threshold: Minimum score to consider personalized (default: 0.1)
+
+    Returns:
+        list: Selected articles with personalization scores, personalized
+            picks first. Empty when there are no articles or max_articles <= 0.
+    """
+    if not articles or max_articles <= 0:
+        return []
+
+    # Rank all articles
+    ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)
+
+    # Clamp so the slot counts can never become negative slice bounds
+    ratio = min(max(personalization_ratio, 0.0), 1.0)
+    num_personalized = int(max_articles * ratio)
+
+    # Split by position in the ranking rather than by '_id': articles without
+    # an '_id' would otherwise all look identical (None) and be excluded
+    # from the fill-up list together.
+    personalized = []
+    remainder = []
+    for article in ranked_articles:
+        qualifies = article['personalization_score'] >= min_score_threshold
+        if qualifies and len(personalized) < num_personalized:
+            personalized.append(article)
+        else:
+            remainder.append(article)
+
+    # Fill every slot the personalized picks did not use
+    open_slots = max_articles - len(personalized)
+
+    # Combine: personalized first, then the fill-up articles
+    return personalized + remainder[:open_slots]
+
+
+def get_personalization_explanation(
+    article: Dict,
+    user_interests: Optional[Dict]
+) -> Dict[str, object]:
+    """
+    Generate explanation for why an article was recommended.
+
+    Useful for transparency and debugging.
+
+    Args:
+        article: Article dictionary ('category' defaults to 'general';
+            missing or None 'keywords' count as no keywords)
+        user_interests: User interest profile
+
+    Returns:
+        dict: Explanation containing:
+            - score: Overall personalization score
+            - category_match: User's score for the article's category
+            - keyword_matches: List of {'keyword', 'score'} dicts for article
+              keywords found in the profile, in article order
+            - reason: Human-readable explanation
+    """
+    if not user_interests:
+        return {
+            'score': 0.5,
+            'category_match': 0.0,
+            'keyword_matches': [],
+            'reason': 'No personalization data available'
+        }
+
+    article_category = article.get('category', 'general')
+    # `or` also covers fields that are present but None
+    article_keywords = article.get('keywords') or []
+
+    user_categories = user_interests.get('categories') or {}
+    user_keywords = user_interests.get('keywords') or {}
+
+    # Category match
+    category_score = user_categories.get(article_category, 0.0)
+
+    # Keyword matches
+    keyword_matches = [
+        {'keyword': keyword, 'score': user_keywords[keyword]}
+        for keyword in article_keywords
+        if keyword in user_keywords
+    ]
+
+    # Calculate overall score
+    overall_score = calculate_article_score(article, user_interests)
+
+    # Generate reason; the thresholds match the buckets of 0.5 and 0.3
+    if overall_score >= 0.5:
+        reason = f"High match with your interests in {article_category}"
+        if keyword_matches:
+            # Mention at most the first two matching keywords
+            named_keywords = [match['keyword'] for match in keyword_matches[:2]]
+            reason += f" and topics like {', '.join(named_keywords)}"
+    elif overall_score >= 0.3:
+        reason = "Moderate match with your interests"
+    else:
+        reason = "Trending article for diverse content"
+
+    return {
+        'score': overall_score,
+        'category_match': category_score,
+        'keyword_matches': keyword_matches,
+        'reason': reason
+    }
+
+
+def get_personalization_stats(
+    selected_articles: List[Dict],
+    subscriber_email: str
+) -> Dict[str, object]:
+    """
+    Get statistics about personalization for a newsletter.
+
+    Args:
+        selected_articles: Articles selected for the newsletter. An article
+            without a 'personalization_score' (or with None) counts as 0.0.
+        subscriber_email: Email address of the user (not used in the
+            calculation)
+
+    Returns:
+        dict: Statistics containing:
+            - total_articles: Number of articles
+            - avg_score: Average personalization score (3 decimals)
+            - highly_personalized: Number of articles with score >= 0.5
+            - moderately_personalized: Number with score 0.3-0.5
+            - trending: Number with score < 0.3
+    """
+    if not selected_articles:
+        return {
+            'total_articles': 0,
+            'avg_score': 0.0,
+            'highly_personalized': 0,
+            'moderately_personalized': 0,
+            'trending': 0
+        }
+
+    # `or 0.0` keeps a stored None from breaking sum() and the comparisons
+    scores = [a.get('personalization_score') or 0.0 for a in selected_articles]
+    avg_score = sum(scores) / len(scores)
+
+    highly_personalized = sum(1 for s in scores if s >= 0.5)
+    moderately_personalized = sum(1 for s in scores if 0.3 <= s < 0.5)
+    trending = sum(1 for s in scores if s < 0.3)
+
+    return {
+        'total_articles': len(selected_articles),
+        'avg_score': round(avg_score, 3),
+        'highly_personalized': highly_personalized,
+        'moderately_personalized': moderately_personalized,
+        'trending': trending
+    }
+
+
+def batch_personalize_newsletters(
+    articles: List[Dict],
+    subscribers: List[str],
+    max_articles_per_user: int = 10
+) -> Dict[str, List[Dict]]:
+    """
+    Build a personalized article selection for each subscriber in a batch.
+
+    Args:
+        articles: List of available articles
+        subscribers: List of subscriber email addresses
+        max_articles_per_user: Max articles per newsletter (default: 10)
+
+    Returns:
+        dict: Mapping of email -> personalized article list; an email listed
+            more than once ends up with a single entry
+    """
+    return {
+        email: select_personalized_articles(
+            articles,
+            email,
+            max_articles=max_articles_per_user
+        )
+        for email in subscribers
+    }
--- a/backend/services/tracking_service.py
+++ b/backend/services/tracking_service.py
@@ -80,6 +80,9 @@ def create_newsletter_tracking(
    link_tracking_map = {}
    
    if article_links:
+        # Import here to avoid circular dependency
+        from database import articles_collection
+        
        for article in article_links:
            article_url = article.get('url')
            article_title = article.get('title', '')
@@ -87,13 +90,22 @@ def create_newsletter_tracking(
            if article_url:
                link_tracking_id = generate_tracking_id()
                
-                # Create link click tracking record
+                # Look up article metadata from database for personalization
+                article_doc = articles_collection.find_one({'link': article_url})
+                article_id = str(article_doc['_id']) if article_doc else None
+                category = article_doc.get('category', 'general') if article_doc else 'general'
+                keywords = article_doc.get('keywords', []) if article_doc else []
+                
+                # Create link click tracking record with metadata
                link_click_doc = {
                    'tracking_id': link_tracking_id,
                    'newsletter_id': newsletter_id,
                    'subscriber_email': subscriber_email,
                    'article_url': article_url,
                    'article_title': article_title,
+                    'article_id': article_id,           # NEW: Article database ID
+                    'category': category,                # NEW: Article category
+                    'keywords': keywords,                # NEW: Article keywords for personalization
                    'clicked': False,
                    'clicked_at': None,
                    'user_agent': None,