296 lines
9.5 KiB
Python
296 lines
9.5 KiB
Python
"""
|
|
Newsletter Personalization Service for Munich News Daily.
|
|
Ranks and selects articles based on user interest profiles.
|
|
"""
|
|
|
|
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

from services.interest_profiling_service import get_user_interests
|
|
|
|
|
|
def calculate_article_score(
    article: Dict,
    user_interests: Optional[Dict],
    category_weight: float = 0.4,
    keyword_weight: float = 0.6
) -> float:
    """
    Calculate personalization score for an article based on user interests.

    Score is calculated as:
    - Category match: the profile's interest value for the article's
      category (0.0 when the category is not in the profile)
    - Keyword match: average of the profile's interest values over the
      article keywords that appear in the profile (0.0 when none match)
    - Final score: (category_score * category_weight)
      + (keyword_score * keyword_weight)

    Args:
        article: Article dictionary with 'category' and 'keywords' fields.
            A missing category falls back to 'general'; missing or None
            keywords are treated as an empty list.
        user_interests: User interest profile with 'categories' and
            'keywords' mappings (None or empty for non-personalized)
        category_weight: Weight for category matching (default: 0.4)
        keyword_weight: Weight for keyword matching (default: 0.6)

    Returns:
        float: Personalization score rounded to 3 decimals. A neutral 0.5
        is returned when there is no interest profile. The result is not
        clamped; it stays within 0.0-1.0 only if the profile values are in
        that range and the weights sum to at most 1.0.
    """
    # No profile at all: every article gets the same neutral score.
    if not user_interests:
        return 0.5

    article_category = article.get('category', 'general')

    # `or` (rather than a .get default) also covers fields that are present
    # but stored as an explicit None, which would otherwise raise TypeError.
    article_keywords = article.get('keywords') or []
    user_categories = user_interests.get('categories') or {}
    user_keywords = user_interests.get('keywords') or {}

    category_score = user_categories.get(article_category, 0.0)

    # Only keywords known to the profile take part in the average, so
    # unknown keywords do not dilute the score.
    keyword_scores = [
        user_keywords[keyword]
        for keyword in article_keywords
        if keyword in user_keywords
    ]
    keyword_score = sum(keyword_scores) / len(keyword_scores) if keyword_scores else 0.0

    final_score = (category_score * category_weight) + (keyword_score * keyword_weight)
    return round(final_score, 3)
|
|
|
|
|
|
def rank_articles_for_user(
    articles: List[Dict],
    subscriber_email: str,
    personalization_ratio: float = 0.7
) -> List[Dict]:
    """
    Score every article for one subscriber and return them best-first.

    The subscriber's interest profile is looked up once and each article is
    scored against it. The input dictionaries are left untouched: every
    returned entry is a shallow copy carrying an extra
    'personalization_score' key.

    Args:
        articles: List of article dictionaries
        subscriber_email: Email address of the user
        personalization_ratio: Ratio of personalized vs trending content
            (default: 0.7). Not referenced by the ranking in this function;
            kept in the signature for callers that pass it.

    Returns:
        list: Scored copies of the articles, highest score first. Articles
        with equal scores keep their relative input order.
    """
    interests = get_user_interests(subscriber_email)

    def _scored_copy(item: Dict) -> Dict:
        # Score first, then attach the score to a shallow copy so the
        # caller's dictionary is never mutated.
        value = calculate_article_score(item, interests)
        enriched = item.copy()
        enriched['personalization_score'] = value
        return enriched

    # sorted() is stable, also with reverse=True, so ties stay in input order.
    return sorted(
        (_scored_copy(item) for item in articles),
        key=lambda entry: entry['personalization_score'],
        reverse=True,
    )
|
|
|
|
|
|
def select_personalized_articles(
    articles: List[Dict],
    subscriber_email: str,
    max_articles: int = 10,
    personalization_ratio: float = 0.7,
    min_score_threshold: float = 0.1
) -> List[Dict]:
    """
    Select and rank articles for a personalized newsletter.

    Strategy:
    - Up to int(max_articles * personalization_ratio) slots go to the
      highest-scoring articles whose score reaches min_score_threshold
    - The remaining slots go to the next-best ranked articles that were not
      already picked, regardless of the threshold
    - Personalized picks come first, followed by the fill-in picks

    NOTE(review): the fill-in picks are taken in score order, not by
    publication date; no date field is consulted here.

    Args:
        articles: List of available articles
        subscriber_email: Email address of the user
        max_articles: Maximum number of articles to include (default: 10).
            Zero or a negative value yields an empty selection.
        personalization_ratio: Ratio of personalized content (default: 0.7).
            Values outside 0.0-1.0 are clamped into that range when the
            slots are split.
        min_score_threshold: Minimum score to consider personalized (default: 0.1)

    Returns:
        list: Selected articles (scored copies) with personalization scores,
        at most max_articles long
    """
    # Guarding max_articles here keeps negative values from turning into
    # negative slice bounds further down.
    if not articles or max_articles <= 0:
        return []

    ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)

    # Clamp so neither slot count can become negative.
    ratio = min(max(personalization_ratio, 0.0), 1.0)
    num_personalized = int(max_articles * ratio)
    num_trending = max_articles - num_personalized

    personalized = [
        a for a in ranked_articles
        if a['personalization_score'] >= min_score_threshold
    ][:num_personalized]

    # Exclude already-picked entries by object identity, so articles without
    # an '_id' are handled correctly. Real '_id' values are still honoured,
    # so a second entry with the same '_id' is not selected again.
    picked_objects = {id(a) for a in personalized}
    picked_ids = {a.get('_id') for a in personalized if a.get('_id') is not None}
    trending = [
        a for a in ranked_articles
        if id(a) not in picked_objects and a.get('_id') not in picked_ids
    ][:num_trending]

    selected = personalized + trending

    # Defensive cap; the two slices above already add up to max_articles.
    return selected[:max_articles]
|
|
|
|
|
|
def get_personalization_explanation(
    article: Dict,
    user_interests: Optional[Dict]
) -> Dict[str, Any]:
    """
    Generate explanation for why an article was recommended.

    Useful for transparency and debugging.

    Args:
        article: Article dictionary; 'category' (default 'general') and
            'keywords' are read
        user_interests: User interest profile with 'categories' and
            'keywords' mappings, or None/empty when there is no profile

    Returns:
        dict: Explanation containing:
            - score: Overall personalization score (0.5 without a profile)
            - category_match: Profile value for the article's category
            - keyword_matches: List of {'keyword', 'score'} entries for
              article keywords found in the profile, in article order
            - reason: Human-readable explanation
    """
    if not user_interests:
        return {
            'score': 0.5,
            'category_match': 0.0,
            'keyword_matches': [],
            'reason': 'No personalization data available'
        }

    article_category = article.get('category', 'general')

    # Treat fields stored as an explicit None like missing ones for the
    # matching done in this function.
    article_keywords = article.get('keywords') or []
    user_categories = user_interests.get('categories') or {}
    user_keywords = user_interests.get('keywords') or {}

    category_score = user_categories.get(article_category, 0.0)

    keyword_matches = [
        {'keyword': keyword, 'score': user_keywords[keyword]}
        for keyword in article_keywords
        if keyword in user_keywords
    ]

    # Reuse the shared scoring function so the reported score always agrees
    # with the one used for ranking.
    overall_score = calculate_article_score(article, user_interests)

    if overall_score >= 0.5:
        reason = f"High match with your interests in {article_category}"
        if keyword_matches:
            # Name the two strongest matches rather than the first two; the
            # sort is stable, so equal scores keep article order.
            strongest = sorted(
                keyword_matches, key=lambda match: match['score'], reverse=True
            )[:2]
            top_keywords = [match['keyword'] for match in strongest]
            reason += f" and topics like {', '.join(top_keywords)}"
    elif overall_score >= 0.3:
        reason = "Moderate match with your interests"
    else:
        reason = "Trending article for diverse content"

    return {
        'score': overall_score,
        'category_match': category_score,
        'keyword_matches': keyword_matches,
        'reason': reason
    }
|
|
|
|
|
|
def get_personalization_stats(
    selected_articles: List[Dict],
    subscriber_email: str
) -> Dict[str, Any]:
    """
    Get statistics about personalization for a newsletter.

    Args:
        selected_articles: Articles selected for the newsletter. Each
            article's 'personalization_score' is read; an article without
            one counts as 0.0.
        subscriber_email: Email address of the user. Not referenced by the
            calculation; kept in the signature for callers that pass it.

    Returns:
        dict: Statistics containing:
            - total_articles: Number of articles
            - avg_score: Average personalization score, rounded to 3
              decimals (0.0 for an empty selection)
            - highly_personalized: Number of articles with score >= 0.5
            - moderately_personalized: Number with 0.3 <= score < 0.5
            - trending: Number with score < 0.3
    """
    if not selected_articles:
        return {
            'total_articles': 0,
            'avg_score': 0.0,
            'highly_personalized': 0,
            'moderately_personalized': 0,
            'trending': 0
        }

    scores = [a.get('personalization_score', 0.0) for a in selected_articles]
    avg_score = sum(scores) / len(scores)

    # The three buckets use the same 0.5 / 0.3 cut-offs as the wording in
    # get_personalization_explanation.
    highly_personalized = sum(1 for s in scores if s >= 0.5)
    moderately_personalized = sum(1 for s in scores if 0.3 <= s < 0.5)
    trending = sum(1 for s in scores if s < 0.3)

    return {
        'total_articles': len(selected_articles),
        'avg_score': round(avg_score, 3),
        'highly_personalized': highly_personalized,
        'moderately_personalized': moderately_personalized,
        'trending': trending
    }
|
|
|
|
|
|
def batch_personalize_newsletters(
    articles: List[Dict],
    subscribers: List[str],
    max_articles_per_user: int = 10
) -> Dict[str, List[Dict]]:
    """
    Build a personalized article selection for each subscriber.

    Intended for batch newsletter generation: the same pool of articles is
    run through select_personalized_articles once per subscriber.

    Args:
        articles: List of available articles
        subscribers: List of subscriber email addresses
        max_articles_per_user: Max articles per newsletter (default: 10)

    Returns:
        dict: Mapping of email -> personalized article list. If an email
        appears more than once, the result of its last occurrence is kept.
    """
    return {
        email: select_personalized_articles(
            articles,
            email,
            max_articles=max_articles_per_user,
        )
        for email in subscribers
    }
|