This commit is contained in:
2025-11-18 14:45:41 +01:00
parent 2e80d64ff6
commit 84fce9a82c
19 changed files with 2437 additions and 3 deletions

View File

@@ -0,0 +1,323 @@
"""
User Interest Profiling Service for Munich News Daily.
Builds and maintains user interest profiles based on article click behavior.
"""
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from database import link_clicks_collection
from pymongo import MongoClient
from config import Config
# Connect to MongoDB. These statements run when the module is imported: the
# client is built from Config.MONGODB_URI and the database is selected by
# Config.DB_NAME.
# NOTE(review): link_clicks_collection is imported from `database`, which
# presumably holds its own client - confirm whether that connection can be reused.
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
# Collection holding the interest profiles; the functions in this module
# look documents up by their 'email' field.
user_interests_collection = db['user_interests']
def update_user_interests(subscriber_email: str, keywords: List[str], category: str) -> Dict:
    """
    Update a user's interest profile after an article click.

    Adds 0.1 to the score of the article's category and of every keyword
    (each capped at 1.0), increments the click counter and upserts the
    profile. A new profile is created when none exists for the email.

    Args:
        subscriber_email: Email address of the user
        keywords: Keywords of the clicked article; None and empty entries
            are ignored
        category: Category of the clicked article; ignored when empty or None

    Returns:
        dict: The updated profile as it was written to the database

    Note:
        The profile is read, modified in Python and written back, so two
        overlapping calls for the same user can overwrite each other.
    """
    current_time = datetime.utcnow()

    # Get existing profile or start a new one
    profile = user_interests_collection.find_one({'email': subscriber_email})
    if not profile:
        profile = {
            'email': subscriber_email,
            'categories': {},
            'keywords': {},
            'total_clicks': 0,
            'last_updated': current_time,
            'created_at': current_time,
        }

    # Tolerate stored profiles whose score maps are missing or null
    category_scores = profile.get('categories') or {}
    keyword_scores = profile.get('keywords') or {}
    profile['categories'] = category_scores
    profile['keywords'] = keyword_scores

    def bump(scores: Dict, name: str) -> None:
        # Rounded to 3 decimals (the precision decay_user_interests stores)
        # so repeated additions do not persist values like 0.30000000000000004
        scores[name] = round(min(scores.get(name, 0.0) + 0.1, 1.0), 3)

    # An empty category is skipped for the same reason empty keywords are:
    # it would only create a meaningless (or invalid, for None) key
    if category:
        bump(category_scores, category)

    for keyword in keywords or []:
        if keyword:  # Skip empty keywords
            bump(keyword_scores, keyword)

    # Update metadata
    profile['total_clicks'] = (profile.get('total_clicks') or 0) + 1
    profile['last_updated'] = current_time

    # Upsert profile
    user_interests_collection.update_one(
        {'email': subscriber_email},
        {'$set': profile},
        upsert=True,
    )
    return profile
def get_user_interests(subscriber_email: str) -> Optional[Dict]:
    """
    Fetch the stored interest profile of a single user.

    Args:
        subscriber_email: Email address of the user

    Returns:
        dict: The profile document whose 'email' matches, or None when
        there is no such document
    """
    lookup = {'email': subscriber_email}
    return user_interests_collection.find_one(lookup)
def decay_user_interests(decay_factor: float = 0.95, days_threshold: int = 7) -> Dict[str, int]:
    """
    Shrink the interest scores of users without recent clicks.

    Every profile whose 'last_updated' is older than the threshold has its
    category and keyword scores multiplied by the decay factor. Scores that
    drop below 0.05 are removed, the rest are stored rounded to 3 decimals.
    'last_updated' itself is not modified, so a stale profile is decayed
    again on every call; only 'last_decayed' records the run.

    Args:
        decay_factor: Multiplier for interest scores (default: 0.95 = 5% decay)
        days_threshold: Only decay profiles older than this many days (default: 7)

    Returns:
        dict: Statistics about the decay operation
            - profiles_decayed: Number of profiles that were decayed
            - profiles_checked: Total number of profiles checked
    """
    def faded(scores):
        # Scale every score and keep only those still at or above 0.05
        kept = {}
        for name, value in scores.items():
            scaled = value * decay_factor
            if scaled >= 0.05:
                kept[name] = round(scaled, 3)
        return kept

    stale_before = datetime.utcnow() - timedelta(days=days_threshold)
    stale_profiles = user_interests_collection.find(
        {'last_updated': {'$lt': stale_before}}
    )

    # Every profile returned by the query is decayed, so a single counter
    # feeds both statistics
    processed = 0
    for stale in stale_profiles:
        new_fields = {
            'categories': faded(stale.get('categories', {})),
            'keywords': faded(stale.get('keywords', {})),
            'last_decayed': datetime.utcnow(),
        }
        user_interests_collection.update_one(
            {'email': stale['email']},
            {'$set': new_fields},
        )
        processed += 1

    return {
        'profiles_decayed': processed,
        'profiles_checked': processed,
    }
def get_top_interests(subscriber_email: str, top_n: int = 10) -> Dict[str, List[tuple]]:
    """
    Return a user's strongest interests, highest score first.

    Args:
        subscriber_email: Email address of the user
        top_n: Number of top interests to return (default: 10)

    Returns:
        dict: Top interests containing:
            - top_categories: List of (category, score) tuples
            - top_keywords: List of (keyword, score) tuples
        Both lists are empty when the user has no profile.
    """
    profile = get_user_interests(subscriber_email)
    if not profile:
        return {
            'top_categories': [],
            'top_keywords': []
        }

    def strongest(scores):
        # Highest score first, truncated to the requested length
        ordered = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return ordered[:top_n]

    category_ranking = strongest(profile.get('categories', {}))
    keyword_ranking = strongest(profile.get('keywords', {}))
    return {
        'top_categories': category_ranking,
        'top_keywords': keyword_ranking
    }
def build_interests_from_history(subscriber_email: str, days_lookback: int = 30) -> Dict:
    """
    Build or rebuild a user's interest profile from the click history.

    Useful for:
    - Initializing profiles for existing users
    - Rebuilding profiles after algorithm changes
    - Backfilling data

    Every recorded click inside the lookback window adds 0.1 to its category
    and to each of its keywords; scores are capped at 1.0 and stored rounded
    to 3 decimals. Clicks without a category count towards 'general'.

    Args:
        subscriber_email: Email address of the user
        days_lookback: Number of days of history to analyze (default: 30)

    Returns:
        dict: Newly built interest profile. It is only saved when at least
        one click was found. 'created_at' in the returned dict is the time
        of this call; a 'created_at' already stored for the user is left
        untouched in the database.
    """
    # One timestamp for the whole run keeps the cutoff, last_updated and
    # created_at consistent with each other
    now = datetime.utcnow()
    cutoff_date = now - timedelta(days=days_lookback)

    # Get all clicks from this user in the lookback period
    clicks = link_clicks_collection.find({
        'subscriber_email': subscriber_email,
        'clicked': True,
        'clicked_at': {'$gte': cutoff_date}
    })

    # Count occurrences first; scores are derived from the counts afterwards
    category_hits = {}
    keyword_hits = {}
    total_clicks = 0
    for click in clicks:
        # `or` also covers fields that are present but null/empty
        category = click.get('category') or 'general'
        category_hits[category] = category_hits.get(category, 0) + 1
        for keyword in click.get('keywords') or []:
            if keyword:
                keyword_hits[keyword] = keyword_hits.get(keyword, 0) + 1
        total_clicks += 1

    def to_scores(hits):
        # 0.1 per click, capped at 1.0; rounding removes float noise such
        # as 0.30000000000000004
        return {name: round(min(count * 0.1, 1.0), 3) for name, count in hits.items()}

    profile = {
        'email': subscriber_email,
        'categories': to_scores(category_hits),
        'keywords': to_scores(keyword_hits),
        'total_clicks': total_clicks,
        'last_updated': now,
        'created_at': now
    }

    # Save profile
    if total_clicks > 0:
        # created_at goes through $setOnInsert so rebuilding an existing
        # profile does not overwrite its original creation time
        stored_fields = {
            field: value for field, value in profile.items()
            if field != 'created_at'
        }
        user_interests_collection.update_one(
            {'email': subscriber_email},
            {
                '$set': stored_fields,
                '$setOnInsert': {'created_at': now}
            },
            upsert=True
        )
    return profile
def get_interest_statistics() -> Dict:
    """
    Summarize interest data across all stored profiles.

    Returns:
        dict: Statistics containing:
            - total_users: Total number of users with profiles
            - avg_clicks_per_user: Average number of clicks per user
            - most_popular_categories: Top 10 (category, summed score) pairs
            - most_popular_keywords: Top 10 (keyword, summed score) pairs
    """
    user_count = user_interests_collection.count_documents({})
    if user_count == 0:
        return {
            'total_users': 0,
            'avg_clicks_per_user': 0,
            'most_popular_categories': [],
            'most_popular_keywords': []
        }

    # Sum total_clicks over all profiles on the database side
    click_totals = list(user_interests_collection.aggregate([
        {'$group': {'_id': None, 'total_clicks': {'$sum': '$total_clicks'}}}
    ]))
    click_sum = click_totals[0]['total_clicks'] if click_totals else 0
    # user_count is non-zero here because of the early return above
    mean_clicks = click_sum / user_count

    # Add up the per-user scores of every category and keyword
    category_totals = {}
    keyword_totals = {}
    for doc in user_interests_collection.find({}):
        for totals, field in ((category_totals, 'categories'), (keyword_totals, 'keywords')):
            for name, score in doc.get(field, {}).items():
                totals[name] = totals.get(name, 0) + score

    def ten_largest(totals):
        # Highest summed score first
        return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)[:10]

    return {
        'total_users': user_count,
        'avg_clicks_per_user': round(mean_clicks, 2),
        'most_popular_categories': ten_largest(category_totals),
        'most_popular_keywords': ten_largest(keyword_totals)
    }
def delete_user_interests(subscriber_email: str) -> bool:
    """
    Remove a user's interest profile (for GDPR compliance).

    Args:
        subscriber_email: Email address of the user

    Returns:
        bool: True if a profile was deleted, False if none matched
    """
    outcome = user_interests_collection.delete_one({'email': subscriber_email})
    removed = outcome.deleted_count
    return removed > 0

View File

@@ -0,0 +1,295 @@
"""
Newsletter Personalization Service for Munich News Daily.
Ranks and selects articles based on user interest profiles.
"""
from typing import Dict, List, Optional
from datetime import datetime, timedelta
from services.interest_profiling_service import get_user_interests
def calculate_article_score(
    article: Dict,
    user_interests: Optional[Dict],
    category_weight: float = 0.4,
    keyword_weight: float = 0.6
) -> float:
    """
    Calculate the personalization score of an article for one user.

    Score is calculated as:
    - Category match: the user's score for the article's category
      (0.0 when the user has none)
    - Keyword match: average of the user's scores for those article
      keywords that appear in the profile (0.0 when none match);
      keywords unknown to the profile do not lower the average
    - Final score: (category_score * category_weight) +
      (keyword_score * keyword_weight), rounded to 3 decimals

    Args:
        article: Article dictionary with 'category' and 'keywords' fields;
            a missing or null 'keywords' is treated as no keywords
        user_interests: User interest profile (None or empty for
            non-personalized)
        category_weight: Weight for category matching (default: 0.4)
        keyword_weight: Weight for keyword matching (default: 0.6)

    Returns:
        float: Personalization score; 0.5 (neutral) when there is no
        profile. With profile scores in 0-1 and the default weights the
        result lies between 0.0 and 1.0.
    """
    # If no user interests, return neutral score
    if not user_interests:
        return 0.5

    article_category = article.get('category', 'general')
    # `or` guards against fields that exist but hold None, which would
    # otherwise raise TypeError when iterated or searched
    article_keywords = article.get('keywords') or []
    user_categories = user_interests.get('categories') or {}
    user_keywords = user_interests.get('keywords') or {}

    category_score = user_categories.get(article_category, 0.0)

    # Only keywords present in the profile take part in the average
    matched_scores = [
        user_keywords[keyword]
        for keyword in article_keywords
        if keyword in user_keywords
    ]
    keyword_score = sum(matched_scores) / len(matched_scores) if matched_scores else 0.0

    # Weighted final score
    final_score = (category_score * category_weight) + (keyword_score * keyword_weight)
    return round(final_score, 3)
def rank_articles_for_user(
    articles: List[Dict],
    subscriber_email: str,
    personalization_ratio: float = 0.7
) -> List[Dict]:
    """
    Order articles by how well they match one user's interests.

    Each article is shallow-copied and the copy receives a
    'personalization_score' field; the input dictionaries are not modified.

    Args:
        articles: List of article dictionaries
        subscriber_email: Email address of the user
        personalization_ratio: Ratio of personalized vs trending
            (default: 0.7); accepted but not used by this function

    Returns:
        list: Scored article copies, highest score first
    """
    interests = get_user_interests(subscriber_email)

    def with_score(article):
        # Score first, then attach it to a copy so the caller's dict stays untouched
        score = calculate_article_score(article, interests)
        scored = article.copy()
        scored['personalization_score'] = score
        return scored

    return sorted(
        (with_score(article) for article in articles),
        key=lambda scored: scored['personalization_score'],
        reverse=True,
    )
def select_personalized_articles(
    articles: List[Dict],
    subscriber_email: str,
    max_articles: int = 10,
    personalization_ratio: float = 0.7,
    min_score_threshold: float = 0.1
) -> List[Dict]:
    """
    Select and rank articles for a personalized newsletter.

    Strategy:
    - Up to max_articles * personalization_ratio slots go to the
      highest-scoring articles that reach min_score_threshold (personalized)
    - All remaining slots are filled with the next articles of the ranking
      (trending/diverse content), so the newsletter is not left short when
      few articles reach the threshold
    - Articles are not re-sorted by recency here; the ranking order of
      rank_articles_for_user is used throughout

    Args:
        articles: List of available articles
        subscriber_email: Email address of the user
        max_articles: Maximum number of articles to include (default: 10)
        personalization_ratio: Ratio of personalized content (default: 0.7)
        min_score_threshold: Minimum score to consider personalized (default: 0.1)

    Returns:
        list: Selected articles with personalization scores, personalized
        ones first; empty when there are no articles or max_articles <= 0
    """
    if not articles or max_articles <= 0:
        return []

    # Rank all articles
    ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)

    # Clamp the quota so an out-of-range ratio cannot make it negative or
    # larger than the newsletter itself
    num_personalized = min(max(int(max_articles * personalization_ratio), 0), max_articles)

    # Split the ranking by position rather than by '_id': articles without
    # an '_id' would otherwise all look like duplicates of each other
    personalized = []
    leftovers = []
    for article in ranked_articles:
        qualifies = article['personalization_score'] >= min_score_threshold
        if qualifies and len(personalized) < num_personalized:
            personalized.append(article)
        else:
            leftovers.append(article)

    # Still drop leftovers that repeat the '_id' of a personalized pick
    personalized_ids = {
        article['_id'] for article in personalized
        if article.get('_id') is not None
    }

    # Fill whatever room is left, not just the fixed trending quota
    open_slots = max_articles - len(personalized)
    trending = []
    for article in leftovers:
        if len(trending) >= open_slots:
            break
        article_id = article.get('_id')
        if article_id is not None and article_id in personalized_ids:
            continue
        trending.append(article)

    # Combine: personalized first, then trending
    return personalized + trending
def get_personalization_explanation(
    article: Dict,
    user_interests: Optional[Dict]
) -> Dict[str, object]:
    """
    Generate explanation for why an article was recommended.
    Useful for transparency and debugging.

    Args:
        article: Article dictionary
        user_interests: User interest profile (None or empty when the user
            has no personalization data)

    Returns:
        dict: Explanation containing:
            - score: Overall personalization score
            - category_match: The user's score for the article's category
            - keyword_matches: List of {'keyword', 'score'} dicts for the
              article keywords found in the profile, in article order
            - reason: Human-readable explanation; for a high match it names
              the category only when the user has a score for it and lists
              the two highest-scoring matched keywords
    """
    if not user_interests:
        return {
            'score': 0.5,
            'category_match': 0.0,
            'keyword_matches': [],
            'reason': 'No personalization data available'
        }

    article_category = article.get('category', 'general')
    article_keywords = article.get('keywords', [])
    user_categories = user_interests.get('categories', {})
    user_keywords = user_interests.get('keywords', {})

    # Category match
    category_score = user_categories.get(article_category, 0.0)

    # Keyword matches
    keyword_matches = [
        {'keyword': keyword, 'score': user_keywords[keyword]}
        for keyword in article_keywords
        if keyword in user_keywords
    ]

    # Calculate overall score
    overall_score = calculate_article_score(article, user_interests)

    # Generate reason
    if overall_score >= 0.5:
        # Name the strongest matches, not merely the first two in article order
        strongest = sorted(keyword_matches, key=lambda match: match['score'], reverse=True)[:2]
        topics = ', '.join(match['keyword'] for match in strongest)
        if category_score > 0:
            reason = f"High match with your interests in {article_category}"
            if topics:
                reason += f" and topics like {topics}"
        elif topics:
            # A high score can come from keywords alone; do not claim an
            # interest in a category the profile has no score for
            reason = f"High match with your interests in topics like {topics}"
        else:
            reason = "High match with your interests"
    elif overall_score >= 0.3:
        reason = "Moderate match with your interests"
    else:
        reason = "Trending article for diverse content"

    return {
        'score': overall_score,
        'category_match': category_score,
        'keyword_matches': keyword_matches,
        'reason': reason
    }
def get_personalization_stats(
    selected_articles: List[Dict],
    subscriber_email: str
) -> Dict[str, float]:
    """
    Get statistics about personalization for a newsletter.

    Args:
        selected_articles: Articles selected for the newsletter; an article
            without 'personalization_score' counts as 0.0
        subscriber_email: Email address of the user (not used in the
            calculation)

    Returns:
        dict: Statistics containing:
            - total_articles: Number of articles
            - avg_score: Average personalization score, rounded to 3 decimals
            - highly_personalized: Number of articles with score >= 0.5
            - moderately_personalized: Number with score 0.3-0.5
            - trending: Number with score < 0.3
    """
    if not selected_articles:
        return {
            'total_articles': 0,
            'avg_score': 0.0,
            'highly_personalized': 0,
            'moderately_personalized': 0,
            'trending': 0
        }

    scores = [article.get('personalization_score', 0.0) for article in selected_articles]
    avg_score = sum(scores) / len(scores)

    # Bucket boundaries: [0.5, inf) high, [0.3, 0.5) moderate, below 0.3 trending
    highly_personalized = sum(1 for score in scores if score >= 0.5)
    moderately_personalized = sum(1 for score in scores if 0.3 <= score < 0.5)
    trending = sum(1 for score in scores if score < 0.3)

    return {
        'total_articles': len(selected_articles),
        'avg_score': round(avg_score, 3),
        'highly_personalized': highly_personalized,
        'moderately_personalized': moderately_personalized,
        'trending': trending
    }
def batch_personalize_newsletters(
    articles: List[Dict],
    subscribers: List[str],
    max_articles_per_user: int = 10
) -> Dict[str, List[Dict]]:
    """
    Build the personalized article selection for each subscriber in a list.
    Useful for batch newsletter generation.

    Args:
        articles: List of available articles
        subscribers: List of subscriber email addresses
        max_articles_per_user: Max articles per newsletter (default: 10)

    Returns:
        dict: Mapping of email -> personalized article list
    """
    # One selection per address, evaluated in list order
    return {
        email: select_personalized_articles(
            articles,
            email,
            max_articles=max_articles_per_user
        )
        for email in subscribers
    }

View File

@@ -80,6 +80,9 @@ def create_newsletter_tracking(
link_tracking_map = {}
if article_links:
# Import here to avoid circular dependency
from database import articles_collection
for article in article_links:
article_url = article.get('url')
article_title = article.get('title', '')
@@ -87,13 +90,22 @@ def create_newsletter_tracking(
if article_url:
link_tracking_id = generate_tracking_id()
# Create link click tracking record
# Look up article metadata from database for personalization
article_doc = articles_collection.find_one({'link': article_url})
article_id = str(article_doc['_id']) if article_doc else None
category = article_doc.get('category', 'general') if article_doc else 'general'
keywords = article_doc.get('keywords', []) if article_doc else []
# Create link click tracking record with metadata
link_click_doc = {
'tracking_id': link_tracking_id,
'newsletter_id': newsletter_id,
'subscriber_email': subscriber_email,
'article_url': article_url,
'article_title': article_title,
'article_id': article_id, # NEW: Article database ID
'category': category, # NEW: Article category
'keywords': keywords, # NEW: Article keywords for personalization
'clicked': False,
'clicked_at': None,
'user_agent': None,