This commit is contained in:
2025-11-18 14:45:41 +01:00
parent 2e80d64ff6
commit 84fce9a82c
19 changed files with 2437 additions and 3 deletions

View File

@@ -0,0 +1,323 @@
"""
User Interest Profiling Service for Munich News Daily.
Builds and maintains user interest profiles based on article click behavior.
"""
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from database import link_clicks_collection
from pymongo import MongoClient
from config import Config
# Connect to MongoDB.
# These statements run at import time: importing this module constructs a
# MongoClient from Config.MONGODB_URI and binds the module-level handles below.
# NOTE(review): link_clicks_collection is imported from the `database` module,
# which presumably manages its own connection — confirm that a second client
# here is intentional rather than reusing that one.
client = MongoClient(Config.MONGODB_URI)
# Database selected by the configured name.
db = client[Config.DB_NAME]
# Collection holding one interest-profile document per subscriber; the
# functions in this module look documents up by their 'email' field.
user_interests_collection = db['user_interests']
def update_user_interests(subscriber_email: str, keywords: List[str], category: str) -> Dict:
    """
    Update a user's interest profile after an article click.

    Raises the score of the article's category and of each keyword by 0.1,
    capped at 1.0, bumps the click counter and upserts the profile. A new
    profile is created when the user has none yet.

    Args:
        subscriber_email: Email address of the user
        keywords: Keywords of the clicked article; None, empty lists and
            empty entries are tolerated and skipped
        category: Category of the clicked article; a falsy value is skipped
            (the click itself is still counted)

    Returns:
        dict: Updated user interest profile (includes '_id' when the profile
        already existed in the database)
    """
    score_step = 0.1
    score_cap = 1.0
    current_time = datetime.utcnow()

    # NOTE: this is a read-modify-write sequence (find_one, then update_one);
    # two concurrent clicks by the same user can overwrite each other.
    profile = user_interests_collection.find_one({'email': subscriber_email})
    if not profile:
        profile = {
            'email': subscriber_email,
            'categories': {},
            'keywords': {},
            'total_clicks': 0,
            'last_updated': current_time,
            'created_at': current_time,
        }

    # A stored document may lack these sub-documents or hold null for them;
    # normalise to dicts instead of failing with KeyError/AttributeError.
    category_scores = profile.get('categories') or {}
    keyword_scores = profile.get('keywords') or {}
    profile['categories'] = category_scores
    profile['keywords'] = keyword_scores

    # Round to 3 decimals (the same precision decay_user_interests stores) so
    # repeated +0.1 steps do not accumulate values like 0.30000000000000004.
    if category:
        bumped = category_scores.get(category, 0.0) + score_step
        category_scores[category] = min(round(bumped, 3), score_cap)

    for keyword in keywords or []:
        if not keyword:
            continue  # skip empty keywords
        bumped = keyword_scores.get(keyword, 0.0) + score_step
        keyword_scores[keyword] = min(round(bumped, 3), score_cap)

    profile['total_clicks'] = (profile.get('total_clicks') or 0) + 1
    profile['last_updated'] = current_time

    # '_id' is immutable, so it is left out of the $set payload; the returned
    # profile still carries it when it was loaded from the database.
    fields_to_set = {field: value for field, value in profile.items() if field != '_id'}
    user_interests_collection.update_one(
        {'email': subscriber_email},
        {'$set': fields_to_set},
        upsert=True,
    )
    return profile
def get_user_interests(subscriber_email: str) -> Optional[Dict]:
    """
    Look up the stored interest profile for one subscriber.

    Args:
        subscriber_email: Email address of the user

    Returns:
        dict: The profile document whose 'email' matches, or None when no
        such document exists
    """
    lookup = {'email': subscriber_email}
    return user_interests_collection.find_one(lookup)
def decay_user_interests(decay_factor: float = 0.95, days_threshold: int = 7) -> Dict[str, int]:
    """
    Decay interest scores for users who haven't clicked recently.

    Every profile whose 'last_updated' is older than ``days_threshold`` days
    has its category and keyword scores multiplied by ``decay_factor``.
    Entries whose decayed score falls below 0.05 are removed, the rest are
    rounded to 3 decimals. 'last_updated' is not touched; the time of the
    decay is written to 'last_decayed'.

    Args:
        decay_factor: Multiplier for interest scores (default: 0.95 = 5% decay)
        days_threshold: Only decay profiles older than this many days (default: 7)

    Returns:
        dict: Statistics about the decay operation
            - profiles_decayed: Number of profiles whose update matched a document
            - profiles_checked: Total number of stale profiles examined
    """
    cutoff_date = datetime.utcnow() - timedelta(days=days_threshold)
    stale_profiles = user_interests_collection.find({
        'last_updated': {'$lt': cutoff_date}
    })

    profiles_checked = 0
    profiles_decayed = 0
    for profile in stale_profiles:
        profiles_checked += 1
        # Address the document by _id rather than email: it always exists on a
        # fetched document and targets exactly this one even if several
        # documents happen to share an email.
        result = user_interests_collection.update_one(
            {'_id': profile['_id']},
            {
                '$set': {
                    'categories': _decay_scores(profile.get('categories'), decay_factor),
                    'keywords': _decay_scores(profile.get('keywords'), decay_factor),
                    'last_decayed': datetime.utcnow(),
                }
            },
        )
        # Count only updates that actually found their document (it may have
        # been deleted since the cursor read it).
        profiles_decayed += result.matched_count

    return {
        'profiles_decayed': profiles_decayed,
        'profiles_checked': profiles_checked,
    }


def _decay_scores(scores, decay_factor, min_score=0.05):
    """Return scores scaled by decay_factor, dropping entries below min_score."""
    decayed = {}
    # `scores` may be None when the stored profile lacks the sub-document.
    for name, score in (scores or {}).items():
        new_score = score * decay_factor
        if new_score >= min_score:
            decayed[name] = round(new_score, 3)
    return decayed
def get_top_interests(subscriber_email: str, top_n: int = 10) -> Dict[str, List[tuple]]:
    """
    Return a user's strongest interests, highest score first.

    Args:
        subscriber_email: Email address of the user
        top_n: Number of top interests to return (default: 10)

    Returns:
        dict: Top interests containing:
            - top_categories: List of (category, score) tuples
            - top_keywords: List of (keyword, score) tuples
        Both lists are empty when the user has no profile.
    """
    profile = get_user_interests(subscriber_email)
    if not profile:
        return {'top_categories': [], 'top_keywords': []}

    def _ranked(field):
        # Highest score first; the sort is stable, so ties keep stored order.
        pairs = list(profile.get(field, {}).items())
        pairs.sort(key=lambda pair: pair[1], reverse=True)
        return pairs[:top_n]

    best_categories = _ranked('categories')
    best_keywords = _ranked('keywords')
    return {
        'top_categories': best_categories,
        'top_keywords': best_keywords,
    }
def build_interests_from_history(subscriber_email: str, days_lookback: int = 30) -> Dict:
    """
    Build or rebuild user interest profile from click history.

    Useful for:
        - Initializing profiles for existing users
        - Rebuilding profiles after algorithm changes
        - Backfilling data

    Each recorded click contributes 0.1 to its category and to each of its
    keywords; scores are capped at 1.0. The profile is only written to the
    database when at least one click was found. When a profile already
    exists, its stored 'created_at' is preserved; only a newly inserted
    profile gets the build time as 'created_at'.

    Args:
        subscriber_email: Email address of the user
        days_lookback: Number of days of history to analyze (default: 30)

    Returns:
        dict: Newly built interest profile. Its 'created_at' is the build
        time, which can differ from the stored value for a pre-existing
        profile.
    """
    score_step = 0.1
    score_cap = 1.0
    # One timestamp for the whole build so all derived fields agree.
    now = datetime.utcnow()
    cutoff_date = now - timedelta(days=days_lookback)

    clicks = link_clicks_collection.find({
        'subscriber_email': subscriber_email,
        'clicked': True,
        'clicked_at': {'$gte': cutoff_date}
    })

    # Count occurrences first and derive scores once afterwards; this avoids
    # the float drift of adding 0.1 repeatedly.
    category_clicks = {}
    keyword_clicks = {}
    total_clicks = 0
    for click in clicks:
        # A missing, null or empty category falls back to 'general'.
        category = click.get('category') or 'general'
        category_clicks[category] = category_clicks.get(category, 0) + 1
        # 'keywords' may be absent or null on a click document.
        for keyword in click.get('keywords') or []:
            if keyword:
                keyword_clicks[keyword] = keyword_clicks.get(keyword, 0) + 1
        total_clicks += 1

    def _score(click_count):
        return min(round(click_count * score_step, 3), score_cap)

    profile = {
        'email': subscriber_email,
        'categories': {name: _score(count) for name, count in category_clicks.items()},
        'keywords': {name: _score(count) for name, count in keyword_clicks.items()},
        'total_clicks': total_clicks,
        'last_updated': now,
        'created_at': now,
    }

    if total_clicks > 0:
        # 'created_at' goes through $setOnInsert so a rebuild does not reset
        # the creation date of an existing profile. A field may not appear in
        # both $set and $setOnInsert, hence the filtering.
        fields_to_set = {
            field: value for field, value in profile.items() if field != 'created_at'
        }
        user_interests_collection.update_one(
            {'email': subscriber_email},
            {'$set': fields_to_set, '$setOnInsert': {'created_at': now}},
            upsert=True,
        )
    return profile
def get_interest_statistics() -> Dict:
    """
    Summarise interest data across every stored profile.

    Returns:
        dict: Statistics containing:
            - total_users: Total number of users with profiles
            - avg_clicks_per_user: Average number of clicks per user,
              rounded to 2 decimals
            - most_popular_categories: Up to 10 (category, summed score)
              tuples, highest first
            - most_popular_keywords: Up to 10 (keyword, summed score)
              tuples, highest first
    """
    user_count = user_interests_collection.count_documents({})
    if user_count == 0:
        return {
            'total_users': 0,
            'avg_clicks_per_user': 0,
            'most_popular_categories': [],
            'most_popular_keywords': []
        }

    # Sum 'total_clicks' over all profiles on the server side.
    sum_clicks_stage = {
        '$group': {
            '_id': None,
            'total_clicks': {'$sum': '$total_clicks'}
        }
    }
    grouped = list(user_interests_collection.aggregate([sum_clicks_stage]))
    click_sum = grouped[0]['total_clicks'] if grouped else 0
    mean_clicks = click_sum / user_count if user_count > 0 else 0

    # Add up each category's and keyword's score across all profiles.
    summed = {'categories': {}, 'keywords': {}}
    for document in user_interests_collection.find({}):
        for field in ('categories', 'keywords'):
            bucket = summed[field]
            for name, score in document.get(field, {}).items():
                bucket[name] = bucket.get(name, 0) + score

    def _top_ten(totals):
        return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)[:10]

    return {
        'total_users': user_count,
        'avg_clicks_per_user': round(mean_clicks, 2),
        'most_popular_categories': _top_ten(summed['categories']),
        'most_popular_keywords': _top_ten(summed['keywords'])
    }
def delete_user_interests(subscriber_email: str) -> bool:
    """
    Remove a user's interest profile (for GDPR compliance).

    Args:
        subscriber_email: Email address of the user

    Returns:
        bool: True if a profile was deleted, False if none matched
    """
    selector = {'email': subscriber_email}
    outcome = user_interests_collection.delete_one(selector)
    removed = outcome.deleted_count
    return removed > 0