324 lines
10 KiB
Python
324 lines
10 KiB
Python
"""
|
|
User Interest Profiling Service for Munich News Daily.
|
|
Builds and maintains user interest profiles based on article click behavior.
|
|
"""
|
|
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Optional
|
|
from database import link_clicks_collection
|
|
from pymongo import MongoClient
|
|
from config import Config
|
|
|
|
# Module-wide MongoDB handles. The connection URI and database name are read
# from Config; interest profiles are stored in the 'user_interests' collection.
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
user_interests_collection = db.get_collection('user_interests')
|
|
|
|
|
|
def update_user_interests(subscriber_email: str, keywords: List[str], category: str) -> Dict:
    """
    Update user interest profile based on a clicked article.

    Raises the interest score of the article's category and of every
    non-empty keyword by 0.1 (capped at 1.0), increments ``total_clicks``
    and refreshes ``last_updated``. Creates a new profile if the user
    doesn't have one yet.

    Args:
        subscriber_email: Email address of the user
        keywords: List of keywords from the clicked article; ``None`` is
            treated as an empty list
        category: Category of the clicked article; an empty or ``None``
            category is counted as ``'general'``

    Returns:
        dict: Updated user interest profile (carries the stored ``_id``
        when the profile already existed)
    """
    current_time = datetime.utcnow()

    def bump(scores: Dict, name: str) -> None:
        # Round to 3 decimals so repeated +0.1 steps do not drift
        # (0.1 added ten times is 0.9999999999999999 in floating point,
        # which would miss the 1.0 cap).
        scores[name] = min(round(scores.get(name, 0.0) + 0.1, 3), 1.0)

    # Get existing profile or create new one
    profile = user_interests_collection.find_one({'email': subscriber_email})
    if not profile:
        profile = {
            'email': subscriber_email,
            'categories': {},
            'keywords': {},
            'total_clicks': 0,
            'last_updated': current_time,
            'created_at': current_time
        }

    # Tolerate stored documents whose score maps are missing or null.
    for field in ('categories', 'keywords'):
        if not profile.get(field):
            profile[field] = {}

    # Same fallback the history rebuild uses for clicks without a category.
    bump(profile['categories'], category or 'general')

    for keyword in keywords or []:
        if keyword:  # Skip empty keywords
            bump(profile['keywords'], keyword)

    # Update metadata
    profile['total_clicks'] = (profile.get('total_clicks') or 0) + 1
    profile['last_updated'] = current_time

    # `_id` is immutable, so it is kept out of the $set payload; the
    # returned profile still includes it.
    fields_to_set = {key: value for key, value in profile.items() if key != '_id'}
    user_interests_collection.update_one(
        {'email': subscriber_email},
        {'$set': fields_to_set},
        upsert=True
    )

    return profile
|
|
|
|
|
|
def get_user_interests(subscriber_email: str) -> Optional[Dict]:
    """
    Fetch the stored interest profile of a single subscriber.

    Args:
        subscriber_email: Email address of the user

    Returns:
        dict: The profile document matching the email, or None if the
        collection has no such document
    """
    lookup = {'email': subscriber_email}
    stored_profile = user_interests_collection.find_one(lookup)
    return stored_profile
|
|
|
|
|
|
def decay_user_interests(decay_factor: float = 0.95, days_threshold: int = 7,
                         min_score: float = 0.05) -> Dict[str, int]:
    """
    Decay interest scores for users who haven't clicked recently.

    Multiplies every category and keyword score by ``decay_factor``,
    rounds the result to 3 decimals and drops entries that fall below
    ``min_score``. Only profiles whose ``last_updated`` is older than
    ``days_threshold`` days are touched; ``last_updated`` itself is not
    modified, ``last_decayed`` is set to the time of the write.

    Args:
        decay_factor: Multiplier for interest scores (default: 0.95 = 5% decay)
        days_threshold: Only decay profiles older than this many days (default: 7)
        min_score: Scores below this value after decay are removed (default: 0.05)

    Returns:
        dict: Statistics about the decay operation
            - profiles_decayed: Number of profiles actually written
            - profiles_checked: Total number of stale profiles read
    """
    cutoff_date = datetime.utcnow() - timedelta(days=days_threshold)

    def decay(scores: Optional[Dict]) -> Dict:
        # A missing or null score map decays to an empty one.
        decayed = {}
        for name, score in (scores or {}).items():
            new_score = score * decay_factor
            if new_score >= min_score:
                decayed[name] = round(new_score, 3)
        return decayed

    # Only the score maps are needed; `_id` is returned by default.
    old_profiles = user_interests_collection.find(
        {'last_updated': {'$lt': cutoff_date}},
        {'categories': 1, 'keywords': 1}
    )

    profiles_decayed = 0
    profiles_checked = 0

    for profile in old_profiles:
        profiles_checked += 1

        # Address the document by its own `_id` so the write always lands on
        # the document that was read, even if several share an email.
        result = user_interests_collection.update_one(
            {'_id': profile['_id']},
            {
                '$set': {
                    'categories': decay(profile.get('categories')),
                    'keywords': decay(profile.get('keywords')),
                    'last_decayed': datetime.utcnow()
                }
            }
        )

        # Count only writes that took effect (the profile may have been
        # deleted between the read and the update).
        profiles_decayed += result.modified_count

    return {
        'profiles_decayed': profiles_decayed,
        'profiles_checked': profiles_checked
    }
|
|
|
|
|
|
def get_top_interests(subscriber_email: str, top_n: int = 10) -> Dict[str, List[tuple]]:
    """
    Return a user's highest-scoring categories and keywords.

    Args:
        subscriber_email: Email address of the user
        top_n: Number of top interests to return (default: 10)

    Returns:
        dict: Top interests containing:
            - top_categories: List of (category, score) tuples, best first
            - top_keywords: List of (keyword, score) tuples, best first
        Both lists are empty when the user has no profile.
    """
    profile = get_user_interests(subscriber_email)

    # No profile: nothing to rank.
    if not profile:
        return {'top_categories': [], 'top_keywords': []}

    def ranked(scores):
        # Highest score first, truncated to the requested length.
        ordered = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return ordered[:top_n]

    best_categories = ranked(profile.get('categories', {}))
    best_keywords = ranked(profile.get('keywords', {}))

    return {
        'top_categories': best_categories,
        'top_keywords': best_keywords
    }
|
|
|
|
|
|
def build_interests_from_history(subscriber_email: str, days_lookback: int = 30) -> Dict:
    """
    Build or rebuild user interest profile from click history.

    Reads the user's clicks from ``link_clicks_collection`` within the
    lookback window; every click adds 0.1 to its category and to each of
    its non-empty keywords, capped at 1.0. The profile is written only
    when at least one click was found.

    Useful for:
    - Initializing profiles for existing users
    - Rebuilding profiles after algorithm changes
    - Backfilling data

    Args:
        subscriber_email: Email address of the user
        days_lookback: Number of days of history to analyze (default: 30)

    Returns:
        dict: Newly built interest profile. Its ``created_at`` is the build
        time; when a profile is already stored, the stored ``created_at``
        is left as it is.
    """
    # One timestamp for the whole build keeps the fields consistent.
    now = datetime.utcnow()
    cutoff_date = now - timedelta(days=days_lookback)

    # Get all clicks from this user in the lookback period
    clicks = link_clicks_collection.find({
        'subscriber_email': subscriber_email,
        'clicked': True,
        'clicked_at': {'$gte': cutoff_date}
    })

    # Count clicks as integers and convert to scores once at the end;
    # summing 0.1 repeatedly drifts (ten additions give 0.9999999999999999).
    category_clicks = {}
    keyword_clicks = {}
    total_clicks = 0

    for click in clicks:
        # Missing, null or empty category counts as 'general'.
        category = click.get('category') or 'general'
        category_clicks[category] = category_clicks.get(category, 0) + 1

        # Missing or null keyword lists are treated as empty.
        for keyword in click.get('keywords') or []:
            if keyword:
                keyword_clicks[keyword] = keyword_clicks.get(keyword, 0) + 1

        total_clicks += 1

    def to_scores(click_counts: Dict) -> Dict:
        # 0.1 per click, capped at 1.0
        return {
            name: min(round(count * 0.1, 3), 1.0)
            for name, count in click_counts.items()
        }

    profile = {
        'email': subscriber_email,
        'categories': to_scores(category_clicks),
        'keywords': to_scores(keyword_clicks),
        'total_clicks': total_clicks,
        'last_updated': now,
        'created_at': now
    }

    # Save profile
    if total_clicks > 0:
        # created_at goes through $setOnInsert so rebuilding an existing
        # profile does not reset its original creation time.
        rebuilt_fields = {
            key: value for key, value in profile.items() if key != 'created_at'
        }
        user_interests_collection.update_one(
            {'email': subscriber_email},
            {
                '$set': rebuilt_fields,
                '$setOnInsert': {'created_at': now}
            },
            upsert=True
        )

    return profile
|
|
|
|
|
|
def get_interest_statistics() -> Dict:
    """
    Summarize interest data over every stored profile.

    Returns:
        dict: Statistics containing:
            - total_users: Total number of users with profiles
            - avg_clicks_per_user: Average number of clicks per user,
              rounded to 2 decimals
            - most_popular_categories: Up to 10 (category, summed score)
              tuples, highest sum first
            - most_popular_keywords: Up to 10 (keyword, summed score)
              tuples, highest sum first
    """
    total_users = user_interests_collection.count_documents({})

    # Empty collection: report zeros without running further queries.
    if total_users == 0:
        return {
            'total_users': 0,
            'avg_clicks_per_user': 0,
            'most_popular_categories': [],
            'most_popular_keywords': []
        }

    # Sum total_clicks over all profiles in a single group.
    sum_clicks_stage = {
        '$group': {
            '_id': None,
            'total_clicks': {'$sum': '$total_clicks'}
        }
    }
    grouped = list(user_interests_collection.aggregate([sum_clicks_stage]))
    total_clicks = grouped[0]['total_clicks'] if grouped else 0
    # total_users is non-zero here because of the guard above.
    avg_clicks = total_clicks / total_users

    # Add up each category's and keyword's scores across users.
    category_totals = {}
    keyword_totals = {}
    for profile in user_interests_collection.find({}):
        for field, totals in (('categories', category_totals),
                              ('keywords', keyword_totals)):
            for name, score in profile.get(field, {}).items():
                totals[name] = totals.get(name, 0) + score

    def top_ten(totals):
        # Largest summed score first.
        return sorted(totals.items(), key=lambda entry: entry[1], reverse=True)[:10]

    popular_categories = top_ten(category_totals)
    popular_keywords = top_ten(keyword_totals)

    return {
        'total_users': total_users,
        'avg_clicks_per_user': round(avg_clicks, 2),
        'most_popular_categories': popular_categories,
        'most_popular_keywords': popular_keywords
    }
|
|
|
|
|
|
def delete_user_interests(subscriber_email: str) -> bool:
    """
    Remove a user's interest profile (for GDPR compliance).

    Args:
        subscriber_email: Email address of the user

    Returns:
        bool: True if a profile document was deleted, False if none matched
    """
    outcome = user_interests_collection.delete_one({'email': subscriber_email})
    removed_any = outcome.deleted_count > 0
    return removed_any
|