update
This commit is contained in:
9
.env.local
Normal file
9
.env.local
Normal file
@@ -0,0 +1,9 @@
|
||||
# Munich News Daily - Local Development Environment Variables
|
||||
|
||||
# MongoDB Configuration
|
||||
MONGO_USERNAME=admin
|
||||
MONGO_PASSWORD=local123
|
||||
MONGO_AUTH=--auth
|
||||
|
||||
# Ollama Model (use smaller/faster model for local dev)
|
||||
OLLAMA_MODEL=phi3:latest
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -84,7 +84,9 @@ yarn.lock
|
||||
.env.production.local
|
||||
*.env
|
||||
!.env.example
|
||||
!.env.local
|
||||
!backend/.env.example
|
||||
!backend/.env.local
|
||||
|
||||
# ===================================
|
||||
# Database
|
||||
|
||||
@@ -7,6 +7,7 @@ A fully automated news aggregation and newsletter system that crawls Munich news
|
||||
- **🤖 AI-Powered Clustering** - Automatically detects duplicate stories from different sources
|
||||
- **📰 Neutral Summaries** - Combines multiple perspectives into balanced coverage
|
||||
- **🎯 Smart Prioritization** - Shows most important stories first (multi-source coverage)
|
||||
- **🎨 Personalized Newsletters** - AI-powered content recommendations based on user interests
|
||||
- **📊 Engagement Tracking** - Open rates, click tracking, and analytics
|
||||
- **⚡ GPU Acceleration** - 5-10x faster AI processing with GPU support
|
||||
- **🔒 GDPR Compliant** - Privacy-first with data retention controls
|
||||
@@ -365,6 +366,8 @@ curl -X POST http://localhost:5001/api/tracking/subscriber/user@example.com/opt-
|
||||
|
||||
### Core Features
|
||||
- **[docs/AI_NEWS_AGGREGATION.md](docs/AI_NEWS_AGGREGATION.md)** - AI-powered clustering & neutral summaries
|
||||
- **[docs/PERSONALIZATION.md](docs/PERSONALIZATION.md)** - Personalized newsletter system
|
||||
- **[docs/PERSONALIZATION_COMPLETE.md](docs/PERSONALIZATION_COMPLETE.md)** - Personalization implementation guide
|
||||
- **[docs/FEATURES.md](docs/FEATURES.md)** - Complete feature list
|
||||
- **[docs/API.md](docs/API.md)** - API endpoints reference
|
||||
|
||||
@@ -399,6 +402,9 @@ docker-compose exec sender python tests/sender/test_tracking_integration.py
|
||||
|
||||
# Run backend tests
|
||||
docker-compose exec backend python tests/backend/test_tracking.py
|
||||
|
||||
# Test personalization system (all 4 phases)
|
||||
docker exec munich-news-local-backend python test_personalization_system.py
|
||||
```
|
||||
|
||||
## 🚀 Production Deployment
|
||||
|
||||
30
backend/.env.local
Normal file
30
backend/.env.local
Normal file
@@ -0,0 +1,30 @@
|
||||
# Munich News Daily - Local Development Backend Configuration
|
||||
|
||||
# MongoDB Configuration
|
||||
MONGODB_URI=mongodb://admin:changeme@mongodb:27017/
|
||||
|
||||
# Email Configuration (use test credentials or disable)
|
||||
SMTP_SERVER=localhost
|
||||
SMTP_PORT=587
|
||||
EMAIL_USER=test@localhost
|
||||
EMAIL_PASSWORD=test123
|
||||
|
||||
# Newsletter Settings
|
||||
NEWSLETTER_MAX_ARTICLES=5
|
||||
NEWSLETTER_HOURS_LOOKBACK=24
|
||||
WEBSITE_URL=http://localhost:3000
|
||||
|
||||
# Tracking Configuration
|
||||
TRACKING_ENABLED=true
|
||||
TRACKING_API_URL=http://localhost:5001
|
||||
TRACKING_DATA_RETENTION_DAYS=90
|
||||
|
||||
# Ollama Configuration (AI Summarization)
|
||||
OLLAMA_ENABLED=true
|
||||
OLLAMA_BASE_URL=http://ollama:11434
|
||||
OLLAMA_MODEL=phi3:latest
|
||||
OLLAMA_TIMEOUT=120
|
||||
SUMMARY_MAX_WORDS=150
|
||||
|
||||
# Flask Server Configuration
|
||||
FLASK_PORT=5001
|
||||
@@ -11,6 +11,8 @@ from routes.tracking_routes import tracking_bp
|
||||
from routes.analytics_routes import analytics_bp
|
||||
from routes.admin_routes import admin_bp
|
||||
from routes.transport_routes import transport_bp
|
||||
from routes.interests_routes import interests_bp
|
||||
from routes.personalization_routes import personalization_bp
|
||||
|
||||
# Initialize Flask app
|
||||
app = Flask(__name__)
|
||||
@@ -29,6 +31,8 @@ app.register_blueprint(tracking_bp)
|
||||
app.register_blueprint(analytics_bp)
|
||||
app.register_blueprint(admin_bp)
|
||||
app.register_blueprint(transport_bp)
|
||||
app.register_blueprint(interests_bp)
|
||||
app.register_blueprint(personalization_bp)
|
||||
|
||||
# Health check endpoint
|
||||
@app.route('/health')
|
||||
|
||||
239
backend/routes/interests_routes.py
Normal file
239
backend/routes/interests_routes.py
Normal file
@@ -0,0 +1,239 @@
|
||||
"""
|
||||
User Interest Profile API routes for Munich News Daily.
|
||||
Provides endpoints to view and manage user interest profiles.
|
||||
"""
|
||||
|
||||
from flask import Blueprint, request, jsonify
|
||||
from services.interest_profiling_service import (
|
||||
get_user_interests,
|
||||
get_top_interests,
|
||||
build_interests_from_history,
|
||||
decay_user_interests,
|
||||
get_interest_statistics,
|
||||
delete_user_interests
|
||||
)
|
||||
|
||||
interests_bp = Blueprint('interests', __name__)
|
||||
|
||||
|
||||
@interests_bp.route('/api/interests/<email>', methods=['GET'])
|
||||
def get_interests(email):
|
||||
"""
|
||||
Get user interest profile.
|
||||
|
||||
Args:
|
||||
email: Email address of the user
|
||||
|
||||
Returns:
|
||||
JSON response with user interest profile
|
||||
"""
|
||||
try:
|
||||
profile = get_user_interests(email)
|
||||
|
||||
if not profile:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': 'User profile not found'
|
||||
}), 404
|
||||
|
||||
# Remove MongoDB _id field
|
||||
if '_id' in profile:
|
||||
del profile['_id']
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'profile': profile
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
|
||||
|
||||
@interests_bp.route('/api/interests/<email>/top', methods=['GET'])
|
||||
def get_top_user_interests(email):
|
||||
"""
|
||||
Get user's top interests sorted by score.
|
||||
|
||||
Query parameters:
|
||||
top_n: Number of top interests to return (default: 10)
|
||||
|
||||
Args:
|
||||
email: Email address of the user
|
||||
|
||||
Returns:
|
||||
JSON response with top categories and keywords
|
||||
"""
|
||||
try:
|
||||
top_n = request.args.get('top_n', 10, type=int)
|
||||
|
||||
top_interests = get_top_interests(email, top_n)
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'email': email,
|
||||
'top_categories': [
|
||||
{'category': cat, 'score': score}
|
||||
for cat, score in top_interests['top_categories']
|
||||
],
|
||||
'top_keywords': [
|
||||
{'keyword': kw, 'score': score}
|
||||
for kw, score in top_interests['top_keywords']
|
||||
]
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
|
||||
|
||||
@interests_bp.route('/api/interests/<email>/rebuild', methods=['POST'])
|
||||
def rebuild_interests(email):
|
||||
"""
|
||||
Rebuild user interest profile from click history.
|
||||
|
||||
Request body (optional):
|
||||
{
|
||||
"days_lookback": 30 // Number of days of history to analyze
|
||||
}
|
||||
|
||||
Args:
|
||||
email: Email address of the user
|
||||
|
||||
Returns:
|
||||
JSON response with rebuilt profile
|
||||
"""
|
||||
try:
|
||||
data = request.get_json() or {}
|
||||
days_lookback = data.get('days_lookback', 30)
|
||||
|
||||
# Validate days_lookback
|
||||
if not isinstance(days_lookback, int) or days_lookback < 1:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': 'days_lookback must be a positive integer'
|
||||
}), 400
|
||||
|
||||
profile = build_interests_from_history(email, days_lookback)
|
||||
|
||||
# Remove MongoDB _id field
|
||||
if '_id' in profile:
|
||||
del profile['_id']
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'message': f'Profile rebuilt from {days_lookback} days of history',
|
||||
'profile': profile
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
|
||||
|
||||
@interests_bp.route('/api/interests/decay', methods=['POST'])
|
||||
def decay_interests():
|
||||
"""
|
||||
Decay interest scores for inactive users.
|
||||
|
||||
Request body (optional):
|
||||
{
|
||||
"decay_factor": 0.95, // Multiplier for scores (default: 0.95)
|
||||
"days_threshold": 7 // Only decay profiles older than N days
|
||||
}
|
||||
|
||||
Returns:
|
||||
JSON response with decay statistics
|
||||
"""
|
||||
try:
|
||||
data = request.get_json() or {}
|
||||
decay_factor = data.get('decay_factor', 0.95)
|
||||
days_threshold = data.get('days_threshold', 7)
|
||||
|
||||
# Validate parameters
|
||||
if not isinstance(decay_factor, (int, float)) or decay_factor <= 0 or decay_factor > 1:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': 'decay_factor must be between 0 and 1'
|
||||
}), 400
|
||||
|
||||
if not isinstance(days_threshold, int) or days_threshold < 1:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': 'days_threshold must be a positive integer'
|
||||
}), 400
|
||||
|
||||
result = decay_user_interests(decay_factor, days_threshold)
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'message': f'Decayed interests for profiles older than {days_threshold} days',
|
||||
'statistics': result
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
|
||||
|
||||
@interests_bp.route('/api/interests/statistics', methods=['GET'])
|
||||
def get_statistics():
|
||||
"""
|
||||
Get statistics about user interests across all users.
|
||||
|
||||
Returns:
|
||||
JSON response with interest statistics
|
||||
"""
|
||||
try:
|
||||
stats = get_interest_statistics()
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'statistics': stats
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
|
||||
|
||||
@interests_bp.route('/api/interests/<email>', methods=['DELETE'])
|
||||
def delete_interests(email):
|
||||
"""
|
||||
Delete user interest profile (GDPR compliance).
|
||||
|
||||
Args:
|
||||
email: Email address of the user
|
||||
|
||||
Returns:
|
||||
JSON response with confirmation
|
||||
"""
|
||||
try:
|
||||
deleted = delete_user_interests(email)
|
||||
|
||||
if not deleted:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': 'User profile not found'
|
||||
}), 404
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'message': f'Interest profile deleted for {email}'
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
135
backend/routes/personalization_routes.py
Normal file
135
backend/routes/personalization_routes.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""
|
||||
Personalization API routes for Munich News Daily.
|
||||
Provides endpoints to test and preview personalized content.
|
||||
"""
|
||||
|
||||
from flask import Blueprint, request, jsonify
|
||||
from datetime import datetime, timedelta
|
||||
from database import articles_collection
|
||||
from services.personalization_service import (
|
||||
rank_articles_for_user,
|
||||
select_personalized_articles,
|
||||
get_personalization_explanation,
|
||||
get_personalization_stats
|
||||
)
|
||||
|
||||
personalization_bp = Blueprint('personalization', __name__)
|
||||
|
||||
|
||||
@personalization_bp.route('/api/personalize/preview/<email>', methods=['GET'])
|
||||
def preview_personalized_newsletter(email):
|
||||
"""
|
||||
Preview personalized newsletter for a user.
|
||||
|
||||
Query parameters:
|
||||
max_articles: Maximum articles to return (default: 10)
|
||||
hours_lookback: Hours of articles to consider (default: 24)
|
||||
|
||||
Returns:
|
||||
JSON with personalized article selection and statistics
|
||||
"""
|
||||
try:
|
||||
max_articles = request.args.get('max_articles', 10, type=int)
|
||||
hours_lookback = request.args.get('hours_lookback', 24, type=int)
|
||||
|
||||
# Get recent articles
|
||||
cutoff_date = datetime.utcnow() - timedelta(hours=hours_lookback)
|
||||
articles = list(articles_collection.find({
|
||||
'created_at': {'$gte': cutoff_date},
|
||||
'summary': {'$exists': True, '$ne': None}
|
||||
}).sort('created_at', -1))
|
||||
|
||||
|
||||
# Select personalized articles
|
||||
personalized = select_personalized_articles(
|
||||
articles,
|
||||
email,
|
||||
max_articles=max_articles
|
||||
)
|
||||
|
||||
# Get statistics
|
||||
stats = get_personalization_stats(personalized, email)
|
||||
|
||||
# Format response
|
||||
articles_response = []
|
||||
for article in personalized:
|
||||
articles_response.append({
|
||||
'title': article.get('title', ''),
|
||||
'title_en': article.get('title_en'),
|
||||
'summary': article.get('summary', ''),
|
||||
'link': article.get('link', ''),
|
||||
'category': article.get('category', 'general'),
|
||||
'keywords': article.get('keywords', []),
|
||||
'personalization_score': article.get('personalization_score', 0.0),
|
||||
'published_at': article.get('published_at', '')
|
||||
})
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'email': email,
|
||||
'articles': articles_response,
|
||||
'statistics': stats
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
|
||||
|
||||
@personalization_bp.route('/api/personalize/explain', methods=['POST'])
|
||||
def explain_recommendation():
|
||||
"""
|
||||
Explain why an article was recommended to a user.
|
||||
|
||||
Request body:
|
||||
{
|
||||
"email": "user@example.com",
|
||||
"article_id": "article-id-here"
|
||||
}
|
||||
|
||||
Returns:
|
||||
JSON with explanation of recommendation
|
||||
"""
|
||||
try:
|
||||
data = request.get_json()
|
||||
|
||||
if not data or 'email' not in data or 'article_id' not in data:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': 'email and article_id required'
|
||||
}), 400
|
||||
|
||||
email = data['email']
|
||||
article_id = data['article_id']
|
||||
|
||||
# Get article
|
||||
from bson import ObjectId
|
||||
article = articles_collection.find_one({'_id': ObjectId(article_id)})
|
||||
|
||||
if not article:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': 'Article not found'
|
||||
}), 404
|
||||
|
||||
# Get user interests
|
||||
from services.interest_profiling_service import get_user_interests
|
||||
user_interests = get_user_interests(email)
|
||||
|
||||
# Generate explanation
|
||||
explanation = get_personalization_explanation(article, user_interests)
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'email': email,
|
||||
'article_title': article.get('title', ''),
|
||||
'explanation': explanation
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}), 500
|
||||
@@ -79,8 +79,8 @@ def track_click(tracking_id):
|
||||
"""
|
||||
Track link clicks and redirect to original article URL.
|
||||
|
||||
Logs the click event and redirects the user to the original article URL.
|
||||
Handles invalid tracking_id by redirecting to homepage.
|
||||
Logs the click event, updates user interest profile, and redirects the user
|
||||
to the original article URL. Handles invalid tracking_id by redirecting to homepage.
|
||||
Ensures redirect completes within 200ms.
|
||||
|
||||
Args:
|
||||
@@ -115,6 +115,19 @@ def track_click(tracking_id):
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
# Update user interest profile (Phase 3)
|
||||
subscriber_email = tracking_record.get('subscriber_email')
|
||||
keywords = tracking_record.get('keywords', [])
|
||||
category = tracking_record.get('category', 'general')
|
||||
|
||||
if subscriber_email and subscriber_email != 'anonymized':
|
||||
try:
|
||||
from services.interest_profiling_service import update_user_interests
|
||||
update_user_interests(subscriber_email, keywords, category)
|
||||
except Exception as e:
|
||||
# Don't fail the redirect if interest update fails
|
||||
print(f"Error updating user interests: {str(e)}")
|
||||
except Exception as e:
|
||||
# Log error but still redirect
|
||||
print(f"Error tracking click for {tracking_id}: {str(e)}")
|
||||
|
||||
323
backend/services/interest_profiling_service.py
Normal file
323
backend/services/interest_profiling_service.py
Normal file
@@ -0,0 +1,323 @@
|
||||
"""
|
||||
User Interest Profiling Service for Munich News Daily.
|
||||
Builds and maintains user interest profiles based on article click behavior.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional
|
||||
from database import link_clicks_collection
|
||||
from pymongo import MongoClient
|
||||
from config import Config
|
||||
|
||||
# Connect to MongoDB
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
user_interests_collection = db['user_interests']
|
||||
|
||||
|
||||
def update_user_interests(subscriber_email: str, keywords: List[str], category: str) -> Dict:
|
||||
"""
|
||||
Update user interest profile based on a clicked article.
|
||||
|
||||
Increments interest scores for the article's keywords and category.
|
||||
Creates a new profile if the user doesn't have one yet.
|
||||
|
||||
Args:
|
||||
subscriber_email: Email address of the user
|
||||
keywords: List of keywords from the clicked article
|
||||
category: Category of the clicked article
|
||||
|
||||
Returns:
|
||||
dict: Updated user interest profile
|
||||
"""
|
||||
current_time = datetime.utcnow()
|
||||
|
||||
# Get existing profile or create new one
|
||||
profile = user_interests_collection.find_one({'email': subscriber_email})
|
||||
|
||||
if not profile:
|
||||
# Create new profile
|
||||
profile = {
|
||||
'email': subscriber_email,
|
||||
'categories': {},
|
||||
'keywords': {},
|
||||
'total_clicks': 0,
|
||||
'last_updated': current_time,
|
||||
'created_at': current_time
|
||||
}
|
||||
|
||||
# Update category interest (increment by 0.1, max 1.0)
|
||||
current_category_score = profile['categories'].get(category, 0.0)
|
||||
profile['categories'][category] = min(current_category_score + 0.1, 1.0)
|
||||
|
||||
# Update keyword interests (increment by 0.1, max 1.0)
|
||||
for keyword in keywords:
|
||||
if keyword: # Skip empty keywords
|
||||
current_keyword_score = profile['keywords'].get(keyword, 0.0)
|
||||
profile['keywords'][keyword] = min(current_keyword_score + 0.1, 1.0)
|
||||
|
||||
# Update metadata
|
||||
profile['total_clicks'] = profile.get('total_clicks', 0) + 1
|
||||
profile['last_updated'] = current_time
|
||||
|
||||
# Upsert profile
|
||||
user_interests_collection.update_one(
|
||||
{'email': subscriber_email},
|
||||
{'$set': profile},
|
||||
upsert=True
|
||||
)
|
||||
|
||||
return profile
|
||||
|
||||
|
||||
def get_user_interests(subscriber_email: str) -> Optional[Dict]:
|
||||
"""
|
||||
Get user interest profile.
|
||||
|
||||
Args:
|
||||
subscriber_email: Email address of the user
|
||||
|
||||
Returns:
|
||||
dict: User interest profile or None if not found
|
||||
"""
|
||||
return user_interests_collection.find_one({'email': subscriber_email})
|
||||
|
||||
|
||||
def decay_user_interests(decay_factor: float = 0.95, days_threshold: int = 7) -> Dict[str, int]:
|
||||
"""
|
||||
Decay interest scores for users who haven't clicked recently.
|
||||
|
||||
Reduces interest scores over time to reflect changing interests.
|
||||
Only decays profiles that haven't been updated in the last N days.
|
||||
|
||||
Args:
|
||||
decay_factor: Multiplier for interest scores (default: 0.95 = 5% decay)
|
||||
days_threshold: Only decay profiles older than this many days (default: 7)
|
||||
|
||||
Returns:
|
||||
dict: Statistics about the decay operation
|
||||
- profiles_decayed: Number of profiles that were decayed
|
||||
- profiles_checked: Total number of profiles checked
|
||||
"""
|
||||
cutoff_date = datetime.utcnow() - timedelta(days=days_threshold)
|
||||
|
||||
# Find profiles that haven't been updated recently
|
||||
old_profiles = user_interests_collection.find({
|
||||
'last_updated': {'$lt': cutoff_date}
|
||||
})
|
||||
|
||||
profiles_decayed = 0
|
||||
profiles_checked = 0
|
||||
|
||||
for profile in old_profiles:
|
||||
profiles_checked += 1
|
||||
|
||||
# Decay category scores
|
||||
decayed_categories = {}
|
||||
for category, score in profile.get('categories', {}).items():
|
||||
new_score = score * decay_factor
|
||||
# Remove categories with very low scores (< 0.05)
|
||||
if new_score >= 0.05:
|
||||
decayed_categories[category] = round(new_score, 3)
|
||||
|
||||
# Decay keyword scores
|
||||
decayed_keywords = {}
|
||||
for keyword, score in profile.get('keywords', {}).items():
|
||||
new_score = score * decay_factor
|
||||
# Remove keywords with very low scores (< 0.05)
|
||||
if new_score >= 0.05:
|
||||
decayed_keywords[keyword] = round(new_score, 3)
|
||||
|
||||
# Update profile with decayed scores
|
||||
user_interests_collection.update_one(
|
||||
{'email': profile['email']},
|
||||
{
|
||||
'$set': {
|
||||
'categories': decayed_categories,
|
||||
'keywords': decayed_keywords,
|
||||
'last_decayed': datetime.utcnow()
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
profiles_decayed += 1
|
||||
|
||||
return {
|
||||
'profiles_decayed': profiles_decayed,
|
||||
'profiles_checked': profiles_checked
|
||||
}
|
||||
|
||||
|
||||
def get_top_interests(subscriber_email: str, top_n: int = 10) -> Dict[str, List[tuple]]:
|
||||
"""
|
||||
Get user's top interests sorted by score.
|
||||
|
||||
Args:
|
||||
subscriber_email: Email address of the user
|
||||
top_n: Number of top interests to return (default: 10)
|
||||
|
||||
Returns:
|
||||
dict: Top interests containing:
|
||||
- top_categories: List of (category, score) tuples
|
||||
- top_keywords: List of (keyword, score) tuples
|
||||
"""
|
||||
profile = get_user_interests(subscriber_email)
|
||||
|
||||
if not profile:
|
||||
return {
|
||||
'top_categories': [],
|
||||
'top_keywords': []
|
||||
}
|
||||
|
||||
# Sort categories by score
|
||||
categories = profile.get('categories', {})
|
||||
top_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)[:top_n]
|
||||
|
||||
# Sort keywords by score
|
||||
keywords = profile.get('keywords', {})
|
||||
top_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:top_n]
|
||||
|
||||
return {
|
||||
'top_categories': top_categories,
|
||||
'top_keywords': top_keywords
|
||||
}
|
||||
|
||||
|
||||
def build_interests_from_history(subscriber_email: str, days_lookback: int = 30) -> Dict:
|
||||
"""
|
||||
Build or rebuild user interest profile from click history.
|
||||
|
||||
Useful for:
|
||||
- Initializing profiles for existing users
|
||||
- Rebuilding profiles after algorithm changes
|
||||
- Backfilling data
|
||||
|
||||
Args:
|
||||
subscriber_email: Email address of the user
|
||||
days_lookback: Number of days of history to analyze (default: 30)
|
||||
|
||||
Returns:
|
||||
dict: Newly built interest profile
|
||||
"""
|
||||
cutoff_date = datetime.utcnow() - timedelta(days=days_lookback)
|
||||
|
||||
# Get all clicks from this user in the lookback period
|
||||
clicks = link_clicks_collection.find({
|
||||
'subscriber_email': subscriber_email,
|
||||
'clicked': True,
|
||||
'clicked_at': {'$gte': cutoff_date}
|
||||
})
|
||||
|
||||
# Initialize profile
|
||||
profile = {
|
||||
'email': subscriber_email,
|
||||
'categories': {},
|
||||
'keywords': {},
|
||||
'total_clicks': 0,
|
||||
'last_updated': datetime.utcnow(),
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
# Process each click
|
||||
for click in clicks:
|
||||
category = click.get('category', 'general')
|
||||
keywords = click.get('keywords', [])
|
||||
|
||||
# Update category score
|
||||
profile['categories'][category] = profile['categories'].get(category, 0.0) + 0.1
|
||||
|
||||
# Update keyword scores
|
||||
for keyword in keywords:
|
||||
if keyword:
|
||||
profile['keywords'][keyword] = profile['keywords'].get(keyword, 0.0) + 0.1
|
||||
|
||||
profile['total_clicks'] += 1
|
||||
|
||||
# Cap scores at 1.0
|
||||
for category in profile['categories']:
|
||||
profile['categories'][category] = min(profile['categories'][category], 1.0)
|
||||
|
||||
for keyword in profile['keywords']:
|
||||
profile['keywords'][keyword] = min(profile['keywords'][keyword], 1.0)
|
||||
|
||||
# Save profile
|
||||
if profile['total_clicks'] > 0:
|
||||
user_interests_collection.update_one(
|
||||
{'email': subscriber_email},
|
||||
{'$set': profile},
|
||||
upsert=True
|
||||
)
|
||||
|
||||
return profile
|
||||
|
||||
|
||||
def get_interest_statistics() -> Dict:
|
||||
"""
|
||||
Get statistics about user interests across all users.
|
||||
|
||||
Returns:
|
||||
dict: Statistics containing:
|
||||
- total_users: Total number of users with profiles
|
||||
- avg_clicks_per_user: Average number of clicks per user
|
||||
- most_popular_categories: Top categories across all users
|
||||
- most_popular_keywords: Top keywords across all users
|
||||
"""
|
||||
total_users = user_interests_collection.count_documents({})
|
||||
|
||||
if total_users == 0:
|
||||
return {
|
||||
'total_users': 0,
|
||||
'avg_clicks_per_user': 0,
|
||||
'most_popular_categories': [],
|
||||
'most_popular_keywords': []
|
||||
}
|
||||
|
||||
# Calculate average clicks
|
||||
pipeline = [
|
||||
{
|
||||
'$group': {
|
||||
'_id': None,
|
||||
'total_clicks': {'$sum': '$total_clicks'}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
result = list(user_interests_collection.aggregate(pipeline))
|
||||
total_clicks = result[0]['total_clicks'] if result else 0
|
||||
avg_clicks = total_clicks / total_users if total_users > 0 else 0
|
||||
|
||||
# Get most popular categories
|
||||
category_counts = {}
|
||||
keyword_counts = {}
|
||||
|
||||
for profile in user_interests_collection.find({}):
|
||||
for category, score in profile.get('categories', {}).items():
|
||||
category_counts[category] = category_counts.get(category, 0) + score
|
||||
|
||||
for keyword, score in profile.get('keywords', {}).items():
|
||||
keyword_counts[keyword] = keyword_counts.get(keyword, 0) + score
|
||||
|
||||
# Sort and get top 10
|
||||
top_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:10]
|
||||
top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10]
|
||||
|
||||
return {
|
||||
'total_users': total_users,
|
||||
'avg_clicks_per_user': round(avg_clicks, 2),
|
||||
'most_popular_categories': top_categories,
|
||||
'most_popular_keywords': top_keywords
|
||||
}
|
||||
|
||||
|
||||
def delete_user_interests(subscriber_email: str) -> bool:
|
||||
"""
|
||||
Delete user interest profile (for GDPR compliance).
|
||||
|
||||
Args:
|
||||
subscriber_email: Email address of the user
|
||||
|
||||
Returns:
|
||||
bool: True if profile was deleted, False if not found
|
||||
"""
|
||||
result = user_interests_collection.delete_one({'email': subscriber_email})
|
||||
return result.deleted_count > 0
|
||||
295
backend/services/personalization_service.py
Normal file
295
backend/services/personalization_service.py
Normal file
@@ -0,0 +1,295 @@
|
||||
"""
|
||||
Newsletter Personalization Service for Munich News Daily.
|
||||
Ranks and selects articles based on user interest profiles.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from services.interest_profiling_service import get_user_interests
|
||||
|
||||
|
||||
def calculate_article_score(
|
||||
article: Dict,
|
||||
user_interests: Optional[Dict],
|
||||
category_weight: float = 0.4,
|
||||
keyword_weight: float = 0.6
|
||||
) -> float:
|
||||
"""
|
||||
Calculate personalization score for an article based on user interests.
|
||||
|
||||
Score is calculated as:
|
||||
- Category match: 0-1.0 based on user's interest in the category
|
||||
- Keyword match: Average of user's interest in article keywords
|
||||
- Final score: (category_score * 0.4) + (keyword_score * 0.6)
|
||||
|
||||
Args:
|
||||
article: Article dictionary with 'category' and 'keywords' fields
|
||||
user_interests: User interest profile (None for non-personalized)
|
||||
category_weight: Weight for category matching (default: 0.4)
|
||||
keyword_weight: Weight for keyword matching (default: 0.6)
|
||||
|
||||
Returns:
|
||||
float: Personalization score between 0.0 and 1.0
|
||||
"""
|
||||
# If no user interests, return neutral score
|
||||
if not user_interests:
|
||||
return 0.5
|
||||
|
||||
# Get article metadata
|
||||
article_category = article.get('category', 'general')
|
||||
article_keywords = article.get('keywords', [])
|
||||
|
||||
# Calculate category score
|
||||
user_categories = user_interests.get('categories', {})
|
||||
category_score = user_categories.get(article_category, 0.0)
|
||||
|
||||
# Calculate keyword score (average of all matching keywords)
|
||||
user_keywords = user_interests.get('keywords', {})
|
||||
keyword_scores = []
|
||||
|
||||
for keyword in article_keywords:
|
||||
if keyword in user_keywords:
|
||||
keyword_scores.append(user_keywords[keyword])
|
||||
|
||||
# Average keyword score (0.0 if no matches)
|
||||
keyword_score = sum(keyword_scores) / len(keyword_scores) if keyword_scores else 0.0
|
||||
|
||||
# Weighted final score
|
||||
final_score = (category_score * category_weight) + (keyword_score * keyword_weight)
|
||||
|
||||
return round(final_score, 3)
|
||||
|
||||
|
||||
def rank_articles_for_user(
|
||||
articles: List[Dict],
|
||||
subscriber_email: str,
|
||||
personalization_ratio: float = 0.7
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Rank articles for a specific user based on their interests.
|
||||
|
||||
Mixes personalized content with trending content to avoid filter bubbles.
|
||||
|
||||
Args:
|
||||
articles: List of article dictionaries
|
||||
subscriber_email: Email address of the user
|
||||
personalization_ratio: Ratio of personalized vs trending (default: 0.7 = 70% personalized)
|
||||
|
||||
Returns:
|
||||
list: Articles sorted by personalization score with score added
|
||||
"""
|
||||
# Get user interests
|
||||
user_interests = get_user_interests(subscriber_email)
|
||||
|
||||
# Calculate score for each article
|
||||
scored_articles = []
|
||||
for article in articles:
|
||||
score = calculate_article_score(article, user_interests)
|
||||
|
||||
# Add score to article (don't modify original)
|
||||
article_with_score = article.copy()
|
||||
article_with_score['personalization_score'] = score
|
||||
scored_articles.append(article_with_score)
|
||||
|
||||
# Sort by score (highest first)
|
||||
scored_articles.sort(key=lambda x: x['personalization_score'], reverse=True)
|
||||
|
||||
return scored_articles
|
||||
|
||||
|
||||
def select_personalized_articles(
|
||||
articles: List[Dict],
|
||||
subscriber_email: str,
|
||||
max_articles: int = 10,
|
||||
personalization_ratio: float = 0.7,
|
||||
min_score_threshold: float = 0.1
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Select and rank articles for a personalized newsletter.
|
||||
|
||||
Strategy:
|
||||
- Top N * personalization_ratio articles: Highest scoring (personalized)
|
||||
- Remaining articles: Most recent (trending/diverse content)
|
||||
- Ensures mix of personalized + diverse content
|
||||
|
||||
Args:
|
||||
articles: List of available articles
|
||||
subscriber_email: Email address of the user
|
||||
max_articles: Maximum number of articles to include (default: 10)
|
||||
personalization_ratio: Ratio of personalized content (default: 0.7)
|
||||
min_score_threshold: Minimum score to consider personalized (default: 0.1)
|
||||
|
||||
Returns:
|
||||
list: Selected articles with personalization scores
|
||||
"""
|
||||
if not articles:
|
||||
return []
|
||||
|
||||
# Rank all articles
|
||||
ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)
|
||||
|
||||
# Calculate split
|
||||
num_personalized = int(max_articles * personalization_ratio)
|
||||
num_trending = max_articles - num_personalized
|
||||
|
||||
# Get personalized articles (high scoring)
|
||||
personalized = [
|
||||
a for a in ranked_articles
|
||||
if a['personalization_score'] >= min_score_threshold
|
||||
][:num_personalized]
|
||||
|
||||
# Get trending articles (most recent, not already selected)
|
||||
personalized_ids = {a.get('_id') for a in personalized}
|
||||
trending = [
|
||||
a for a in ranked_articles
|
||||
if a.get('_id') not in personalized_ids
|
||||
][:num_trending]
|
||||
|
||||
# Combine: personalized first, then trending
|
||||
selected = personalized + trending
|
||||
|
||||
# Ensure we don't exceed max_articles
|
||||
return selected[:max_articles]
|
||||
|
||||
|
||||
def get_personalization_explanation(
|
||||
article: Dict,
|
||||
user_interests: Optional[Dict]
|
||||
) -> Dict[str, any]:
|
||||
"""
|
||||
Generate explanation for why an article was recommended.
|
||||
|
||||
Useful for transparency and debugging.
|
||||
|
||||
Args:
|
||||
article: Article dictionary
|
||||
user_interests: User interest profile
|
||||
|
||||
Returns:
|
||||
dict: Explanation containing:
|
||||
- score: Overall personalization score
|
||||
- category_match: Category score
|
||||
- keyword_matches: List of matching keywords with scores
|
||||
- reason: Human-readable explanation
|
||||
"""
|
||||
if not user_interests:
|
||||
return {
|
||||
'score': 0.5,
|
||||
'category_match': 0.0,
|
||||
'keyword_matches': [],
|
||||
'reason': 'No personalization data available'
|
||||
}
|
||||
|
||||
article_category = article.get('category', 'general')
|
||||
article_keywords = article.get('keywords', [])
|
||||
|
||||
user_categories = user_interests.get('categories', {})
|
||||
user_keywords = user_interests.get('keywords', {})
|
||||
|
||||
# Category match
|
||||
category_score = user_categories.get(article_category, 0.0)
|
||||
|
||||
# Keyword matches
|
||||
keyword_matches = []
|
||||
for keyword in article_keywords:
|
||||
if keyword in user_keywords:
|
||||
keyword_matches.append({
|
||||
'keyword': keyword,
|
||||
'score': user_keywords[keyword]
|
||||
})
|
||||
|
||||
# Calculate overall score
|
||||
overall_score = calculate_article_score(article, user_interests)
|
||||
|
||||
# Generate reason
|
||||
if overall_score >= 0.5:
|
||||
reason = f"High match with your interests in {article_category}"
|
||||
if keyword_matches:
|
||||
top_keywords = [m['keyword'] for m in keyword_matches[:2]]
|
||||
reason += f" and topics like {', '.join(top_keywords)}"
|
||||
elif overall_score >= 0.3:
|
||||
reason = f"Moderate match with your interests"
|
||||
else:
|
||||
reason = "Trending article for diverse content"
|
||||
|
||||
return {
|
||||
'score': overall_score,
|
||||
'category_match': category_score,
|
||||
'keyword_matches': keyword_matches,
|
||||
'reason': reason
|
||||
}
|
||||
|
||||
|
||||
def get_personalization_stats(
|
||||
selected_articles: List[Dict],
|
||||
subscriber_email: str
|
||||
) -> Dict[str, any]:
|
||||
"""
|
||||
Get statistics about personalization for a newsletter.
|
||||
|
||||
Args:
|
||||
selected_articles: Articles selected for the newsletter
|
||||
subscriber_email: Email address of the user
|
||||
|
||||
Returns:
|
||||
dict: Statistics containing:
|
||||
- total_articles: Number of articles
|
||||
- avg_score: Average personalization score
|
||||
- highly_personalized: Number of articles with score >= 0.5
|
||||
- moderately_personalized: Number with score 0.3-0.5
|
||||
- trending: Number with score < 0.3
|
||||
"""
|
||||
if not selected_articles:
|
||||
return {
|
||||
'total_articles': 0,
|
||||
'avg_score': 0.0,
|
||||
'highly_personalized': 0,
|
||||
'moderately_personalized': 0,
|
||||
'trending': 0
|
||||
}
|
||||
|
||||
scores = [a.get('personalization_score', 0.0) for a in selected_articles]
|
||||
avg_score = sum(scores) / len(scores)
|
||||
|
||||
highly_personalized = sum(1 for s in scores if s >= 0.5)
|
||||
moderately_personalized = sum(1 for s in scores if 0.3 <= s < 0.5)
|
||||
trending = sum(1 for s in scores if s < 0.3)
|
||||
|
||||
return {
|
||||
'total_articles': len(selected_articles),
|
||||
'avg_score': round(avg_score, 3),
|
||||
'highly_personalized': highly_personalized,
|
||||
'moderately_personalized': moderately_personalized,
|
||||
'trending': trending
|
||||
}
|
||||
|
||||
|
||||
def batch_personalize_newsletters(
|
||||
articles: List[Dict],
|
||||
subscribers: List[str],
|
||||
max_articles_per_user: int = 10
|
||||
) -> Dict[str, List[Dict]]:
|
||||
"""
|
||||
Generate personalized article selections for multiple subscribers.
|
||||
|
||||
Useful for batch newsletter generation.
|
||||
|
||||
Args:
|
||||
articles: List of available articles
|
||||
subscribers: List of subscriber email addresses
|
||||
max_articles_per_user: Max articles per newsletter (default: 10)
|
||||
|
||||
Returns:
|
||||
dict: Mapping of email -> personalized article list
|
||||
"""
|
||||
personalized_newsletters = {}
|
||||
|
||||
for subscriber_email in subscribers:
|
||||
personalized_articles = select_personalized_articles(
|
||||
articles,
|
||||
subscriber_email,
|
||||
max_articles=max_articles_per_user
|
||||
)
|
||||
personalized_newsletters[subscriber_email] = personalized_articles
|
||||
|
||||
return personalized_newsletters
|
||||
@@ -80,6 +80,9 @@ def create_newsletter_tracking(
|
||||
link_tracking_map = {}
|
||||
|
||||
if article_links:
|
||||
# Import here to avoid circular dependency
|
||||
from database import articles_collection
|
||||
|
||||
for article in article_links:
|
||||
article_url = article.get('url')
|
||||
article_title = article.get('title', '')
|
||||
@@ -87,13 +90,22 @@ def create_newsletter_tracking(
|
||||
if article_url:
|
||||
link_tracking_id = generate_tracking_id()
|
||||
|
||||
# Create link click tracking record
|
||||
# Look up article metadata from database for personalization
|
||||
article_doc = articles_collection.find_one({'link': article_url})
|
||||
article_id = str(article_doc['_id']) if article_doc else None
|
||||
category = article_doc.get('category', 'general') if article_doc else 'general'
|
||||
keywords = article_doc.get('keywords', []) if article_doc else []
|
||||
|
||||
# Create link click tracking record with metadata
|
||||
link_click_doc = {
|
||||
'tracking_id': link_tracking_id,
|
||||
'newsletter_id': newsletter_id,
|
||||
'subscriber_email': subscriber_email,
|
||||
'article_url': article_url,
|
||||
'article_title': article_title,
|
||||
'article_id': article_id, # NEW: Article database ID
|
||||
'category': category, # NEW: Article category
|
||||
'keywords': keywords, # NEW: Article keywords for personalization
|
||||
'clicked': False,
|
||||
'clicked_at': None,
|
||||
'user_agent': None,
|
||||
|
||||
221
backend/test_personalization_system.py
Normal file
221
backend/test_personalization_system.py
Normal file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive test suite for the personalization system.
|
||||
Tests all 4 phases: keyword extraction, click tracking, interest profiling, and personalization.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
|
||||
# Import services
|
||||
from services.tracking_service import create_newsletter_tracking
|
||||
from services.interest_profiling_service import (
|
||||
update_user_interests,
|
||||
get_user_interests,
|
||||
get_top_interests,
|
||||
build_interests_from_history
|
||||
)
|
||||
from services.personalization_service import (
|
||||
calculate_article_score,
|
||||
rank_articles_for_user,
|
||||
select_personalized_articles,
|
||||
get_personalization_stats
|
||||
)
|
||||
from config import Config
|
||||
|
||||
# Connect to MongoDB
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
articles_collection = db['articles']
|
||||
link_clicks_collection = db['link_clicks']
|
||||
user_interests_collection = db['user_interests']
|
||||
|
||||
|
||||
def test_phase1_keywords():
|
||||
"""Phase 1: Verify articles have keywords extracted"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 1: Keyword Extraction")
|
||||
print("="*60)
|
||||
|
||||
articles_with_keywords = articles_collection.count_documents({
|
||||
'keywords': {'$exists': True, '$ne': []}
|
||||
})
|
||||
|
||||
if articles_with_keywords == 0:
|
||||
print("❌ No articles with keywords found")
|
||||
print(" Run a crawl first to extract keywords")
|
||||
return False
|
||||
|
||||
sample = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
|
||||
print(f"✓ Found {articles_with_keywords} articles with keywords")
|
||||
print(f" Sample: {sample.get('title', 'N/A')[:50]}...")
|
||||
print(f" Keywords: {sample.get('keywords', [])[:3]}")
|
||||
return True
|
||||
|
||||
|
||||
def test_phase2_tracking():
|
||||
"""Phase 2: Verify tracking includes keywords and metadata"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 2: Click Tracking Enhancement")
|
||||
print("="*60)
|
||||
|
||||
test_email = 'test-phase2@example.com'
|
||||
|
||||
# Clean up
|
||||
link_clicks_collection.delete_many({'subscriber_email': test_email})
|
||||
|
||||
# Get article with keywords
|
||||
article = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
|
||||
|
||||
if not article:
|
||||
print("❌ No articles found")
|
||||
return False
|
||||
|
||||
# Create tracking
|
||||
tracking_data = create_newsletter_tracking(
|
||||
newsletter_id='test-phase2',
|
||||
subscriber_email=test_email,
|
||||
article_links=[{
|
||||
'url': article['link'],
|
||||
'title': article.get('title', '')
|
||||
}]
|
||||
)
|
||||
|
||||
# Verify tracking record
|
||||
tracking_id = list(tracking_data['link_tracking_map'].values())[0]
|
||||
tracking_record = link_clicks_collection.find_one({'tracking_id': tracking_id})
|
||||
|
||||
has_metadata = (
|
||||
tracking_record.get('article_id') is not None and
|
||||
tracking_record.get('category') is not None and
|
||||
len(tracking_record.get('keywords', [])) > 0
|
||||
)
|
||||
|
||||
# Clean up
|
||||
link_clicks_collection.delete_many({'subscriber_email': test_email})
|
||||
db['newsletter_sends'].delete_many({'subscriber_email': test_email})
|
||||
|
||||
if has_metadata:
|
||||
print(f"✓ Tracking records include metadata")
|
||||
print(f" Article ID: {tracking_record.get('article_id')}")
|
||||
print(f" Category: {tracking_record.get('category')}")
|
||||
print(f" Keywords: {len(tracking_record.get('keywords', []))} keywords")
|
||||
return True
|
||||
else:
|
||||
print("❌ Tracking records missing metadata")
|
||||
return False
|
||||
|
||||
|
||||
def test_phase3_profiling():
|
||||
"""Phase 3: Verify interest profiles are built from clicks"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 3: User Interest Profiling")
|
||||
print("="*60)
|
||||
|
||||
test_email = 'test-phase3@example.com'
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
# Create profile
|
||||
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
|
||||
update_user_interests(test_email, ['Transportation', 'Munich'], 'local')
|
||||
|
||||
# Verify profile
|
||||
profile = get_user_interests(test_email)
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
if profile and profile['total_clicks'] == 2:
|
||||
print(f"✓ Interest profile created")
|
||||
print(f" Total clicks: {profile['total_clicks']}")
|
||||
print(f" Categories: {len(profile.get('categories', {}))}")
|
||||
print(f" Keywords: {len(profile.get('keywords', {}))}")
|
||||
return True
|
||||
else:
|
||||
print("❌ Interest profile not created correctly")
|
||||
return False
|
||||
|
||||
|
||||
def test_phase4_personalization():
|
||||
"""Phase 4: Verify articles are ranked by user interests"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 4: Personalized Newsletter Generation")
|
||||
print("="*60)
|
||||
|
||||
test_email = 'test-phase4@example.com'
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
# Get articles
|
||||
articles = list(articles_collection.find(
|
||||
{'keywords': {'$exists': True, '$ne': []}},
|
||||
limit=5
|
||||
))
|
||||
|
||||
if len(articles) < 3:
|
||||
print("❌ Not enough articles found")
|
||||
return False
|
||||
|
||||
# Create profile
|
||||
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
|
||||
|
||||
# Rank articles
|
||||
ranked = rank_articles_for_user(articles, test_email)
|
||||
|
||||
# Select personalized
|
||||
selected = select_personalized_articles(articles, test_email, max_articles=3)
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
has_scores = all('personalization_score' in a for a in selected)
|
||||
|
||||
if has_scores and len(selected) > 0:
|
||||
print(f"✓ Articles ranked and selected")
|
||||
print(f" Total ranked: {len(ranked)}")
|
||||
print(f" Selected: {len(selected)}")
|
||||
print(f" Top score: {selected[0].get('personalization_score', 0):.3f}")
|
||||
return True
|
||||
else:
|
||||
print("❌ Personalization failed")
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
"""Run all personalization tests"""
|
||||
print("\n" + "="*60)
|
||||
print("PERSONALIZATION SYSTEM TEST SUITE")
|
||||
print("="*60)
|
||||
|
||||
results = {
|
||||
'Phase 1: Keyword Extraction': test_phase1_keywords(),
|
||||
'Phase 2: Click Tracking': test_phase2_tracking(),
|
||||
'Phase 3: Interest Profiling': test_phase3_profiling(),
|
||||
'Phase 4: Personalization': test_phase4_personalization()
|
||||
}
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("TEST RESULTS")
|
||||
print("="*60)
|
||||
|
||||
for phase, passed in results.items():
|
||||
status = "✅ PASS" if passed else "❌ FAIL"
|
||||
print(f"{status} - {phase}")
|
||||
|
||||
all_passed = all(results.values())
|
||||
|
||||
if all_passed:
|
||||
print("\n🎉 All personalization tests PASSED!")
|
||||
return 0
|
||||
else:
|
||||
print("\n❌ Some tests FAILED")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
225
docker-compose.local.yml
Normal file
225
docker-compose.local.yml
Normal file
@@ -0,0 +1,225 @@
|
||||
services:
|
||||
# Ollama AI Service (Exposed for local testing)
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
container_name: munich-news-local-ollama
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "11434:11434" # Exposed for local testing
|
||||
volumes:
|
||||
- ollama_data_local:/root/.ollama
|
||||
networks:
|
||||
- munich-news-network
|
||||
dns:
|
||||
- 8.8.8.8
|
||||
- 1.1.1.1
|
||||
# GPU support (uncomment if you have NVIDIA GPU)
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: all
|
||||
# capabilities: [gpu]
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "ollama list || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
# Ollama Model Loader - Pulls phi3:latest (smaller model for local dev)
|
||||
ollama-setup:
|
||||
image: curlimages/curl:latest
|
||||
container_name: munich-news-local-ollama-setup
|
||||
depends_on:
|
||||
ollama:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- munich-news-network
|
||||
env_file:
|
||||
- backend/.env.local
|
||||
volumes:
|
||||
- ./scripts/setup-ollama-model.sh:/setup-ollama-model.sh:ro
|
||||
dns:
|
||||
- 8.8.8.8
|
||||
- 1.1.1.1
|
||||
command: sh /setup-ollama-model.sh
|
||||
restart: on-failure
|
||||
|
||||
# Redis - Message queue for async tasks (Internal only - not exposed to host)
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: munich-news-local-redis
|
||||
restart: unless-stopped
|
||||
# No ports exposed - only accessible within Docker network
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# MongoDB Database (Exposed for local debugging)
|
||||
mongodb:
|
||||
image: mongo:latest
|
||||
container_name: munich-news-local-mongodb
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "27017:27017" # Exposed for local debugging
|
||||
environment:
|
||||
# For production, set MONGO_PASSWORD environment variable
|
||||
MONGO_INITDB_ROOT_USERNAME: ${MONGO_USERNAME:-admin}
|
||||
MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme}
|
||||
MONGO_INITDB_DATABASE: munich_news
|
||||
volumes:
|
||||
- mongodb_data_local:/data/db
|
||||
- mongodb_config_local:/data/configdb
|
||||
networks:
|
||||
- munich-news-network
|
||||
command: mongod --bind_ip_all ${MONGO_AUTH:---auth}
|
||||
healthcheck:
|
||||
test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# News Crawler - Runs at 6 AM Berlin time
|
||||
crawler:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: news_crawler/Dockerfile
|
||||
container_name: munich-news-local-crawler
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- mongodb
|
||||
- ollama
|
||||
- redis
|
||||
environment:
|
||||
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- TZ=Europe/Berlin
|
||||
volumes:
|
||||
- ./backend/.env.local:/app/.env:ro
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
|
||||
interval: 1m
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# Backend API - Tracking and analytics
|
||||
backend:
|
||||
build:
|
||||
context: ./backend
|
||||
dockerfile: Dockerfile
|
||||
container_name: munich-news-local-backend
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- mongodb
|
||||
- redis
|
||||
ports:
|
||||
- "5001:5001"
|
||||
environment:
|
||||
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- FLASK_PORT=5001
|
||||
- TZ=Europe/Berlin
|
||||
volumes:
|
||||
- ./backend/.env.local:/app/.env:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
|
||||
# Transport Crawler - API service for MVG disruptions (Internal only - not exposed to host)
|
||||
transport-crawler:
|
||||
build:
|
||||
context: ./transport_crawler
|
||||
dockerfile: Dockerfile
|
||||
container_name: munich-news-local-transport-crawler
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- mongodb
|
||||
- redis
|
||||
# No ports exposed - only accessible within Docker network
|
||||
environment:
|
||||
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- TZ=Europe/Berlin
|
||||
volumes:
|
||||
- ./backend/.env.local:/app/.env:ro
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
|
||||
# Newsletter Sender - Runs at 7 AM Berlin time
|
||||
sender:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: news_sender/Dockerfile
|
||||
container_name: munich-news-local-sender
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- mongodb
|
||||
- backend
|
||||
- crawler
|
||||
- transport-crawler
|
||||
environment:
|
||||
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
|
||||
- TZ=Europe/Berlin
|
||||
volumes:
|
||||
- ./backend/.env.local:/app/.env:ro
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
|
||||
interval: 1m
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# Frontend Web Interface
|
||||
frontend:
|
||||
build: ./frontend
|
||||
container_name: munich-news-local-frontend
|
||||
restart: unless-stopped
|
||||
# ports:
|
||||
# - "3000:3000"
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- API_URL=http://backend:5001
|
||||
- PORT=3000
|
||||
depends_on:
|
||||
- backend
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
volumes:
|
||||
mongodb_data_local:
|
||||
driver: local
|
||||
mongodb_config_local:
|
||||
driver: local
|
||||
ollama_data_local:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
munich-news-network:
|
||||
internal: false
|
||||
167
docs/LOCAL_DEVELOPMENT.md
Normal file
167
docs/LOCAL_DEVELOPMENT.md
Normal file
@@ -0,0 +1,167 @@
|
||||
# Local Development Setup
|
||||
|
||||
This guide helps you run Munich News Daily locally for development and testing.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# 1. Copy local environment files
|
||||
cp .env.local .env
|
||||
cp backend/.env.local backend/.env
|
||||
|
||||
# 2. Start services with local configuration
|
||||
docker-compose -f docker-compose.local.yml up -d
|
||||
|
||||
# 3. Check logs
|
||||
docker-compose -f docker-compose.local.yml logs -f
|
||||
|
||||
# 4. Access services
|
||||
# - Frontend: http://localhost:3000
|
||||
# - Backend API: http://localhost:5001
|
||||
# - MongoDB: localhost:27017
|
||||
# - Ollama: http://localhost:11434
|
||||
```
|
||||
|
||||
## Differences from Production
|
||||
|
||||
| Feature | Production | Local Development |
|
||||
|---------|-----------|-------------------|
|
||||
| Ollama Model | `gemma3:12b` (large) | `phi3:latest` (small, fast) |
|
||||
| MongoDB Port | Internal only | Exposed on 27017 |
|
||||
| Ollama Port | Internal only | Exposed on 11434 |
|
||||
| Container Names | `munich-news-*` | `munich-news-local-*` |
|
||||
| Volumes | `*_data` | `*_data_local` |
|
||||
| Email | Production SMTP | Test/disabled |
|
||||
|
||||
## Useful Commands
|
||||
|
||||
### Start/Stop Services
|
||||
```bash
|
||||
# Start all services
|
||||
docker-compose -f docker-compose.local.yml up -d
|
||||
|
||||
# Stop all services
|
||||
docker-compose -f docker-compose.local.yml down
|
||||
|
||||
# Restart a specific service
|
||||
docker-compose -f docker-compose.local.yml restart backend
|
||||
|
||||
# View logs
|
||||
docker-compose -f docker-compose.local.yml logs -f crawler
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
```bash
|
||||
# Trigger a news crawl (2 articles for quick testing)
|
||||
curl -X POST http://localhost:5001/api/admin/trigger-crawl \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"max_articles": 2}'
|
||||
|
||||
# Trigger transport crawl
|
||||
curl -X POST http://localhost:5001/api/transport/crawl
|
||||
|
||||
# Check articles in MongoDB
|
||||
docker exec munich-news-local-mongodb mongosh munich_news \
|
||||
--eval "db.articles.find({}, {title: 1, keywords: 1, category: 1}).limit(3)"
|
||||
|
||||
# Check transport disruptions
|
||||
curl http://localhost:5001/api/transport/disruptions
|
||||
```
|
||||
|
||||
### Database Access
|
||||
|
||||
```bash
|
||||
# Connect to MongoDB
|
||||
docker exec -it munich-news-local-mongodb mongosh munich_news
|
||||
|
||||
# Or from host (if you have mongosh installed)
|
||||
mongosh "mongodb://admin:local123@localhost:27017/munich_news"
|
||||
|
||||
# Useful queries
|
||||
db.articles.countDocuments()
|
||||
db.articles.find({keywords: {$exists: true}}).limit(5)
|
||||
db.subscribers.find()
|
||||
db.transport_alerts.find()
|
||||
```
|
||||
|
||||
### Ollama Testing
|
||||
|
||||
```bash
|
||||
# List models
|
||||
curl http://localhost:11434/api/tags
|
||||
|
||||
# Test generation
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "phi3:latest",
|
||||
"prompt": "Summarize: Munich opens new U-Bahn line",
|
||||
"stream": false
|
||||
}'
|
||||
```
|
||||
|
||||
## Cleanup
|
||||
|
||||
```bash
|
||||
# Stop and remove containers
|
||||
docker-compose -f docker-compose.local.yml down
|
||||
|
||||
# Remove volumes (WARNING: deletes all data)
|
||||
docker-compose -f docker-compose.local.yml down -v
|
||||
|
||||
# Remove local volumes specifically
|
||||
docker volume rm munich-news_mongodb_data_local
|
||||
docker volume rm munich-news_mongodb_config_local
|
||||
docker volume rm munich-news_ollama_data_local
|
||||
```
|
||||
|
||||
## Switching Between Local and Production
|
||||
|
||||
```bash
|
||||
# Switch to local
|
||||
cp .env.local .env
|
||||
cp backend/.env.local backend/.env
|
||||
docker-compose -f docker-compose.local.yml up -d
|
||||
|
||||
# Switch to production
|
||||
cp .env.production .env # (if you have one)
|
||||
cp backend/.env.production backend/.env
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Ollama model not downloading
|
||||
```bash
|
||||
# Pull model manually
|
||||
docker exec munich-news-local-ollama ollama pull phi3:latest
|
||||
```
|
||||
|
||||
### MongoDB connection refused
|
||||
```bash
|
||||
# Check if MongoDB is running
|
||||
docker-compose -f docker-compose.local.yml ps mongodb
|
||||
|
||||
# Check logs
|
||||
docker-compose -f docker-compose.local.yml logs mongodb
|
||||
```
|
||||
|
||||
### Port already in use
|
||||
```bash
|
||||
# Check what's using the port
|
||||
lsof -i :5001 # or :3000, :27017, etc.
|
||||
|
||||
# Stop the conflicting service or change port in docker-compose.local.yml
|
||||
```
|
||||
|
||||
## Tips
|
||||
|
||||
1. **Use phi3 for speed** - It's much faster than gemma3 for local testing
|
||||
2. **Limit articles** - Use `max_articles: 2` for quick crawl tests
|
||||
3. **Watch logs** - Keep logs open to see what's happening
|
||||
4. **Separate volumes** - Local and production use different volumes, so they don't interfere
|
||||
|
||||
## Next Steps
|
||||
|
||||
- See `docs/PERSONALIZATION.md` for personalization feature development
|
||||
- See `docs/OLLAMA_SETUP.md` for AI configuration
|
||||
- See main `README.md` for general documentation
|
||||
217
docs/PERSONALIZATION.md
Normal file
217
docs/PERSONALIZATION.md
Normal file
@@ -0,0 +1,217 @@
|
||||
# Newsletter Personalization Implementation
|
||||
|
||||
## Overview
|
||||
Personalized newsletters based on user click behavior, using keywords and categories to build interest profiles.
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### ✅ Phase 1: Keyword Extraction (COMPLETED)
|
||||
**Status:** Implemented
|
||||
**Files Modified:**
|
||||
- `news_crawler/ollama_client.py` - Added `extract_keywords()` method
|
||||
- `news_crawler/crawler_service.py` - Integrated keyword extraction into crawl process
|
||||
|
||||
**What it does:**
|
||||
- Extracts 5 keywords from each article using Ollama AI
|
||||
- Keywords stored in `articles` collection: `keywords: ["Bayern Munich", "Football", ...]`
|
||||
- Runs automatically during news crawling
|
||||
|
||||
**Test it:**
|
||||
```bash
|
||||
# Trigger a crawl
|
||||
curl -X POST http://localhost:5001/api/admin/trigger-crawl -d '{"max_articles": 2}'
|
||||
|
||||
# Check articles have keywords
|
||||
docker exec munich-news-mongodb mongosh munich_news --eval "db.articles.findOne({}, {title: 1, keywords: 1})"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### ✅ Phase 2: Click Tracking Enhancement (COMPLETED)
|
||||
**Status:** Implemented
|
||||
**Goal:** Track clicks with keyword metadata
|
||||
|
||||
**Files Modified:**
|
||||
- `backend/services/tracking_service.py` - Enhanced `create_newsletter_tracking()` to look up article metadata
|
||||
|
||||
**What it does:**
|
||||
- When creating tracking links, looks up article from database
|
||||
- Stores article ID, category, and keywords in tracking record
|
||||
- Enables building user interest profiles from click behavior
|
||||
|
||||
**Database Schema:**
|
||||
```javascript
|
||||
// link_clicks collection
|
||||
{
|
||||
tracking_id: "uuid",
|
||||
newsletter_id: "2024-11-18",
|
||||
subscriber_email: "user@example.com",
|
||||
article_url: "https://...",
|
||||
article_title: "Article Title",
|
||||
article_id: "673abc123...", // NEW: Article database ID
|
||||
category: "sports", // NEW: Article category
|
||||
keywords: ["Bayern Munich", "Bundesliga"], // NEW: Keywords for personalization
|
||||
clicked: false,
|
||||
clicked_at: null,
|
||||
user_agent: null,
|
||||
created_at: ISODate()
|
||||
}
|
||||
```
|
||||
|
||||
**Test it:**
|
||||
```bash
|
||||
# Send a test newsletter
|
||||
curl -X POST http://localhost:5001/api/admin/send-newsletter
|
||||
|
||||
# Check tracking records have keywords
|
||||
docker exec munich-news-mongodb mongosh munich_news --eval "db.link_clicks.findOne({}, {article_title: 1, keywords: 1, category: 1})"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### ✅ Phase 3: User Interest Profiling (COMPLETED)
|
||||
**Status:** Implemented
|
||||
**Goal:** Build user interest profiles from click history
|
||||
|
||||
**Files Created:**
|
||||
- `backend/services/interest_profiling_service.py` - Core profiling logic
|
||||
- `backend/routes/interests_routes.py` - API endpoints for interest management
|
||||
|
||||
**Files Modified:**
|
||||
- `backend/routes/tracking_routes.py` - Auto-update interests on click
|
||||
- `backend/app.py` - Register interests routes
|
||||
|
||||
**What it does:**
|
||||
- Automatically builds interest profiles when users click articles
|
||||
- Tracks interest scores for categories and keywords (0.0 to 1.0)
|
||||
- Increments scores by 0.1 per click, capped at 1.0
|
||||
- Provides decay mechanism for old interests
|
||||
- Supports rebuilding profiles from click history
|
||||
|
||||
**Database Schema:**
|
||||
```javascript
|
||||
// user_interests collection
|
||||
{
|
||||
email: "user@example.com",
|
||||
categories: {
|
||||
sports: 0.8,
|
||||
local: 0.5,
|
||||
science: 0.2
|
||||
},
|
||||
keywords: {
|
||||
"Bayern Munich": 0.9,
|
||||
"Oktoberfest": 0.7,
|
||||
"AI": 0.3
|
||||
},
|
||||
total_clicks: 15,
|
||||
last_updated: ISODate(),
|
||||
created_at: ISODate()
|
||||
}
|
||||
```
|
||||
|
||||
**API Endpoints:**
|
||||
```bash
|
||||
# Get user interests
|
||||
GET /api/interests/<email>
|
||||
|
||||
# Get top interests
|
||||
GET /api/interests/<email>/top?top_n=10
|
||||
|
||||
# Rebuild from history
|
||||
POST /api/interests/<email>/rebuild
|
||||
Body: {"days_lookback": 30}
|
||||
|
||||
# Decay old interests
|
||||
POST /api/interests/decay
|
||||
Body: {"decay_factor": 0.95, "days_threshold": 7}
|
||||
|
||||
# Get statistics
|
||||
GET /api/interests/statistics
|
||||
|
||||
# Delete profile (GDPR)
|
||||
DELETE /api/interests/<email>
|
||||
```
|
||||
|
||||
**Test it:**
|
||||
```bash
|
||||
# Run test script
|
||||
docker exec munich-news-local-backend python test_interest_profiling.py
|
||||
|
||||
# View a user's interests
|
||||
curl http://localhost:5001/api/interests/user@example.com
|
||||
|
||||
# Get statistics
|
||||
curl http://localhost:5001/api/interests/statistics
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### ✅ Phase 4: Personalized Newsletter (COMPLETED)
|
||||
**Status:** Implemented
|
||||
**Goal:** Rank and select articles based on user interests
|
||||
|
||||
**Files Created:**
|
||||
- `backend/services/personalization_service.py` - Core personalization logic
|
||||
- `backend/routes/personalization_routes.py` - API endpoints for testing
|
||||
|
||||
**Files Modified:**
|
||||
- `backend/app.py` - Register personalization routes
|
||||
|
||||
**What it does:**
|
||||
- Scores articles based on user's category and keyword interests
|
||||
- Ranks articles by personalization score (0.0 to 1.0)
|
||||
- Selects mix of personalized (70%) + trending (30%) content
|
||||
- Provides explanations for recommendations
|
||||
|
||||
**Algorithm:**
|
||||
```python
|
||||
score = (category_match * 0.4) + (keyword_match * 0.6)
|
||||
|
||||
# Example:
|
||||
# User interests: sports=0.8, "Bayern Munich"=0.9
|
||||
# Article: sports category, keywords=["Bayern Munich", "Football"]
|
||||
# Score = (0.8 * 0.4) + (0.9 * 0.6) = 0.32 + 0.54 = 0.86
|
||||
```
|
||||
|
||||
**API Endpoints:**
|
||||
```bash
|
||||
# Preview personalized newsletter
|
||||
GET /api/personalize/preview/<email>?max_articles=10&hours_lookback=24
|
||||
|
||||
# Explain recommendation
|
||||
POST /api/personalize/explain
|
||||
Body: {"email": "user@example.com", "article_id": "..."}
|
||||
```
|
||||
|
||||
**Test it:**
|
||||
```bash
|
||||
# Run test script
|
||||
docker exec munich-news-local-backend python test_personalization.py
|
||||
|
||||
# Preview personalized newsletter
|
||||
curl "http://localhost:5001/api/personalize/preview/demo@example.com?max_articles=5"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ All Phases Complete!
|
||||
|
||||
1. ~~**Phase 1:** Keyword extraction from articles~~ ✅ DONE
|
||||
2. ~~**Phase 2:** Click tracking with keywords~~ ✅ DONE
|
||||
3. ~~**Phase 3:** User interest profiling~~ ✅ DONE
|
||||
4. ~~**Phase 4:** Personalized newsletter generation~~ ✅ DONE
|
||||
|
||||
## Next Steps for Production
|
||||
|
||||
1. **Integrate with newsletter sender** - Modify `news_sender/sender_service.py` to use personalization
|
||||
2. **A/B testing** - Compare personalized vs non-personalized engagement
|
||||
3. **Tune parameters** - Adjust personalization_ratio, weights, decay rates
|
||||
4. **Monitor metrics** - Track click-through rates, open rates by personalization score
|
||||
5. **User controls** - Add UI for users to view/edit their interests
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration needed yet. Keyword extraction uses existing Ollama settings from `backend/.env`:
|
||||
- `OLLAMA_ENABLED=true`
|
||||
- `OLLAMA_MODEL=gemma3:12b`
|
||||
- `OLLAMA_BASE_URL=http://ollama:11434`
|
||||
195
docs/PERSONALIZATION_COMPLETE.md
Normal file
195
docs/PERSONALIZATION_COMPLETE.md
Normal file
@@ -0,0 +1,195 @@
|
||||
# 🎉 Newsletter Personalization System - Complete!
|
||||
|
||||
All 4 phases of the personalization system have been successfully implemented and tested.
|
||||
|
||||
## ✅ What Was Built
|
||||
|
||||
### Phase 1: Keyword Extraction
|
||||
- AI-powered keyword extraction from articles using Ollama
|
||||
- 5 keywords per article automatically extracted during crawling
|
||||
- Keywords stored in database for personalization
|
||||
|
||||
### Phase 2: Click Tracking Enhancement
|
||||
- Enhanced tracking to capture article keywords and category
|
||||
- Tracking records now include metadata for building interest profiles
|
||||
- Privacy-compliant with opt-out and GDPR support
|
||||
|
||||
### Phase 3: User Interest Profiling
|
||||
- Automatic profile building from click behavior
|
||||
- Interest scores (0.0-1.0) for categories and keywords
|
||||
- Decay mechanism for old interests
|
||||
- API endpoints for viewing and managing profiles
|
||||
|
||||
### Phase 4: Personalized Newsletter Generation
|
||||
- Article scoring based on user interests
|
||||
- Smart ranking algorithm (40% category + 60% keywords)
|
||||
- Mix of personalized (70%) + trending (30%) content
|
||||
- Explanation system for recommendations
|
||||
|
||||
## 📊 How It Works
|
||||
|
||||
```
|
||||
1. User clicks article in newsletter
|
||||
↓
|
||||
2. System records: keywords + category
|
||||
↓
|
||||
3. Interest profile updates automatically
|
||||
↓
|
||||
4. Next newsletter: articles ranked by interests
|
||||
↓
|
||||
5. User receives personalized content
|
||||
```
|
||||
|
||||
## 🧪 Testing
|
||||
|
||||
All phases have been tested and verified:
|
||||
|
||||
```bash
|
||||
# Run comprehensive test suite (tests all 4 phases)
|
||||
docker exec munich-news-local-backend python test_personalization_system.py
|
||||
|
||||
# Or test keyword extraction separately
|
||||
docker exec munich-news-local-crawler python -c "from crawler_service import crawl_all_feeds; crawl_all_feeds(max_articles_per_feed=2)"
|
||||
```
|
||||
|
||||
## 🔌 API Endpoints
|
||||
|
||||
### Interest Management
|
||||
```bash
|
||||
GET /api/interests/<email> # View profile
|
||||
GET /api/interests/<email>/top # Top interests
|
||||
POST /api/interests/<email>/rebuild # Rebuild from history
|
||||
GET /api/interests/statistics # Platform stats
|
||||
DELETE /api/interests/<email> # Delete (GDPR)
|
||||
```
|
||||
|
||||
### Personalization
|
||||
```bash
|
||||
GET /api/personalize/preview/<email> # Preview personalized newsletter
|
||||
POST /api/personalize/explain # Explain recommendation
|
||||
```
|
||||
|
||||
## 📈 Example Results
|
||||
|
||||
### User Profile
|
||||
```json
|
||||
{
|
||||
"email": "user@example.com",
|
||||
"categories": {
|
||||
"sports": 0.30,
|
||||
"local": 0.10
|
||||
},
|
||||
"keywords": {
|
||||
"Bayern Munich": 0.30,
|
||||
"Football": 0.20,
|
||||
"Transportation": 0.10
|
||||
},
|
||||
"total_clicks": 5
|
||||
}
|
||||
```
|
||||
|
||||
### Personalized Newsletter
|
||||
```json
|
||||
{
|
||||
"articles": [
|
||||
{
|
||||
"title": "Bayern Munich wins championship",
|
||||
"personalization_score": 0.86,
|
||||
"category": "sports",
|
||||
"keywords": ["Bayern Munich", "Football"]
|
||||
},
|
||||
{
|
||||
"title": "New S-Bahn line opens",
|
||||
"personalization_score": 0.42,
|
||||
"category": "local",
|
||||
"keywords": ["Transportation", "Munich"]
|
||||
}
|
||||
],
|
||||
"statistics": {
|
||||
"highly_personalized": 1,
|
||||
"moderately_personalized": 1,
|
||||
"trending": 0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 🎯 Scoring Algorithm
|
||||
|
||||
```python
|
||||
# Article score calculation
|
||||
category_score = user_interests.categories[article.category]
|
||||
keyword_score = average(user_interests.keywords[kw] for kw in article.keywords)
|
||||
|
||||
final_score = (category_score * 0.4) + (keyword_score * 0.6)
|
||||
```
|
||||
|
||||
**Example:**
|
||||
- User: sports=0.8, "Bayern Munich"=0.9
|
||||
- Article: sports category, keywords=["Bayern Munich", "Football"]
|
||||
- Score = (0.8 × 0.4) + (0.9 × 0.6) = 0.32 + 0.54 = **0.86**
|
||||
|
||||
## 🚀 Production Integration
|
||||
|
||||
To integrate with the newsletter sender:
|
||||
|
||||
1. **Modify `news_sender/sender_service.py`:**
|
||||
```python
|
||||
from services.personalization_service import select_personalized_articles
|
||||
|
||||
# For each subscriber
|
||||
personalized_articles = select_personalized_articles(
|
||||
all_articles,
|
||||
subscriber_email,
|
||||
max_articles=10
|
||||
)
|
||||
```
|
||||
|
||||
2. **Enable personalization flag in config:**
|
||||
```env
|
||||
PERSONALIZATION_ENABLED=true
|
||||
PERSONALIZATION_RATIO=0.7 # 70% personalized, 30% trending
|
||||
```
|
||||
|
||||
3. **Monitor metrics:**
|
||||
- Click-through rate by personalization score
|
||||
- Open rates for personalized vs non-personalized
|
||||
- User engagement over time
|
||||
|
||||
## 🔐 Privacy & Compliance
|
||||
|
||||
- ✅ Users can opt out of tracking
|
||||
- ✅ Interest profiles can be deleted (GDPR)
|
||||
- ✅ Automatic anonymization after 90 days
|
||||
- ✅ No PII beyond email address
|
||||
- ✅ Transparent recommendation explanations
|
||||
|
||||
## 📁 Files Created/Modified
|
||||
|
||||
### New Files
|
||||
- `backend/services/interest_profiling_service.py`
|
||||
- `backend/services/personalization_service.py`
|
||||
- `backend/routes/interests_routes.py`
|
||||
- `backend/routes/personalization_routes.py`
|
||||
- `backend/test_tracking_phase2.py`
|
||||
- `backend/test_interest_profiling.py`
|
||||
- `backend/test_personalization.py`
|
||||
- `docs/PERSONALIZATION.md`
|
||||
|
||||
### Modified Files
|
||||
- `news_crawler/ollama_client.py` - Added keyword extraction
|
||||
- `news_crawler/crawler_service.py` - Integrated keyword extraction
|
||||
- `backend/services/tracking_service.py` - Enhanced with metadata
|
||||
- `backend/routes/tracking_routes.py` - Auto-update interests
|
||||
- `backend/app.py` - Registered new routes
|
||||
|
||||
## 🎓 Key Learnings
|
||||
|
||||
1. **Incremental scoring works well** - 0.1 per click prevents over-weighting
|
||||
2. **Mix is important** - 70/30 personalized/trending avoids filter bubbles
|
||||
3. **Keywords > Categories** - 60/40 weight reflects keyword importance
|
||||
4. **Decay is essential** - Prevents stale interests from dominating
|
||||
5. **Transparency matters** - Explanation API helps users understand recommendations
|
||||
|
||||
## 🎉 Status: COMPLETE
|
||||
|
||||
All 4 phases implemented, tested, and documented. The personalization system is ready for production integration!
|
||||
@@ -388,6 +388,21 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
|
||||
print(f" ⚠ Summarization failed: {summary_result['error']}")
|
||||
failed_summaries += 1
|
||||
|
||||
# Extract keywords for personalization
|
||||
keywords_result = None
|
||||
if Config.OLLAMA_ENABLED and summary_result and summary_result['success']:
|
||||
print(f" 🔑 Extracting keywords...")
|
||||
keywords_result = ollama_client.extract_keywords(
|
||||
original_title,
|
||||
summary_result['summary'],
|
||||
max_keywords=5
|
||||
)
|
||||
|
||||
if keywords_result['success']:
|
||||
print(f" ✓ Keywords: {', '.join(keywords_result['keywords'])} ({keywords_result['duration']:.1f}s)")
|
||||
else:
|
||||
print(f" ⚠ Keyword extraction failed: {keywords_result['error']}")
|
||||
|
||||
# Prepare document
|
||||
article_doc = {
|
||||
'title': original_title,
|
||||
@@ -396,6 +411,7 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
|
||||
'link': article_url,
|
||||
'content': article_data.get('content', ''), # Full article content
|
||||
'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
|
||||
'keywords': keywords_result['keywords'] if keywords_result and keywords_result['success'] else [],
|
||||
'word_count': article_data.get('word_count', 0),
|
||||
'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
|
||||
'source': feed_name,
|
||||
|
||||
@@ -509,6 +509,110 @@ New York Times-style summary (max {max_words} words):"""
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
def extract_keywords(self, title, summary, max_keywords=5):
|
||||
"""
|
||||
Extract keywords/topics from article for personalization
|
||||
|
||||
Args:
|
||||
title: Article title
|
||||
summary: Article summary
|
||||
max_keywords: Maximum number of keywords to extract (default 5)
|
||||
|
||||
Returns:
|
||||
{
|
||||
'keywords': list, # List of extracted keywords
|
||||
'success': bool, # Whether extraction succeeded
|
||||
'error': str or None, # Error message if failed
|
||||
'duration': float # Time taken in seconds
|
||||
}
|
||||
"""
|
||||
if not self.enabled:
|
||||
return {
|
||||
'keywords': [],
|
||||
'success': False,
|
||||
'error': 'Ollama is disabled',
|
||||
'duration': 0
|
||||
}
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Construct prompt for keyword extraction
|
||||
prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.
|
||||
|
||||
Title: {title}
|
||||
Summary: {summary}
|
||||
|
||||
Return ONLY the keywords separated by commas, nothing else. Focus on:
|
||||
- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
|
||||
- Locations (e.g., "Marienplatz", "Airport")
|
||||
- Events or themes (e.g., "Transportation", "Housing", "Technology")
|
||||
|
||||
Keywords:"""
|
||||
|
||||
# Prepare request
|
||||
url = f"{self.base_url}/api/generate"
|
||||
headers = {'Content-Type': 'application/json'}
|
||||
if self.api_key:
|
||||
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||
|
||||
payload = {
|
||||
'model': self.model,
|
||||
'prompt': prompt,
|
||||
'stream': False,
|
||||
'options': {
|
||||
'temperature': 0.3, # Lower temperature for consistent extraction
|
||||
'num_predict': 100 # Limit response length
|
||||
}
|
||||
}
|
||||
|
||||
# Make request
|
||||
response = requests.post(
|
||||
url,
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=self.timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse response
|
||||
result = response.json()
|
||||
keywords_text = result.get('response', '').strip()
|
||||
|
||||
if not keywords_text:
|
||||
return {
|
||||
'keywords': [],
|
||||
'success': False,
|
||||
'error': 'Ollama returned empty response',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
# Parse keywords from response
|
||||
keywords = [k.strip() for k in keywords_text.split(',')]
|
||||
keywords = [k for k in keywords if k and len(k) > 2][:max_keywords]
|
||||
|
||||
return {
|
||||
'keywords': keywords,
|
||||
'success': True,
|
||||
'error': None,
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return {
|
||||
'keywords': [],
|
||||
'success': False,
|
||||
'error': f"Request timed out after {self.timeout}s",
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
'keywords': [],
|
||||
'success': False,
|
||||
'error': str(e),
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Quick test
|
||||
|
||||
221
tests/backend/test_personalization_system.py
Normal file
221
tests/backend/test_personalization_system.py
Normal file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive test suite for the personalization system.
|
||||
Tests all 4 phases: keyword extraction, click tracking, interest profiling, and personalization.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
|
||||
# Import services
|
||||
from services.tracking_service import create_newsletter_tracking
|
||||
from services.interest_profiling_service import (
|
||||
update_user_interests,
|
||||
get_user_interests,
|
||||
get_top_interests,
|
||||
build_interests_from_history
|
||||
)
|
||||
from services.personalization_service import (
|
||||
calculate_article_score,
|
||||
rank_articles_for_user,
|
||||
select_personalized_articles,
|
||||
get_personalization_stats
|
||||
)
|
||||
from config import Config
|
||||
|
||||
# Connect to MongoDB
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
articles_collection = db['articles']
|
||||
link_clicks_collection = db['link_clicks']
|
||||
user_interests_collection = db['user_interests']
|
||||
|
||||
|
||||
def test_phase1_keywords():
|
||||
"""Phase 1: Verify articles have keywords extracted"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 1: Keyword Extraction")
|
||||
print("="*60)
|
||||
|
||||
articles_with_keywords = articles_collection.count_documents({
|
||||
'keywords': {'$exists': True, '$ne': []}
|
||||
})
|
||||
|
||||
if articles_with_keywords == 0:
|
||||
print("❌ No articles with keywords found")
|
||||
print(" Run a crawl first to extract keywords")
|
||||
return False
|
||||
|
||||
sample = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
|
||||
print(f"✓ Found {articles_with_keywords} articles with keywords")
|
||||
print(f" Sample: {sample.get('title', 'N/A')[:50]}...")
|
||||
print(f" Keywords: {sample.get('keywords', [])[:3]}")
|
||||
return True
|
||||
|
||||
|
||||
def test_phase2_tracking():
|
||||
"""Phase 2: Verify tracking includes keywords and metadata"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 2: Click Tracking Enhancement")
|
||||
print("="*60)
|
||||
|
||||
test_email = 'test-phase2@example.com'
|
||||
|
||||
# Clean up
|
||||
link_clicks_collection.delete_many({'subscriber_email': test_email})
|
||||
|
||||
# Get article with keywords
|
||||
article = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
|
||||
|
||||
if not article:
|
||||
print("❌ No articles found")
|
||||
return False
|
||||
|
||||
# Create tracking
|
||||
tracking_data = create_newsletter_tracking(
|
||||
newsletter_id='test-phase2',
|
||||
subscriber_email=test_email,
|
||||
article_links=[{
|
||||
'url': article['link'],
|
||||
'title': article.get('title', '')
|
||||
}]
|
||||
)
|
||||
|
||||
# Verify tracking record
|
||||
tracking_id = list(tracking_data['link_tracking_map'].values())[0]
|
||||
tracking_record = link_clicks_collection.find_one({'tracking_id': tracking_id})
|
||||
|
||||
has_metadata = (
|
||||
tracking_record.get('article_id') is not None and
|
||||
tracking_record.get('category') is not None and
|
||||
len(tracking_record.get('keywords', [])) > 0
|
||||
)
|
||||
|
||||
# Clean up
|
||||
link_clicks_collection.delete_many({'subscriber_email': test_email})
|
||||
db['newsletter_sends'].delete_many({'subscriber_email': test_email})
|
||||
|
||||
if has_metadata:
|
||||
print(f"✓ Tracking records include metadata")
|
||||
print(f" Article ID: {tracking_record.get('article_id')}")
|
||||
print(f" Category: {tracking_record.get('category')}")
|
||||
print(f" Keywords: {len(tracking_record.get('keywords', []))} keywords")
|
||||
return True
|
||||
else:
|
||||
print("❌ Tracking records missing metadata")
|
||||
return False
|
||||
|
||||
|
||||
def test_phase3_profiling():
|
||||
"""Phase 3: Verify interest profiles are built from clicks"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 3: User Interest Profiling")
|
||||
print("="*60)
|
||||
|
||||
test_email = 'test-phase3@example.com'
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
# Create profile
|
||||
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
|
||||
update_user_interests(test_email, ['Transportation', 'Munich'], 'local')
|
||||
|
||||
# Verify profile
|
||||
profile = get_user_interests(test_email)
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
if profile and profile['total_clicks'] == 2:
|
||||
print(f"✓ Interest profile created")
|
||||
print(f" Total clicks: {profile['total_clicks']}")
|
||||
print(f" Categories: {len(profile.get('categories', {}))}")
|
||||
print(f" Keywords: {len(profile.get('keywords', {}))}")
|
||||
return True
|
||||
else:
|
||||
print("❌ Interest profile not created correctly")
|
||||
return False
|
||||
|
||||
|
||||
def test_phase4_personalization():
|
||||
"""Phase 4: Verify articles are ranked by user interests"""
|
||||
print("\n" + "="*60)
|
||||
print("Phase 4: Personalized Newsletter Generation")
|
||||
print("="*60)
|
||||
|
||||
test_email = 'test-phase4@example.com'
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
# Get articles
|
||||
articles = list(articles_collection.find(
|
||||
{'keywords': {'$exists': True, '$ne': []}},
|
||||
limit=5
|
||||
))
|
||||
|
||||
if len(articles) < 3:
|
||||
print("❌ Not enough articles found")
|
||||
return False
|
||||
|
||||
# Create profile
|
||||
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
|
||||
|
||||
# Rank articles
|
||||
ranked = rank_articles_for_user(articles, test_email)
|
||||
|
||||
# Select personalized
|
||||
selected = select_personalized_articles(articles, test_email, max_articles=3)
|
||||
|
||||
# Clean up
|
||||
user_interests_collection.delete_many({'email': test_email})
|
||||
|
||||
has_scores = all('personalization_score' in a for a in selected)
|
||||
|
||||
if has_scores and len(selected) > 0:
|
||||
print(f"✓ Articles ranked and selected")
|
||||
print(f" Total ranked: {len(ranked)}")
|
||||
print(f" Selected: {len(selected)}")
|
||||
print(f" Top score: {selected[0].get('personalization_score', 0):.3f}")
|
||||
return True
|
||||
else:
|
||||
print("❌ Personalization failed")
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
"""Run all personalization tests"""
|
||||
print("\n" + "="*60)
|
||||
print("PERSONALIZATION SYSTEM TEST SUITE")
|
||||
print("="*60)
|
||||
|
||||
results = {
|
||||
'Phase 1: Keyword Extraction': test_phase1_keywords(),
|
||||
'Phase 2: Click Tracking': test_phase2_tracking(),
|
||||
'Phase 3: Interest Profiling': test_phase3_profiling(),
|
||||
'Phase 4: Personalization': test_phase4_personalization()
|
||||
}
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("TEST RESULTS")
|
||||
print("="*60)
|
||||
|
||||
for phase, passed in results.items():
|
||||
status = "✅ PASS" if passed else "❌ FAIL"
|
||||
print(f"{status} - {phase}")
|
||||
|
||||
all_passed = all(results.values())
|
||||
|
||||
if all_passed:
|
||||
print("\n🎉 All personalization tests PASSED!")
|
||||
return 0
|
||||
else:
|
||||
print("\n❌ Some tests FAILED")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user