This commit is contained in:
2025-11-18 14:45:41 +01:00
parent 2e80d64ff6
commit 84fce9a82c
19 changed files with 2437 additions and 3 deletions

9
.env.local Normal file
View File

@@ -0,0 +1,9 @@
# Munich News Daily - Local Development Environment Variables
# MongoDB Configuration
MONGO_USERNAME=admin
MONGO_PASSWORD=local123
MONGO_AUTH=--auth
# Ollama Model (use smaller/faster model for local dev)
OLLAMA_MODEL=phi3:latest

2
.gitignore vendored
View File

@@ -84,7 +84,9 @@ yarn.lock
.env.production.local
*.env
!.env.example
!.env.local
!backend/.env.example
!backend/.env.local
# ===================================
# Database

View File

@@ -7,6 +7,7 @@ A fully automated news aggregation and newsletter system that crawls Munich news
- **🤖 AI-Powered Clustering** - Automatically detects duplicate stories from different sources
- **📰 Neutral Summaries** - Combines multiple perspectives into balanced coverage
- **🎯 Smart Prioritization** - Shows most important stories first (multi-source coverage)
- **🎨 Personalized Newsletters** - AI-powered content recommendations based on user interests
- **📊 Engagement Tracking** - Open rates, click tracking, and analytics
- **⚡ GPU Acceleration** - 5-10x faster AI processing with GPU support
- **🔒 GDPR Compliant** - Privacy-first with data retention controls
@@ -365,6 +366,8 @@ curl -X POST http://localhost:5001/api/tracking/subscriber/user@example.com/opt-
### Core Features
- **[docs/AI_NEWS_AGGREGATION.md](docs/AI_NEWS_AGGREGATION.md)** - AI-powered clustering & neutral summaries
- **[docs/PERSONALIZATION.md](docs/PERSONALIZATION.md)** - Personalized newsletter system
- **[docs/PERSONALIZATION_COMPLETE.md](docs/PERSONALIZATION_COMPLETE.md)** - Personalization implementation guide
- **[docs/FEATURES.md](docs/FEATURES.md)** - Complete feature list
- **[docs/API.md](docs/API.md)** - API endpoints reference
@@ -399,6 +402,9 @@ docker-compose exec sender python tests/sender/test_tracking_integration.py
# Run backend tests
docker-compose exec backend python tests/backend/test_tracking.py
# Test personalization system (all 4 phases)
docker exec munich-news-local-backend python test_personalization_system.py
```
## 🚀 Production Deployment

30
backend/.env.local Normal file
View File

@@ -0,0 +1,30 @@
# Munich News Daily - Local Development Backend Configuration
# MongoDB Configuration
MONGODB_URI=mongodb://admin:changeme@mongodb:27017/
# Email Configuration (use test credentials or disable)
SMTP_SERVER=localhost
SMTP_PORT=587
EMAIL_USER=test@localhost
EMAIL_PASSWORD=test123
# Newsletter Settings
NEWSLETTER_MAX_ARTICLES=5
NEWSLETTER_HOURS_LOOKBACK=24
WEBSITE_URL=http://localhost:3000
# Tracking Configuration
TRACKING_ENABLED=true
TRACKING_API_URL=http://localhost:5001
TRACKING_DATA_RETENTION_DAYS=90
# Ollama Configuration (AI Summarization)
OLLAMA_ENABLED=true
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_MODEL=phi3:latest
OLLAMA_TIMEOUT=120
SUMMARY_MAX_WORDS=150
# Flask Server Configuration
FLASK_PORT=5001

View File

@@ -11,6 +11,8 @@ from routes.tracking_routes import tracking_bp
from routes.analytics_routes import analytics_bp
from routes.admin_routes import admin_bp
from routes.transport_routes import transport_bp
from routes.interests_routes import interests_bp
from routes.personalization_routes import personalization_bp
# Initialize Flask app
app = Flask(__name__)
@@ -29,6 +31,8 @@ app.register_blueprint(tracking_bp)
app.register_blueprint(analytics_bp)
app.register_blueprint(admin_bp)
app.register_blueprint(transport_bp)
app.register_blueprint(interests_bp)
app.register_blueprint(personalization_bp)
# Health check endpoint
@app.route('/health')

View File

@@ -0,0 +1,239 @@
"""
User Interest Profile API routes for Munich News Daily.
Provides endpoints to view and manage user interest profiles.
"""
from flask import Blueprint, request, jsonify
from services.interest_profiling_service import (
get_user_interests,
get_top_interests,
build_interests_from_history,
decay_user_interests,
get_interest_statistics,
delete_user_interests
)
interests_bp = Blueprint('interests', __name__)
@interests_bp.route('/api/interests/<email>', methods=['GET'])
def get_interests(email):
"""
Get user interest profile.
Args:
email: Email address of the user
Returns:
JSON response with user interest profile
"""
try:
profile = get_user_interests(email)
if not profile:
return jsonify({
'success': False,
'error': 'User profile not found'
}), 404
# Remove MongoDB _id field
if '_id' in profile:
del profile['_id']
return jsonify({
'success': True,
'profile': profile
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500
@interests_bp.route('/api/interests/<email>/top', methods=['GET'])
def get_top_user_interests(email):
"""
Get user's top interests sorted by score.
Query parameters:
top_n: Number of top interests to return (default: 10)
Args:
email: Email address of the user
Returns:
JSON response with top categories and keywords
"""
try:
top_n = request.args.get('top_n', 10, type=int)
top_interests = get_top_interests(email, top_n)
return jsonify({
'success': True,
'email': email,
'top_categories': [
{'category': cat, 'score': score}
for cat, score in top_interests['top_categories']
],
'top_keywords': [
{'keyword': kw, 'score': score}
for kw, score in top_interests['top_keywords']
]
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500
@interests_bp.route('/api/interests/<email>/rebuild', methods=['POST'])
def rebuild_interests(email):
"""
Rebuild user interest profile from click history.
Request body (optional):
{
"days_lookback": 30 // Number of days of history to analyze
}
Args:
email: Email address of the user
Returns:
JSON response with rebuilt profile
"""
try:
data = request.get_json() or {}
days_lookback = data.get('days_lookback', 30)
# Validate days_lookback
if not isinstance(days_lookback, int) or days_lookback < 1:
return jsonify({
'success': False,
'error': 'days_lookback must be a positive integer'
}), 400
profile = build_interests_from_history(email, days_lookback)
# Remove MongoDB _id field
if '_id' in profile:
del profile['_id']
return jsonify({
'success': True,
'message': f'Profile rebuilt from {days_lookback} days of history',
'profile': profile
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500
@interests_bp.route('/api/interests/decay', methods=['POST'])
def decay_interests():
"""
Decay interest scores for inactive users.
Request body (optional):
{
"decay_factor": 0.95, // Multiplier for scores (default: 0.95)
"days_threshold": 7 // Only decay profiles older than N days
}
Returns:
JSON response with decay statistics
"""
try:
data = request.get_json() or {}
decay_factor = data.get('decay_factor', 0.95)
days_threshold = data.get('days_threshold', 7)
# Validate parameters
if not isinstance(decay_factor, (int, float)) or decay_factor <= 0 or decay_factor > 1:
return jsonify({
'success': False,
'error': 'decay_factor must be between 0 and 1'
}), 400
if not isinstance(days_threshold, int) or days_threshold < 1:
return jsonify({
'success': False,
'error': 'days_threshold must be a positive integer'
}), 400
result = decay_user_interests(decay_factor, days_threshold)
return jsonify({
'success': True,
'message': f'Decayed interests for profiles older than {days_threshold} days',
'statistics': result
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500
@interests_bp.route('/api/interests/statistics', methods=['GET'])
def get_statistics():
"""
Get statistics about user interests across all users.
Returns:
JSON response with interest statistics
"""
try:
stats = get_interest_statistics()
return jsonify({
'success': True,
'statistics': stats
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500
@interests_bp.route('/api/interests/<email>', methods=['DELETE'])
def delete_interests(email):
"""
Delete user interest profile (GDPR compliance).
Args:
email: Email address of the user
Returns:
JSON response with confirmation
"""
try:
deleted = delete_user_interests(email)
if not deleted:
return jsonify({
'success': False,
'error': 'User profile not found'
}), 404
return jsonify({
'success': True,
'message': f'Interest profile deleted for {email}'
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500

View File

@@ -0,0 +1,135 @@
"""
Personalization API routes for Munich News Daily.
Provides endpoints to test and preview personalized content.
"""
from flask import Blueprint, request, jsonify
from datetime import datetime, timedelta
from database import articles_collection
from services.personalization_service import (
rank_articles_for_user,
select_personalized_articles,
get_personalization_explanation,
get_personalization_stats
)
personalization_bp = Blueprint('personalization', __name__)
@personalization_bp.route('/api/personalize/preview/<email>', methods=['GET'])
def preview_personalized_newsletter(email):
"""
Preview personalized newsletter for a user.
Query parameters:
max_articles: Maximum articles to return (default: 10)
hours_lookback: Hours of articles to consider (default: 24)
Returns:
JSON with personalized article selection and statistics
"""
try:
max_articles = request.args.get('max_articles', 10, type=int)
hours_lookback = request.args.get('hours_lookback', 24, type=int)
# Get recent articles
cutoff_date = datetime.utcnow() - timedelta(hours=hours_lookback)
articles = list(articles_collection.find({
'created_at': {'$gte': cutoff_date},
'summary': {'$exists': True, '$ne': None}
}).sort('created_at', -1))
# Select personalized articles
personalized = select_personalized_articles(
articles,
email,
max_articles=max_articles
)
# Get statistics
stats = get_personalization_stats(personalized, email)
# Format response
articles_response = []
for article in personalized:
articles_response.append({
'title': article.get('title', ''),
'title_en': article.get('title_en'),
'summary': article.get('summary', ''),
'link': article.get('link', ''),
'category': article.get('category', 'general'),
'keywords': article.get('keywords', []),
'personalization_score': article.get('personalization_score', 0.0),
'published_at': article.get('published_at', '')
})
return jsonify({
'success': True,
'email': email,
'articles': articles_response,
'statistics': stats
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500
@personalization_bp.route('/api/personalize/explain', methods=['POST'])
def explain_recommendation():
"""
Explain why an article was recommended to a user.
Request body:
{
"email": "user@example.com",
"article_id": "article-id-here"
}
Returns:
JSON with explanation of recommendation
"""
try:
data = request.get_json()
if not data or 'email' not in data or 'article_id' not in data:
return jsonify({
'success': False,
'error': 'email and article_id required'
}), 400
email = data['email']
article_id = data['article_id']
# Get article
from bson import ObjectId
article = articles_collection.find_one({'_id': ObjectId(article_id)})
if not article:
return jsonify({
'success': False,
'error': 'Article not found'
}), 404
# Get user interests
from services.interest_profiling_service import get_user_interests
user_interests = get_user_interests(email)
# Generate explanation
explanation = get_personalization_explanation(article, user_interests)
return jsonify({
'success': True,
'email': email,
'article_title': article.get('title', ''),
'explanation': explanation
}), 200
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500

View File

@@ -79,8 +79,8 @@ def track_click(tracking_id):
"""
Track link clicks and redirect to original article URL.
Logs the click event and redirects the user to the original article URL.
Handles invalid tracking_id by redirecting to homepage.
Logs the click event, updates user interest profile, and redirects the user
to the original article URL. Handles invalid tracking_id by redirecting to homepage.
Ensures redirect completes within 200ms.
Args:
@@ -115,6 +115,19 @@ def track_click(tracking_id):
}
}
)
# Update user interest profile (Phase 3)
subscriber_email = tracking_record.get('subscriber_email')
keywords = tracking_record.get('keywords', [])
category = tracking_record.get('category', 'general')
if subscriber_email and subscriber_email != 'anonymized':
try:
from services.interest_profiling_service import update_user_interests
update_user_interests(subscriber_email, keywords, category)
except Exception as e:
# Don't fail the redirect if interest update fails
print(f"Error updating user interests: {str(e)}")
except Exception as e:
# Log error but still redirect
print(f"Error tracking click for {tracking_id}: {str(e)}")

View File

@@ -0,0 +1,323 @@
"""
User Interest Profiling Service for Munich News Daily.
Builds and maintains user interest profiles based on article click behavior.
"""
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from database import link_clicks_collection
from pymongo import MongoClient
from config import Config
# Connect to MongoDB
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
user_interests_collection = db['user_interests']
def update_user_interests(subscriber_email: str, keywords: List[str], category: str) -> Dict:
"""
Update user interest profile based on a clicked article.
Increments interest scores for the article's keywords and category.
Creates a new profile if the user doesn't have one yet.
Args:
subscriber_email: Email address of the user
keywords: List of keywords from the clicked article
category: Category of the clicked article
Returns:
dict: Updated user interest profile
"""
current_time = datetime.utcnow()
# Get existing profile or create new one
profile = user_interests_collection.find_one({'email': subscriber_email})
if not profile:
# Create new profile
profile = {
'email': subscriber_email,
'categories': {},
'keywords': {},
'total_clicks': 0,
'last_updated': current_time,
'created_at': current_time
}
# Update category interest (increment by 0.1, max 1.0)
current_category_score = profile['categories'].get(category, 0.0)
profile['categories'][category] = min(current_category_score + 0.1, 1.0)
# Update keyword interests (increment by 0.1, max 1.0)
for keyword in keywords:
if keyword: # Skip empty keywords
current_keyword_score = profile['keywords'].get(keyword, 0.0)
profile['keywords'][keyword] = min(current_keyword_score + 0.1, 1.0)
# Update metadata
profile['total_clicks'] = profile.get('total_clicks', 0) + 1
profile['last_updated'] = current_time
# Upsert profile
user_interests_collection.update_one(
{'email': subscriber_email},
{'$set': profile},
upsert=True
)
return profile
def get_user_interests(subscriber_email: str) -> Optional[Dict]:
"""
Get user interest profile.
Args:
subscriber_email: Email address of the user
Returns:
dict: User interest profile or None if not found
"""
return user_interests_collection.find_one({'email': subscriber_email})
def decay_user_interests(decay_factor: float = 0.95, days_threshold: int = 7) -> Dict[str, int]:
"""
Decay interest scores for users who haven't clicked recently.
Reduces interest scores over time to reflect changing interests.
Only decays profiles that haven't been updated in the last N days.
Args:
decay_factor: Multiplier for interest scores (default: 0.95 = 5% decay)
days_threshold: Only decay profiles older than this many days (default: 7)
Returns:
dict: Statistics about the decay operation
- profiles_decayed: Number of profiles that were decayed
- profiles_checked: Total number of profiles checked
"""
cutoff_date = datetime.utcnow() - timedelta(days=days_threshold)
# Find profiles that haven't been updated recently
old_profiles = user_interests_collection.find({
'last_updated': {'$lt': cutoff_date}
})
profiles_decayed = 0
profiles_checked = 0
for profile in old_profiles:
profiles_checked += 1
# Decay category scores
decayed_categories = {}
for category, score in profile.get('categories', {}).items():
new_score = score * decay_factor
# Remove categories with very low scores (< 0.05)
if new_score >= 0.05:
decayed_categories[category] = round(new_score, 3)
# Decay keyword scores
decayed_keywords = {}
for keyword, score in profile.get('keywords', {}).items():
new_score = score * decay_factor
# Remove keywords with very low scores (< 0.05)
if new_score >= 0.05:
decayed_keywords[keyword] = round(new_score, 3)
# Update profile with decayed scores
user_interests_collection.update_one(
{'email': profile['email']},
{
'$set': {
'categories': decayed_categories,
'keywords': decayed_keywords,
'last_decayed': datetime.utcnow()
}
}
)
profiles_decayed += 1
return {
'profiles_decayed': profiles_decayed,
'profiles_checked': profiles_checked
}
def get_top_interests(subscriber_email: str, top_n: int = 10) -> Dict[str, List[tuple]]:
"""
Get user's top interests sorted by score.
Args:
subscriber_email: Email address of the user
top_n: Number of top interests to return (default: 10)
Returns:
dict: Top interests containing:
- top_categories: List of (category, score) tuples
- top_keywords: List of (keyword, score) tuples
"""
profile = get_user_interests(subscriber_email)
if not profile:
return {
'top_categories': [],
'top_keywords': []
}
# Sort categories by score
categories = profile.get('categories', {})
top_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)[:top_n]
# Sort keywords by score
keywords = profile.get('keywords', {})
top_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:top_n]
return {
'top_categories': top_categories,
'top_keywords': top_keywords
}
def build_interests_from_history(subscriber_email: str, days_lookback: int = 30) -> Dict:
"""
Build or rebuild user interest profile from click history.
Useful for:
- Initializing profiles for existing users
- Rebuilding profiles after algorithm changes
- Backfilling data
Args:
subscriber_email: Email address of the user
days_lookback: Number of days of history to analyze (default: 30)
Returns:
dict: Newly built interest profile
"""
cutoff_date = datetime.utcnow() - timedelta(days=days_lookback)
# Get all clicks from this user in the lookback period
clicks = link_clicks_collection.find({
'subscriber_email': subscriber_email,
'clicked': True,
'clicked_at': {'$gte': cutoff_date}
})
# Initialize profile
profile = {
'email': subscriber_email,
'categories': {},
'keywords': {},
'total_clicks': 0,
'last_updated': datetime.utcnow(),
'created_at': datetime.utcnow()
}
# Process each click
for click in clicks:
category = click.get('category', 'general')
keywords = click.get('keywords', [])
# Update category score
profile['categories'][category] = profile['categories'].get(category, 0.0) + 0.1
# Update keyword scores
for keyword in keywords:
if keyword:
profile['keywords'][keyword] = profile['keywords'].get(keyword, 0.0) + 0.1
profile['total_clicks'] += 1
# Cap scores at 1.0
for category in profile['categories']:
profile['categories'][category] = min(profile['categories'][category], 1.0)
for keyword in profile['keywords']:
profile['keywords'][keyword] = min(profile['keywords'][keyword], 1.0)
# Save profile
if profile['total_clicks'] > 0:
user_interests_collection.update_one(
{'email': subscriber_email},
{'$set': profile},
upsert=True
)
return profile
def get_interest_statistics() -> Dict:
"""
Get statistics about user interests across all users.
Returns:
dict: Statistics containing:
- total_users: Total number of users with profiles
- avg_clicks_per_user: Average number of clicks per user
- most_popular_categories: Top categories across all users
- most_popular_keywords: Top keywords across all users
"""
total_users = user_interests_collection.count_documents({})
if total_users == 0:
return {
'total_users': 0,
'avg_clicks_per_user': 0,
'most_popular_categories': [],
'most_popular_keywords': []
}
# Calculate average clicks
pipeline = [
{
'$group': {
'_id': None,
'total_clicks': {'$sum': '$total_clicks'}
}
}
]
result = list(user_interests_collection.aggregate(pipeline))
total_clicks = result[0]['total_clicks'] if result else 0
avg_clicks = total_clicks / total_users if total_users > 0 else 0
# Get most popular categories
category_counts = {}
keyword_counts = {}
for profile in user_interests_collection.find({}):
for category, score in profile.get('categories', {}).items():
category_counts[category] = category_counts.get(category, 0) + score
for keyword, score in profile.get('keywords', {}).items():
keyword_counts[keyword] = keyword_counts.get(keyword, 0) + score
# Sort and get top 10
top_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:10]
top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10]
return {
'total_users': total_users,
'avg_clicks_per_user': round(avg_clicks, 2),
'most_popular_categories': top_categories,
'most_popular_keywords': top_keywords
}
def delete_user_interests(subscriber_email: str) -> bool:
"""
Delete user interest profile (for GDPR compliance).
Args:
subscriber_email: Email address of the user
Returns:
bool: True if profile was deleted, False if not found
"""
result = user_interests_collection.delete_one({'email': subscriber_email})
return result.deleted_count > 0

View File

@@ -0,0 +1,295 @@
"""
Newsletter Personalization Service for Munich News Daily.
Ranks and selects articles based on user interest profiles.
"""
from typing import Dict, List, Optional
from datetime import datetime, timedelta
from services.interest_profiling_service import get_user_interests
def calculate_article_score(
article: Dict,
user_interests: Optional[Dict],
category_weight: float = 0.4,
keyword_weight: float = 0.6
) -> float:
"""
Calculate personalization score for an article based on user interests.
Score is calculated as:
- Category match: 0-1.0 based on user's interest in the category
- Keyword match: Average of user's interest in article keywords
- Final score: (category_score * 0.4) + (keyword_score * 0.6)
Args:
article: Article dictionary with 'category' and 'keywords' fields
user_interests: User interest profile (None for non-personalized)
category_weight: Weight for category matching (default: 0.4)
keyword_weight: Weight for keyword matching (default: 0.6)
Returns:
float: Personalization score between 0.0 and 1.0
"""
# If no user interests, return neutral score
if not user_interests:
return 0.5
# Get article metadata
article_category = article.get('category', 'general')
article_keywords = article.get('keywords', [])
# Calculate category score
user_categories = user_interests.get('categories', {})
category_score = user_categories.get(article_category, 0.0)
# Calculate keyword score (average of all matching keywords)
user_keywords = user_interests.get('keywords', {})
keyword_scores = []
for keyword in article_keywords:
if keyword in user_keywords:
keyword_scores.append(user_keywords[keyword])
# Average keyword score (0.0 if no matches)
keyword_score = sum(keyword_scores) / len(keyword_scores) if keyword_scores else 0.0
# Weighted final score
final_score = (category_score * category_weight) + (keyword_score * keyword_weight)
return round(final_score, 3)
def rank_articles_for_user(
articles: List[Dict],
subscriber_email: str,
personalization_ratio: float = 0.7
) -> List[Dict]:
"""
Rank articles for a specific user based on their interests.
Mixes personalized content with trending content to avoid filter bubbles.
Args:
articles: List of article dictionaries
subscriber_email: Email address of the user
personalization_ratio: Ratio of personalized vs trending (default: 0.7 = 70% personalized)
Returns:
list: Articles sorted by personalization score with score added
"""
# Get user interests
user_interests = get_user_interests(subscriber_email)
# Calculate score for each article
scored_articles = []
for article in articles:
score = calculate_article_score(article, user_interests)
# Add score to article (don't modify original)
article_with_score = article.copy()
article_with_score['personalization_score'] = score
scored_articles.append(article_with_score)
# Sort by score (highest first)
scored_articles.sort(key=lambda x: x['personalization_score'], reverse=True)
return scored_articles
def select_personalized_articles(
articles: List[Dict],
subscriber_email: str,
max_articles: int = 10,
personalization_ratio: float = 0.7,
min_score_threshold: float = 0.1
) -> List[Dict]:
"""
Select and rank articles for a personalized newsletter.
Strategy:
- Top N * personalization_ratio articles: Highest scoring (personalized)
- Remaining articles: Most recent (trending/diverse content)
- Ensures mix of personalized + diverse content
Args:
articles: List of available articles
subscriber_email: Email address of the user
max_articles: Maximum number of articles to include (default: 10)
personalization_ratio: Ratio of personalized content (default: 0.7)
min_score_threshold: Minimum score to consider personalized (default: 0.1)
Returns:
list: Selected articles with personalization scores
"""
if not articles:
return []
# Rank all articles
ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)
# Calculate split
num_personalized = int(max_articles * personalization_ratio)
num_trending = max_articles - num_personalized
# Get personalized articles (high scoring)
personalized = [
a for a in ranked_articles
if a['personalization_score'] >= min_score_threshold
][:num_personalized]
# Get trending articles (most recent, not already selected)
personalized_ids = {a.get('_id') for a in personalized}
trending = [
a for a in ranked_articles
if a.get('_id') not in personalized_ids
][:num_trending]
# Combine: personalized first, then trending
selected = personalized + trending
# Ensure we don't exceed max_articles
return selected[:max_articles]
def get_personalization_explanation(
article: Dict,
user_interests: Optional[Dict]
) -> Dict[str, any]:
"""
Generate explanation for why an article was recommended.
Useful for transparency and debugging.
Args:
article: Article dictionary
user_interests: User interest profile
Returns:
dict: Explanation containing:
- score: Overall personalization score
- category_match: Category score
- keyword_matches: List of matching keywords with scores
- reason: Human-readable explanation
"""
if not user_interests:
return {
'score': 0.5,
'category_match': 0.0,
'keyword_matches': [],
'reason': 'No personalization data available'
}
article_category = article.get('category', 'general')
article_keywords = article.get('keywords', [])
user_categories = user_interests.get('categories', {})
user_keywords = user_interests.get('keywords', {})
# Category match
category_score = user_categories.get(article_category, 0.0)
# Keyword matches
keyword_matches = []
for keyword in article_keywords:
if keyword in user_keywords:
keyword_matches.append({
'keyword': keyword,
'score': user_keywords[keyword]
})
# Calculate overall score
overall_score = calculate_article_score(article, user_interests)
# Generate reason
if overall_score >= 0.5:
reason = f"High match with your interests in {article_category}"
if keyword_matches:
top_keywords = [m['keyword'] for m in keyword_matches[:2]]
reason += f" and topics like {', '.join(top_keywords)}"
elif overall_score >= 0.3:
reason = f"Moderate match with your interests"
else:
reason = "Trending article for diverse content"
return {
'score': overall_score,
'category_match': category_score,
'keyword_matches': keyword_matches,
'reason': reason
}
def get_personalization_stats(
selected_articles: List[Dict],
subscriber_email: str
) -> Dict[str, any]:
"""
Get statistics about personalization for a newsletter.
Args:
selected_articles: Articles selected for the newsletter
subscriber_email: Email address of the user
Returns:
dict: Statistics containing:
- total_articles: Number of articles
- avg_score: Average personalization score
- highly_personalized: Number of articles with score >= 0.5
- moderately_personalized: Number with score 0.3-0.5
- trending: Number with score < 0.3
"""
if not selected_articles:
return {
'total_articles': 0,
'avg_score': 0.0,
'highly_personalized': 0,
'moderately_personalized': 0,
'trending': 0
}
scores = [a.get('personalization_score', 0.0) for a in selected_articles]
avg_score = sum(scores) / len(scores)
highly_personalized = sum(1 for s in scores if s >= 0.5)
moderately_personalized = sum(1 for s in scores if 0.3 <= s < 0.5)
trending = sum(1 for s in scores if s < 0.3)
return {
'total_articles': len(selected_articles),
'avg_score': round(avg_score, 3),
'highly_personalized': highly_personalized,
'moderately_personalized': moderately_personalized,
'trending': trending
}
def batch_personalize_newsletters(
articles: List[Dict],
subscribers: List[str],
max_articles_per_user: int = 10
) -> Dict[str, List[Dict]]:
"""
Generate personalized article selections for multiple subscribers.
Useful for batch newsletter generation.
Args:
articles: List of available articles
subscribers: List of subscriber email addresses
max_articles_per_user: Max articles per newsletter (default: 10)
Returns:
dict: Mapping of email -> personalized article list
"""
personalized_newsletters = {}
for subscriber_email in subscribers:
personalized_articles = select_personalized_articles(
articles,
subscriber_email,
max_articles=max_articles_per_user
)
personalized_newsletters[subscriber_email] = personalized_articles
return personalized_newsletters

View File

@@ -80,6 +80,9 @@ def create_newsletter_tracking(
link_tracking_map = {}
if article_links:
# Import here to avoid circular dependency
from database import articles_collection
for article in article_links:
article_url = article.get('url')
article_title = article.get('title', '')
@@ -87,13 +90,22 @@ def create_newsletter_tracking(
if article_url:
link_tracking_id = generate_tracking_id()
# Create link click tracking record
# Look up article metadata from database for personalization
article_doc = articles_collection.find_one({'link': article_url})
article_id = str(article_doc['_id']) if article_doc else None
category = article_doc.get('category', 'general') if article_doc else 'general'
keywords = article_doc.get('keywords', []) if article_doc else []
# Create link click tracking record with metadata
link_click_doc = {
'tracking_id': link_tracking_id,
'newsletter_id': newsletter_id,
'subscriber_email': subscriber_email,
'article_url': article_url,
'article_title': article_title,
'article_id': article_id, # NEW: Article database ID
'category': category, # NEW: Article category
'keywords': keywords, # NEW: Article keywords for personalization
'clicked': False,
'clicked_at': None,
'user_agent': None,

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Comprehensive test suite for the personalization system.
Tests all 4 phases: keyword extraction, click tracking, interest profiling, and personalization.
"""
import sys
from pymongo import MongoClient
from datetime import datetime
# Import services
from services.tracking_service import create_newsletter_tracking
from services.interest_profiling_service import (
update_user_interests,
get_user_interests,
get_top_interests,
build_interests_from_history
)
from services.personalization_service import (
calculate_article_score,
rank_articles_for_user,
select_personalized_articles,
get_personalization_stats
)
from config import Config
# Connect to MongoDB
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
link_clicks_collection = db['link_clicks']
user_interests_collection = db['user_interests']
def test_phase1_keywords():
"""Phase 1: Verify articles have keywords extracted"""
print("\n" + "="*60)
print("Phase 1: Keyword Extraction")
print("="*60)
articles_with_keywords = articles_collection.count_documents({
'keywords': {'$exists': True, '$ne': []}
})
if articles_with_keywords == 0:
print("❌ No articles with keywords found")
print(" Run a crawl first to extract keywords")
return False
sample = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
print(f"✓ Found {articles_with_keywords} articles with keywords")
print(f" Sample: {sample.get('title', 'N/A')[:50]}...")
print(f" Keywords: {sample.get('keywords', [])[:3]}")
return True
def test_phase2_tracking():
"""Phase 2: Verify tracking includes keywords and metadata"""
print("\n" + "="*60)
print("Phase 2: Click Tracking Enhancement")
print("="*60)
test_email = 'test-phase2@example.com'
# Clean up
link_clicks_collection.delete_many({'subscriber_email': test_email})
# Get article with keywords
article = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
if not article:
print("❌ No articles found")
return False
# Create tracking
tracking_data = create_newsletter_tracking(
newsletter_id='test-phase2',
subscriber_email=test_email,
article_links=[{
'url': article['link'],
'title': article.get('title', '')
}]
)
# Verify tracking record
tracking_id = list(tracking_data['link_tracking_map'].values())[0]
tracking_record = link_clicks_collection.find_one({'tracking_id': tracking_id})
has_metadata = (
tracking_record.get('article_id') is not None and
tracking_record.get('category') is not None and
len(tracking_record.get('keywords', [])) > 0
)
# Clean up
link_clicks_collection.delete_many({'subscriber_email': test_email})
db['newsletter_sends'].delete_many({'subscriber_email': test_email})
if has_metadata:
print(f"✓ Tracking records include metadata")
print(f" Article ID: {tracking_record.get('article_id')}")
print(f" Category: {tracking_record.get('category')}")
print(f" Keywords: {len(tracking_record.get('keywords', []))} keywords")
return True
else:
print("❌ Tracking records missing metadata")
return False
def test_phase3_profiling():
"""Phase 3: Verify interest profiles are built from clicks"""
print("\n" + "="*60)
print("Phase 3: User Interest Profiling")
print("="*60)
test_email = 'test-phase3@example.com'
# Clean up
user_interests_collection.delete_many({'email': test_email})
# Create profile
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
update_user_interests(test_email, ['Transportation', 'Munich'], 'local')
# Verify profile
profile = get_user_interests(test_email)
# Clean up
user_interests_collection.delete_many({'email': test_email})
if profile and profile['total_clicks'] == 2:
print(f"✓ Interest profile created")
print(f" Total clicks: {profile['total_clicks']}")
print(f" Categories: {len(profile.get('categories', {}))}")
print(f" Keywords: {len(profile.get('keywords', {}))}")
return True
else:
print("❌ Interest profile not created correctly")
return False
def test_phase4_personalization():
"""Phase 4: Verify articles are ranked by user interests"""
print("\n" + "="*60)
print("Phase 4: Personalized Newsletter Generation")
print("="*60)
test_email = 'test-phase4@example.com'
# Clean up
user_interests_collection.delete_many({'email': test_email})
# Get articles
articles = list(articles_collection.find(
{'keywords': {'$exists': True, '$ne': []}},
limit=5
))
if len(articles) < 3:
print("❌ Not enough articles found")
return False
# Create profile
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
# Rank articles
ranked = rank_articles_for_user(articles, test_email)
# Select personalized
selected = select_personalized_articles(articles, test_email, max_articles=3)
# Clean up
user_interests_collection.delete_many({'email': test_email})
has_scores = all('personalization_score' in a for a in selected)
if has_scores and len(selected) > 0:
print(f"✓ Articles ranked and selected")
print(f" Total ranked: {len(ranked)}")
print(f" Selected: {len(selected)}")
print(f" Top score: {selected[0].get('personalization_score', 0):.3f}")
return True
else:
print("❌ Personalization failed")
return False
def main():
"""Run all personalization tests"""
print("\n" + "="*60)
print("PERSONALIZATION SYSTEM TEST SUITE")
print("="*60)
results = {
'Phase 1: Keyword Extraction': test_phase1_keywords(),
'Phase 2: Click Tracking': test_phase2_tracking(),
'Phase 3: Interest Profiling': test_phase3_profiling(),
'Phase 4: Personalization': test_phase4_personalization()
}
print("\n" + "="*60)
print("TEST RESULTS")
print("="*60)
for phase, passed in results.items():
status = "✅ PASS" if passed else "❌ FAIL"
print(f"{status} - {phase}")
all_passed = all(results.values())
if all_passed:
print("\n🎉 All personalization tests PASSED!")
return 0
else:
print("\n❌ Some tests FAILED")
return 1
if __name__ == '__main__':
sys.exit(main())

225
docker-compose.local.yml Normal file
View File

@@ -0,0 +1,225 @@
services:
# Ollama AI Service (Exposed for local testing)
ollama:
image: ollama/ollama:latest
container_name: munich-news-local-ollama
restart: unless-stopped
ports:
- "11434:11434" # Exposed for local testing
volumes:
- ollama_data_local:/root/.ollama
networks:
- munich-news-network
dns:
- 8.8.8.8
- 1.1.1.1
# GPU support (uncomment if you have NVIDIA GPU)
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: all
# capabilities: [gpu]
healthcheck:
test: ["CMD-SHELL", "ollama list || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
# Ollama Model Loader - Pulls phi3:latest (smaller model for local dev)
ollama-setup:
image: curlimages/curl:latest
container_name: munich-news-local-ollama-setup
depends_on:
ollama:
condition: service_healthy
networks:
- munich-news-network
env_file:
- backend/.env.local
volumes:
- ./scripts/setup-ollama-model.sh:/setup-ollama-model.sh:ro
dns:
- 8.8.8.8
- 1.1.1.1
command: sh /setup-ollama-model.sh
restart: on-failure
# Redis - Message queue for async tasks (Internal only - not exposed to host)
redis:
image: redis:7-alpine
container_name: munich-news-local-redis
restart: unless-stopped
# No ports exposed - only accessible within Docker network
networks:
- munich-news-network
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 30s
timeout: 10s
retries: 3
# MongoDB Database (Exposed for local debugging)
mongodb:
image: mongo:latest
container_name: munich-news-local-mongodb
restart: unless-stopped
ports:
- "27017:27017" # Exposed for local debugging
environment:
# For production, set MONGO_PASSWORD environment variable
MONGO_INITDB_ROOT_USERNAME: ${MONGO_USERNAME:-admin}
MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme}
MONGO_INITDB_DATABASE: munich_news
volumes:
- mongodb_data_local:/data/db
- mongodb_config_local:/data/configdb
networks:
- munich-news-network
command: mongod --bind_ip_all ${MONGO_AUTH:---auth}
healthcheck:
test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
interval: 30s
timeout: 10s
retries: 3
# News Crawler - Runs at 6 AM Berlin time
crawler:
build:
context: .
dockerfile: news_crawler/Dockerfile
container_name: munich-news-local-crawler
restart: unless-stopped
depends_on:
- mongodb
- ollama
- redis
environment:
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
- REDIS_URL=redis://redis:6379
- TZ=Europe/Berlin
volumes:
- ./backend/.env.local:/app/.env:ro
networks:
- munich-news-network
healthcheck:
test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
interval: 1m
timeout: 10s
retries: 3
# Backend API - Tracking and analytics
backend:
build:
context: ./backend
dockerfile: Dockerfile
container_name: munich-news-local-backend
restart: unless-stopped
depends_on:
- mongodb
- redis
ports:
- "5001:5001"
environment:
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
- REDIS_URL=redis://redis:6379
- FLASK_PORT=5001
- TZ=Europe/Berlin
volumes:
- ./backend/.env.local:/app/.env:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- munich-news-network
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
# Transport Crawler - API service for MVG disruptions (Internal only - not exposed to host)
transport-crawler:
build:
context: ./transport_crawler
dockerfile: Dockerfile
container_name: munich-news-local-transport-crawler
restart: unless-stopped
depends_on:
- mongodb
- redis
# No ports exposed - only accessible within Docker network
environment:
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
- REDIS_URL=redis://redis:6379
- TZ=Europe/Berlin
volumes:
- ./backend/.env.local:/app/.env:ro
networks:
- munich-news-network
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
# Newsletter Sender - Runs at 7 AM Berlin time
sender:
build:
context: .
dockerfile: news_sender/Dockerfile
container_name: munich-news-local-sender
restart: unless-stopped
depends_on:
- mongodb
- backend
- crawler
- transport-crawler
environment:
- MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
- TZ=Europe/Berlin
volumes:
- ./backend/.env.local:/app/.env:ro
networks:
- munich-news-network
healthcheck:
test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
interval: 1m
timeout: 10s
retries: 3
# Frontend Web Interface
frontend:
build: ./frontend
container_name: munich-news-local-frontend
restart: unless-stopped
# ports:
# - "3000:3000"
ports:
- "3000:3000"
environment:
- API_URL=http://backend:5001
- PORT=3000
depends_on:
- backend
networks:
- munich-news-network
healthcheck:
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000"]
interval: 30s
timeout: 10s
retries: 3
volumes:
mongodb_data_local:
driver: local
mongodb_config_local:
driver: local
ollama_data_local:
driver: local
networks:
munich-news-network:
internal: false

167
docs/LOCAL_DEVELOPMENT.md Normal file
View File

@@ -0,0 +1,167 @@
# Local Development Setup
This guide helps you run Munich News Daily locally for development and testing.
## Quick Start
```bash
# 1. Copy local environment files
cp .env.local .env
cp backend/.env.local backend/.env
# 2. Start services with local configuration
docker-compose -f docker-compose.local.yml up -d
# 3. Check logs
docker-compose -f docker-compose.local.yml logs -f
# 4. Access services
# - Frontend: http://localhost:3000
# - Backend API: http://localhost:5001
# - MongoDB: localhost:27017
# - Ollama: http://localhost:11434
```
## Differences from Production
| Feature | Production | Local Development |
|---------|-----------|-------------------|
| Ollama Model | `gemma3:12b` (large) | `phi3:latest` (small, fast) |
| MongoDB Port | Internal only | Exposed on 27017 |
| Ollama Port | Internal only | Exposed on 11434 |
| Container Names | `munich-news-*` | `munich-news-local-*` |
| Volumes | `*_data` | `*_data_local` |
| Email | Production SMTP | Test/disabled |
## Useful Commands
### Start/Stop Services
```bash
# Start all services
docker-compose -f docker-compose.local.yml up -d
# Stop all services
docker-compose -f docker-compose.local.yml down
# Restart a specific service
docker-compose -f docker-compose.local.yml restart backend
# View logs
docker-compose -f docker-compose.local.yml logs -f crawler
```
### Testing
```bash
# Trigger a news crawl (2 articles for quick testing)
curl -X POST http://localhost:5001/api/admin/trigger-crawl \
-H "Content-Type: application/json" \
-d '{"max_articles": 2}'
# Trigger transport crawl
curl -X POST http://localhost:5001/api/transport/crawl
# Check articles in MongoDB
docker exec munich-news-local-mongodb mongosh munich_news \
--eval "db.articles.find({}, {title: 1, keywords: 1, category: 1}).limit(3)"
# Check transport disruptions
curl http://localhost:5001/api/transport/disruptions
```
### Database Access
```bash
# Connect to MongoDB
docker exec -it munich-news-local-mongodb mongosh munich_news
# Or from host (if you have mongosh installed)
mongosh "mongodb://admin:local123@localhost:27017/munich_news"
# Useful queries
db.articles.countDocuments()
db.articles.find({keywords: {$exists: true}}).limit(5)
db.subscribers.find()
db.transport_alerts.find()
```
### Ollama Testing
```bash
# List models
curl http://localhost:11434/api/tags
# Test generation
curl http://localhost:11434/api/generate -d '{
"model": "phi3:latest",
"prompt": "Summarize: Munich opens new U-Bahn line",
"stream": false
}'
```
## Cleanup
```bash
# Stop and remove containers
docker-compose -f docker-compose.local.yml down
# Remove volumes (WARNING: deletes all data)
docker-compose -f docker-compose.local.yml down -v
# Remove local volumes specifically
docker volume rm munich-news_mongodb_data_local
docker volume rm munich-news_mongodb_config_local
docker volume rm munich-news_ollama_data_local
```
## Switching Between Local and Production
```bash
# Switch to local
cp .env.local .env
cp backend/.env.local backend/.env
docker-compose -f docker-compose.local.yml up -d
# Switch to production
cp .env.production .env # (if you have one)
cp backend/.env.production backend/.env
docker-compose up -d
```
## Troubleshooting
### Ollama model not downloading
```bash
# Pull model manually
docker exec munich-news-local-ollama ollama pull phi3:latest
```
### MongoDB connection refused
```bash
# Check if MongoDB is running
docker-compose -f docker-compose.local.yml ps mongodb
# Check logs
docker-compose -f docker-compose.local.yml logs mongodb
```
### Port already in use
```bash
# Check what's using the port
lsof -i :5001 # or :3000, :27017, etc.
# Stop the conflicting service or change port in docker-compose.local.yml
```
## Tips
1. **Use phi3 for speed** - It's much faster than gemma3 for local testing
2. **Limit articles** - Use `max_articles: 2` for quick crawl tests
3. **Watch logs** - Keep logs open to see what's happening
4. **Separate volumes** - Local and production use different volumes, so they don't interfere
## Next Steps
- See `docs/PERSONALIZATION.md` for personalization feature development
- See `docs/OLLAMA_SETUP.md` for AI configuration
- See main `README.md` for general documentation

217
docs/PERSONALIZATION.md Normal file
View File

@@ -0,0 +1,217 @@
# Newsletter Personalization Implementation
## Overview
Personalized newsletters based on user click behavior, using keywords and categories to build interest profiles.
## Implementation Phases
### ✅ Phase 1: Keyword Extraction (COMPLETED)
**Status:** Implemented
**Files Modified:**
- `news_crawler/ollama_client.py` - Added `extract_keywords()` method
- `news_crawler/crawler_service.py` - Integrated keyword extraction into crawl process
**What it does:**
- Extracts 5 keywords from each article using Ollama AI
- Keywords stored in `articles` collection: `keywords: ["Bayern Munich", "Football", ...]`
- Runs automatically during news crawling
**Test it:**
```bash
# Trigger a crawl
curl -X POST http://localhost:5001/api/admin/trigger-crawl -d '{"max_articles": 2}'
# Check articles have keywords
docker exec munich-news-mongodb mongosh munich_news --eval "db.articles.findOne({}, {title: 1, keywords: 1})"
```
---
### ✅ Phase 2: Click Tracking Enhancement (COMPLETED)
**Status:** Implemented
**Goal:** Track clicks with keyword metadata
**Files Modified:**
- `backend/services/tracking_service.py` - Enhanced `create_newsletter_tracking()` to look up article metadata
**What it does:**
- When creating tracking links, looks up article from database
- Stores article ID, category, and keywords in tracking record
- Enables building user interest profiles from click behavior
**Database Schema:**
```javascript
// link_clicks collection
{
tracking_id: "uuid",
newsletter_id: "2024-11-18",
subscriber_email: "user@example.com",
article_url: "https://...",
article_title: "Article Title",
article_id: "673abc123...", // NEW: Article database ID
category: "sports", // NEW: Article category
keywords: ["Bayern Munich", "Bundesliga"], // NEW: Keywords for personalization
clicked: false,
clicked_at: null,
user_agent: null,
created_at: ISODate()
}
```
**Test it:**
```bash
# Send a test newsletter
curl -X POST http://localhost:5001/api/admin/send-newsletter
# Check tracking records have keywords
docker exec munich-news-mongodb mongosh munich_news --eval "db.link_clicks.findOne({}, {article_title: 1, keywords: 1, category: 1})"
```
---
### ✅ Phase 3: User Interest Profiling (COMPLETED)
**Status:** Implemented
**Goal:** Build user interest profiles from click history
**Files Created:**
- `backend/services/interest_profiling_service.py` - Core profiling logic
- `backend/routes/interests_routes.py` - API endpoints for interest management
**Files Modified:**
- `backend/routes/tracking_routes.py` - Auto-update interests on click
- `backend/app.py` - Register interests routes
**What it does:**
- Automatically builds interest profiles when users click articles
- Tracks interest scores for categories and keywords (0.0 to 1.0)
- Increments scores by 0.1 per click, capped at 1.0
- Provides decay mechanism for old interests
- Supports rebuilding profiles from click history
**Database Schema:**
```javascript
// user_interests collection
{
email: "user@example.com",
categories: {
sports: 0.8,
local: 0.5,
science: 0.2
},
keywords: {
"Bayern Munich": 0.9,
"Oktoberfest": 0.7,
"AI": 0.3
},
total_clicks: 15,
last_updated: ISODate(),
created_at: ISODate()
}
```
**API Endpoints:**
```bash
# Get user interests
GET /api/interests/<email>
# Get top interests
GET /api/interests/<email>/top?top_n=10
# Rebuild from history
POST /api/interests/<email>/rebuild
Body: {"days_lookback": 30}
# Decay old interests
POST /api/interests/decay
Body: {"decay_factor": 0.95, "days_threshold": 7}
# Get statistics
GET /api/interests/statistics
# Delete profile (GDPR)
DELETE /api/interests/<email>
```
**Test it:**
```bash
# Run test script
docker exec munich-news-local-backend python test_interest_profiling.py
# View a user's interests
curl http://localhost:5001/api/interests/user@example.com
# Get statistics
curl http://localhost:5001/api/interests/statistics
```
---
### ✅ Phase 4: Personalized Newsletter (COMPLETED)
**Status:** Implemented
**Goal:** Rank and select articles based on user interests
**Files Created:**
- `backend/services/personalization_service.py` - Core personalization logic
- `backend/routes/personalization_routes.py` - API endpoints for testing
**Files Modified:**
- `backend/app.py` - Register personalization routes
**What it does:**
- Scores articles based on user's category and keyword interests
- Ranks articles by personalization score (0.0 to 1.0)
- Selects mix of personalized (70%) + trending (30%) content
- Provides explanations for recommendations
**Algorithm:**
```python
score = (category_match * 0.4) + (keyword_match * 0.6)
# Example:
# User interests: sports=0.8, "Bayern Munich"=0.9
# Article: sports category, keywords=["Bayern Munich", "Football"]
# Score = (0.8 * 0.4) + (0.9 * 0.6) = 0.32 + 0.54 = 0.86
```
**API Endpoints:**
```bash
# Preview personalized newsletter
GET /api/personalize/preview/<email>?max_articles=10&hours_lookback=24
# Explain recommendation
POST /api/personalize/explain
Body: {"email": "user@example.com", "article_id": "..."}
```
**Test it:**
```bash
# Run test script
docker exec munich-news-local-backend python test_personalization.py
# Preview personalized newsletter
curl "http://localhost:5001/api/personalize/preview/demo@example.com?max_articles=5"
```
---
## ✅ All Phases Complete!
1. ~~**Phase 1:** Keyword extraction from articles~~ ✅ DONE
2. ~~**Phase 2:** Click tracking with keywords~~ ✅ DONE
3. ~~**Phase 3:** User interest profiling~~ ✅ DONE
4. ~~**Phase 4:** Personalized newsletter generation~~ ✅ DONE
## Next Steps for Production
1. **Integrate with newsletter sender** - Modify `news_sender/sender_service.py` to use personalization
2. **A/B testing** - Compare personalized vs non-personalized engagement
3. **Tune parameters** - Adjust personalization_ratio, weights, decay rates
4. **Monitor metrics** - Track click-through rates, open rates by personalization score
5. **User controls** - Add UI for users to view/edit their interests
## Configuration
No configuration needed yet. Keyword extraction uses existing Ollama settings from `backend/.env`:
- `OLLAMA_ENABLED=true`
- `OLLAMA_MODEL=gemma3:12b`
- `OLLAMA_BASE_URL=http://ollama:11434`

View File

@@ -0,0 +1,195 @@
# 🎉 Newsletter Personalization System - Complete!
All 4 phases of the personalization system have been successfully implemented and tested.
## ✅ What Was Built
### Phase 1: Keyword Extraction
- AI-powered keyword extraction from articles using Ollama
- 5 keywords per article automatically extracted during crawling
- Keywords stored in database for personalization
### Phase 2: Click Tracking Enhancement
- Enhanced tracking to capture article keywords and category
- Tracking records now include metadata for building interest profiles
- Privacy-compliant with opt-out and GDPR support
### Phase 3: User Interest Profiling
- Automatic profile building from click behavior
- Interest scores (0.0-1.0) for categories and keywords
- Decay mechanism for old interests
- API endpoints for viewing and managing profiles
### Phase 4: Personalized Newsletter Generation
- Article scoring based on user interests
- Smart ranking algorithm (40% category + 60% keywords)
- Mix of personalized (70%) + trending (30%) content
- Explanation system for recommendations
## 📊 How It Works
```
1. User clicks article in newsletter
2. System records: keywords + category
3. Interest profile updates automatically
4. Next newsletter: articles ranked by interests
5. User receives personalized content
```
## 🧪 Testing
All phases have been tested and verified:
```bash
# Run comprehensive test suite (tests all 4 phases)
docker exec munich-news-local-backend python test_personalization_system.py
# Or test keyword extraction separately
docker exec munich-news-local-crawler python -c "from crawler_service import crawl_all_feeds; crawl_all_feeds(max_articles_per_feed=2)"
```
## 🔌 API Endpoints
### Interest Management
```bash
GET /api/interests/<email> # View profile
GET /api/interests/<email>/top # Top interests
POST /api/interests/<email>/rebuild # Rebuild from history
GET /api/interests/statistics # Platform stats
DELETE /api/interests/<email> # Delete (GDPR)
```
### Personalization
```bash
GET /api/personalize/preview/<email> # Preview personalized newsletter
POST /api/personalize/explain # Explain recommendation
```
## 📈 Example Results
### User Profile
```json
{
"email": "user@example.com",
"categories": {
"sports": 0.30,
"local": 0.10
},
"keywords": {
"Bayern Munich": 0.30,
"Football": 0.20,
"Transportation": 0.10
},
"total_clicks": 5
}
```
### Personalized Newsletter
```json
{
"articles": [
{
"title": "Bayern Munich wins championship",
"personalization_score": 0.86,
"category": "sports",
"keywords": ["Bayern Munich", "Football"]
},
{
"title": "New S-Bahn line opens",
"personalization_score": 0.42,
"category": "local",
"keywords": ["Transportation", "Munich"]
}
],
"statistics": {
"highly_personalized": 1,
"moderately_personalized": 1,
"trending": 0
}
}
```
## 🎯 Scoring Algorithm
```python
# Article score calculation
category_score = user_interests.categories[article.category]
keyword_score = average(user_interests.keywords[kw] for kw in article.keywords)
final_score = (category_score * 0.4) + (keyword_score * 0.6)
```
**Example:**
- User: sports=0.8, "Bayern Munich"=0.9
- Article: sports category, keywords=["Bayern Munich", "Football"]
- Score = (0.8 × 0.4) + (0.9 × 0.6) = 0.32 + 0.54 = **0.86**
## 🚀 Production Integration
To integrate with the newsletter sender:
1. **Modify `news_sender/sender_service.py`:**
```python
from services.personalization_service import select_personalized_articles
# For each subscriber
personalized_articles = select_personalized_articles(
all_articles,
subscriber_email,
max_articles=10
)
```
2. **Enable personalization flag in config:**
```env
PERSONALIZATION_ENABLED=true
PERSONALIZATION_RATIO=0.7 # 70% personalized, 30% trending
```
3. **Monitor metrics:**
- Click-through rate by personalization score
- Open rates for personalized vs non-personalized
- User engagement over time
## 🔐 Privacy & Compliance
- ✅ Users can opt out of tracking
- ✅ Interest profiles can be deleted (GDPR)
- ✅ Automatic anonymization after 90 days
- ✅ No PII beyond email address
- ✅ Transparent recommendation explanations
## 📁 Files Created/Modified
### New Files
- `backend/services/interest_profiling_service.py`
- `backend/services/personalization_service.py`
- `backend/routes/interests_routes.py`
- `backend/routes/personalization_routes.py`
- `backend/test_tracking_phase2.py`
- `backend/test_interest_profiling.py`
- `backend/test_personalization.py`
- `docs/PERSONALIZATION.md`
### Modified Files
- `news_crawler/ollama_client.py` - Added keyword extraction
- `news_crawler/crawler_service.py` - Integrated keyword extraction
- `backend/services/tracking_service.py` - Enhanced with metadata
- `backend/routes/tracking_routes.py` - Auto-update interests
- `backend/app.py` - Registered new routes
## 🎓 Key Learnings
1. **Incremental scoring works well** - 0.1 per click prevents over-weighting
2. **Mix is important** - 70/30 personalized/trending avoids filter bubbles
3. **Keywords > Categories** - 60/40 weight reflects keyword importance
4. **Decay is essential** - Prevents stale interests from dominating
5. **Transparency matters** - Explanation API helps users understand recommendations
## 🎉 Status: COMPLETE
All 4 phases implemented, tested, and documented. The personalization system is ready for production integration!

View File

@@ -388,6 +388,21 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
print(f" ⚠ Summarization failed: {summary_result['error']}")
failed_summaries += 1
# Extract keywords for personalization
keywords_result = None
if Config.OLLAMA_ENABLED and summary_result and summary_result['success']:
print(f" 🔑 Extracting keywords...")
keywords_result = ollama_client.extract_keywords(
original_title,
summary_result['summary'],
max_keywords=5
)
if keywords_result['success']:
print(f" ✓ Keywords: {', '.join(keywords_result['keywords'])} ({keywords_result['duration']:.1f}s)")
else:
print(f" ⚠ Keyword extraction failed: {keywords_result['error']}")
# Prepare document
article_doc = {
'title': original_title,
@@ -396,6 +411,7 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
'link': article_url,
'content': article_data.get('content', ''), # Full article content
'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
'keywords': keywords_result['keywords'] if keywords_result and keywords_result['success'] else [],
'word_count': article_data.get('word_count', 0),
'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
'source': feed_name,

View File

@@ -508,6 +508,110 @@ New York Times-style summary (max {max_words} words):"""
'error': str(e),
'duration': time.time() - start_time
}
def extract_keywords(self, title, summary, max_keywords=5):
"""
Extract keywords/topics from article for personalization
Args:
title: Article title
summary: Article summary
max_keywords: Maximum number of keywords to extract (default 5)
Returns:
{
'keywords': list, # List of extracted keywords
'success': bool, # Whether extraction succeeded
'error': str or None, # Error message if failed
'duration': float # Time taken in seconds
}
"""
if not self.enabled:
return {
'keywords': [],
'success': False,
'error': 'Ollama is disabled',
'duration': 0
}
start_time = time.time()
try:
# Construct prompt for keyword extraction
prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.
Title: {title}
Summary: {summary}
Return ONLY the keywords separated by commas, nothing else. Focus on:
- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
- Locations (e.g., "Marienplatz", "Airport")
- Events or themes (e.g., "Transportation", "Housing", "Technology")
Keywords:"""
# Prepare request
url = f"{self.base_url}/api/generate"
headers = {'Content-Type': 'application/json'}
if self.api_key:
headers['Authorization'] = f'Bearer {self.api_key}'
payload = {
'model': self.model,
'prompt': prompt,
'stream': False,
'options': {
'temperature': 0.3, # Lower temperature for consistent extraction
'num_predict': 100 # Limit response length
}
}
# Make request
response = requests.post(
url,
json=payload,
headers=headers,
timeout=self.timeout
)
response.raise_for_status()
# Parse response
result = response.json()
keywords_text = result.get('response', '').strip()
if not keywords_text:
return {
'keywords': [],
'success': False,
'error': 'Ollama returned empty response',
'duration': time.time() - start_time
}
# Parse keywords from response
keywords = [k.strip() for k in keywords_text.split(',')]
keywords = [k for k in keywords if k and len(k) > 2][:max_keywords]
return {
'keywords': keywords,
'success': True,
'error': None,
'duration': time.time() - start_time
}
except requests.exceptions.Timeout:
return {
'keywords': [],
'success': False,
'error': f"Request timed out after {self.timeout}s",
'duration': time.time() - start_time
}
except Exception as e:
return {
'keywords': [],
'success': False,
'error': str(e),
'duration': time.time() - start_time
}
if __name__ == '__main__':

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Comprehensive test suite for the personalization system.
Tests all 4 phases: keyword extraction, click tracking, interest profiling, and personalization.
"""
import sys
from pymongo import MongoClient
from datetime import datetime
# Import services
from services.tracking_service import create_newsletter_tracking
from services.interest_profiling_service import (
update_user_interests,
get_user_interests,
get_top_interests,
build_interests_from_history
)
from services.personalization_service import (
calculate_article_score,
rank_articles_for_user,
select_personalized_articles,
get_personalization_stats
)
from config import Config
# Connect to MongoDB
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
link_clicks_collection = db['link_clicks']
user_interests_collection = db['user_interests']
def test_phase1_keywords():
"""Phase 1: Verify articles have keywords extracted"""
print("\n" + "="*60)
print("Phase 1: Keyword Extraction")
print("="*60)
articles_with_keywords = articles_collection.count_documents({
'keywords': {'$exists': True, '$ne': []}
})
if articles_with_keywords == 0:
print("❌ No articles with keywords found")
print(" Run a crawl first to extract keywords")
return False
sample = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
print(f"✓ Found {articles_with_keywords} articles with keywords")
print(f" Sample: {sample.get('title', 'N/A')[:50]}...")
print(f" Keywords: {sample.get('keywords', [])[:3]}")
return True
def test_phase2_tracking():
"""Phase 2: Verify tracking includes keywords and metadata"""
print("\n" + "="*60)
print("Phase 2: Click Tracking Enhancement")
print("="*60)
test_email = 'test-phase2@example.com'
# Clean up
link_clicks_collection.delete_many({'subscriber_email': test_email})
# Get article with keywords
article = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
if not article:
print("❌ No articles found")
return False
# Create tracking
tracking_data = create_newsletter_tracking(
newsletter_id='test-phase2',
subscriber_email=test_email,
article_links=[{
'url': article['link'],
'title': article.get('title', '')
}]
)
# Verify tracking record
tracking_id = list(tracking_data['link_tracking_map'].values())[0]
tracking_record = link_clicks_collection.find_one({'tracking_id': tracking_id})
has_metadata = (
tracking_record.get('article_id') is not None and
tracking_record.get('category') is not None and
len(tracking_record.get('keywords', [])) > 0
)
# Clean up
link_clicks_collection.delete_many({'subscriber_email': test_email})
db['newsletter_sends'].delete_many({'subscriber_email': test_email})
if has_metadata:
print(f"✓ Tracking records include metadata")
print(f" Article ID: {tracking_record.get('article_id')}")
print(f" Category: {tracking_record.get('category')}")
print(f" Keywords: {len(tracking_record.get('keywords', []))} keywords")
return True
else:
print("❌ Tracking records missing metadata")
return False
def test_phase3_profiling():
"""Phase 3: Verify interest profiles are built from clicks"""
print("\n" + "="*60)
print("Phase 3: User Interest Profiling")
print("="*60)
test_email = 'test-phase3@example.com'
# Clean up
user_interests_collection.delete_many({'email': test_email})
# Create profile
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
update_user_interests(test_email, ['Transportation', 'Munich'], 'local')
# Verify profile
profile = get_user_interests(test_email)
# Clean up
user_interests_collection.delete_many({'email': test_email})
if profile and profile['total_clicks'] == 2:
print(f"✓ Interest profile created")
print(f" Total clicks: {profile['total_clicks']}")
print(f" Categories: {len(profile.get('categories', {}))}")
print(f" Keywords: {len(profile.get('keywords', {}))}")
return True
else:
print("❌ Interest profile not created correctly")
return False
def test_phase4_personalization():
"""Phase 4: Verify articles are ranked by user interests"""
print("\n" + "="*60)
print("Phase 4: Personalized Newsletter Generation")
print("="*60)
test_email = 'test-phase4@example.com'
# Clean up
user_interests_collection.delete_many({'email': test_email})
# Get articles
articles = list(articles_collection.find(
{'keywords': {'$exists': True, '$ne': []}},
limit=5
))
if len(articles) < 3:
print("❌ Not enough articles found")
return False
# Create profile
update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
# Rank articles
ranked = rank_articles_for_user(articles, test_email)
# Select personalized
selected = select_personalized_articles(articles, test_email, max_articles=3)
# Clean up
user_interests_collection.delete_many({'email': test_email})
has_scores = all('personalization_score' in a for a in selected)
if has_scores and len(selected) > 0:
print(f"✓ Articles ranked and selected")
print(f" Total ranked: {len(ranked)}")
print(f" Selected: {len(selected)}")
print(f" Top score: {selected[0].get('personalization_score', 0):.3f}")
return True
else:
print("❌ Personalization failed")
return False
def main():
"""Run all personalization tests"""
print("\n" + "="*60)
print("PERSONALIZATION SYSTEM TEST SUITE")
print("="*60)
results = {
'Phase 1: Keyword Extraction': test_phase1_keywords(),
'Phase 2: Click Tracking': test_phase2_tracking(),
'Phase 3: Interest Profiling': test_phase3_profiling(),
'Phase 4: Personalization': test_phase4_personalization()
}
print("\n" + "="*60)
print("TEST RESULTS")
print("="*60)
for phase, passed in results.items():
status = "✅ PASS" if passed else "❌ FAIL"
print(f"{status} - {phase}")
all_passed = all(results.values())
if all_passed:
print("\n🎉 All personalization tests PASSED!")
return 0
else:
print("\n❌ Some tests FAILED")
return 1
if __name__ == '__main__':
sys.exit(main())