Compare commits

15 Commits · news...57f37c8dc0

| SHA1 |
|------|
| 57f37c8dc0 |
| 19fabacf5a |
| 1656f19077 |
| 1c0926a7f0 |
| ab47e518fa |
| e52042d5a8 |
| 2dc6bf44f8 |
| 4415e895e2 |
| 2034d96c9e |
| 5e9820136f |
| 84fce9a82c |
| 2e80d64ff6 |
| 467886a5dd |
| 2c1d7fa81a |
| c4b0e800e0 |
.env.local (new file, 9 lines)

```diff
@@ -0,0 +1,9 @@
+# Munich News Daily - Local Development Environment Variables
+
+# MongoDB Configuration
+MONGO_USERNAME=admin
+MONGO_PASSWORD=local123
+MONGO_AUTH=--auth
+
+# Ollama Model (use smaller/faster model for local dev)
+OLLAMA_MODEL=phi3:latest
```
.gitignore (vendored, 2 lines added)

```diff
@@ -84,7 +84,9 @@ yarn.lock
 .env.production.local
 *.env
 !.env.example
+!.env.local
 !backend/.env.example
+!backend/.env.local
 
 # ===================================
 # Database
```
@@ -1,487 +0,0 @@ (deleted file)

# Design Document - AI Article Summarization

## Overview

This design integrates Ollama AI into the news crawler workflow to automatically generate concise summaries of articles. The system will extract full article content, send it to Ollama for summarization, and store both the original content and the AI-generated summary in MongoDB.

## Architecture

### High-Level Flow

```
RSS Feed → Extract Content → Summarize with Ollama → Store in MongoDB
                ↓                      ↓                     ↓
        Full Article Text    AI Summary (≤150 words)    Both Stored
```

### Component Diagram

```
┌─────────────────────────────────────────────┐
│            News Crawler Service             │
│                                             │
│  ┌────────────┐      ┌───────────────────┐  │
│  │ RSS Parser │─────→│ Content Extractor │  │
│  └────────────┘      └───────────────────┘  │
│                               │             │
│                               ↓             │
│                      ┌───────────────────┐  │
│                      │   Ollama Client   │  │
│                      │  (New Component)  │  │
│                      └───────────────────┘  │
│                               │             │
│                               ↓             │
│                      ┌───────────────────┐  │
│                      │  Database Writer  │  │
│                      └───────────────────┘  │
└─────────────────────────────────────────────┘
                                │
                                ↓
                      ┌───────────────────┐
                      │   Ollama Server   │
                      │    (External)     │
                      └───────────────────┘
                                │
                                ↓
                      ┌───────────────────┐
                      │      MongoDB      │
                      └───────────────────┘
```
## Components and Interfaces

### 1. Ollama Client Module

**File:** `news_crawler/ollama_client.py`

**Purpose:** Handle communication with the Ollama server for summarization

**Interface:**

```python
class OllamaClient:
    def __init__(self, base_url, model, api_key=None, enabled=True):
        """Initialize Ollama client with configuration"""

    def summarize_article(self, content: str, max_words: int = 150) -> dict:
        """
        Summarize article content using Ollama

        Args:
            content: Full article text
            max_words: Maximum words in summary (default 150)

        Returns:
            {
                'summary': str,         # AI-generated summary
                'word_count': int,      # Summary word count
                'success': bool,        # Whether summarization succeeded
                'error': str or None,   # Error message if failed
                'duration': float       # Time taken in seconds
            }
        """

    def is_available(self) -> bool:
        """Check if Ollama server is reachable"""

    def test_connection(self) -> dict:
        """Test connection and return server info"""
```
**Key Methods:**

1. **summarize_article()**
   - Constructs prompt for Ollama
   - Sends HTTP POST request
   - Handles timeouts and errors
   - Validates response
   - Returns structured result (a sketch follows this list)

2. **is_available()**
   - Quick health check
   - Returns True/False
   - Used before attempting summarization

3. **test_connection()**
   - Detailed connection test
   - Returns server info and model list
   - Used for diagnostics
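A minimal sketch of what `summarize_article()` could look like, shown as a standalone function. It assumes the standard Ollama `/api/generate` endpoint and the request/response shapes listed under Data Models; the prompt wording and error handling here are illustrative, not the final implementation.

```python
import time

import requests


def summarize_article(base_url, model, content, max_words=150, timeout=30):
    """Sketch: summarize `content` via Ollama and return the structured result."""
    prompt = (
        f"Summarize the following article in {max_words} words or less. "
        f"Focus on the key points and main message:\n\n{content}"
    )
    payload = {"model": model, "prompt": prompt, "stream": False}
    start = time.time()
    try:
        resp = requests.post(f"{base_url}/api/generate", json=payload, timeout=timeout)
        resp.raise_for_status()
        summary = resp.json().get("response", "").strip()
        if not summary:
            raise ValueError("empty summary returned")
        return {
            "summary": summary,
            "word_count": len(summary.split()),
            "success": True,
            "error": None,
            "duration": time.time() - start,
        }
    except (requests.RequestException, ValueError) as exc:
        # Any failure is reported back; the caller stores the original content.
        return {
            "summary": None,
            "word_count": 0,
            "success": False,
            "error": str(exc),
            "duration": time.time() - start,
        }
```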
### 2. Enhanced Crawler Service

**File:** `news_crawler/crawler_service.py`

**Changes:**

```python
# Add Ollama client initialization
from ollama_client import OllamaClient

# Initialize at module level
ollama_client = OllamaClient(
    base_url=os.getenv('OLLAMA_BASE_URL'),
    model=os.getenv('OLLAMA_MODEL'),
    api_key=os.getenv('OLLAMA_API_KEY'),
    enabled=os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
)

# Modify crawl_rss_feed() to include summarization
def crawl_rss_feed(feed_url, feed_name, max_articles=10):
    # ... existing code ...

    # After extracting content
    article_data = extract_article_content(article_url)

    # NEW: Summarize with Ollama
    summary_result = None
    if ollama_client.enabled and article_data.get('content'):
        print("  🤖 Summarizing with AI...")
        summary_result = ollama_client.summarize_article(
            article_data['content'],
            max_words=150
        )

        if summary_result['success']:
            print(f"  ✓ Summary generated ({summary_result['word_count']} words)")
        else:
            print(f"  ⚠ Summarization failed: {summary_result['error']}")

    # Build article document with summary
    article_doc = {
        'title': article_data.get('title'),
        'author': article_data.get('author'),
        'link': article_url,
        'content': article_data.get('content'),
        'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
        'word_count': article_data.get('word_count'),
        'summary_word_count': summary_result['word_count'] if summary_result and summary_result['success'] else None,
        'source': feed_name,
        'published_at': extract_published_date(entry),
        'crawled_at': article_data.get('crawled_at'),
        'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
        'created_at': datetime.utcnow()
    }
```
### 3. Configuration Module

**File:** `news_crawler/config.py` (new file)

**Purpose:** Centralize configuration management

```python
import os
from dotenv import load_dotenv

load_dotenv(dotenv_path='../.env')

class Config:
    # MongoDB
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'

    # Ollama
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
    OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
    OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
    OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))

    # Crawler
    RATE_LIMIT_DELAY = 1        # seconds between requests
    MAX_CONTENT_LENGTH = 50000  # characters
```
## Data Models

### Updated Article Schema

```javascript
{
  _id: ObjectId,
  title: String,
  author: String,
  link: String,                // Unique index
  content: String,             // Full article content
  summary: String,             // AI-generated summary (≤150 words)
  word_count: Number,          // Original content word count
  summary_word_count: Number,  // Summary word count
  source: String,
  published_at: String,
  crawled_at: DateTime,
  summarized_at: DateTime,     // When AI summary was generated
  created_at: DateTime
}
```
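Since `link` carries a unique index, the collection setup and upsert might look like this in pymongo (database and collection names follow the `Config` defaults above; the sample document is illustrative):

```python
from pymongo import ASCENDING, MongoClient

client = MongoClient('mongodb://localhost:27017/')  # MONGODB_URI in practice
articles = client['munich_news']['articles']

# Enforce one document per article URL.
articles.create_index([('link', ASCENDING)], unique=True)

# Upsert keyed on the link, so re-crawls update rather than duplicate.
article_doc = {'link': 'https://example.com/article', 'title': 'Sample'}
articles.update_one({'link': article_doc['link']},
                    {'$set': article_doc}, upsert=True)
```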
### Ollama Request Format

```json
{
  "model": "phi3:latest",
  "prompt": "Summarize the following article in 150 words or less. Focus on the key points and main message:\n\n[ARTICLE CONTENT]",
  "stream": false,
  "options": {
    "temperature": 0.7,
    "num_predict": 200
  }
}
```

### Ollama Response Format

```json
{
  "model": "phi3:latest",
  "created_at": "2024-11-10T16:30:00Z",
  "response": "The AI-generated summary text here...",
  "done": true,
  "total_duration": 5000000000
}
```
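For reference, a bare-bones round trip against these formats, assuming a local Ollama instance at the default port (note that `total_duration` is reported in nanoseconds):

```python
import requests

payload = {
    "model": "phi3:latest",
    "prompt": "Summarize the following article in 150 words or less. "
              "Focus on the key points and main message:\n\nSome article text...",
    "stream": False,
    "options": {"temperature": 0.7, "num_predict": 200},
}
resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=30)
data = resp.json()
print(data["response"])                   # the generated summary
print(data["total_duration"] / 1e9, "s")  # nanoseconds → seconds
```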
## Error Handling

### Error Scenarios and Responses

| Scenario | Handling | User Impact |
|----------|----------|-------------|
| Ollama server down | Log warning, store original content | Article saved without summary |
| Ollama timeout (>30s) | Cancel request, store original | Article saved without summary |
| Empty summary returned | Log error, store original | Article saved without summary |
| Invalid response format | Log error, store original | Article saved without summary |
| Network error | Retry once, then store original | Article saved without summary |
| Model not found | Log error, disable Ollama | All articles saved without summaries |
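For the "retry once, then store original" row, a small wrapper along these lines would suffice; `summarize` is any callable returning the structured result described above (names are illustrative):

```python
def summarize_with_retry(summarize, content, retries=1):
    """Sketch: retry a failed summarization once before falling back."""
    result = summarize(content)
    for _ in range(retries):
        if result["success"]:
            break
        result = summarize(content)
    # The caller stores the article either way; the summary may still be None.
    return result
```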
### Error Logging Format

```python
{
    'timestamp': datetime.utcnow(),
    'article_url': article_url,
    'error_type': 'timeout|connection|invalid_response|empty_summary',
    'error_message': str(error),
    'ollama_config': {
        'base_url': OLLAMA_BASE_URL,
        'model': OLLAMA_MODEL,
        'enabled': OLLAMA_ENABLED
    }
}
```
## Testing Strategy

### Unit Tests

1. **test_ollama_client.py**
   - Test summarization with mock responses
   - Test timeout handling
   - Test error scenarios
   - Test connection checking

2. **test_crawler_with_ollama.py**
   - Test crawler with Ollama enabled
   - Test crawler with Ollama disabled
   - Test fallback when Ollama fails
   - Test rate limiting

### Integration Tests

1. **test_end_to_end.py**
   - Crawl real RSS feed
   - Summarize with real Ollama
   - Verify database storage
   - Check all fields populated

### Manual Testing

1. Test with Ollama enabled and working
2. Test with Ollama disabled
3. Test with Ollama unreachable
4. Test with slow Ollama responses
5. Test with various article lengths
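As a sketch of the mock-response case, one such unit test might look like this (pytest style; the import path and result shape are assumptions based on the interface above):

```python
from unittest.mock import Mock, patch


def test_summarize_article_success():
    # Fake the HTTP layer so no Ollama server is needed.
    fake = Mock()
    fake.json.return_value = {"response": "A short summary.", "done": True}
    fake.raise_for_status.return_value = None

    with patch("requests.post", return_value=fake):
        from ollama_client import OllamaClient  # assumed import path
        client = OllamaClient("http://localhost:11434", "phi3:latest")
        result = client.summarize_article("Some long article text...")

    assert result["success"] is True
    assert result["word_count"] > 0
```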
## Performance Considerations

### Timing Estimates

- Article extraction: 2-5 seconds
- Ollama summarization: 5-15 seconds (depends on article length and model)
- Database write: <1 second
- **Total per article: 8-21 seconds**

### Optimization Strategies

1. **Sequential Processing**
   - Process one article at a time
   - Prevents overwhelming Ollama
   - Easier to debug

2. **Timeout Management**
   - 30-second timeout per request
   - Prevents hanging on slow responses

3. **Rate Limiting**
   - 1-second delay between articles
   - Respects server resources

4. **Future: Batch Processing**
   - Queue articles for summarization
   - Process in batches
   - Use Celery for async processing

### Resource Usage

- **Memory**: ~100MB per crawler instance
- **Network**: ~1-5KB per article (to Ollama)
- **Storage**: +150 words per article (~1KB)
- **CPU**: Minimal (Ollama does the heavy lifting)
## Security Considerations

1. **API Key Storage**
   - Store in environment variables
   - Never commit to git
   - Use secrets management in production

2. **Content Sanitization**
   - Don't log full article content
   - Sanitize URLs in logs
   - Limit error message detail

3. **Network Security**
   - Support HTTPS for Ollama
   - Validate SSL certificates
   - Use secure connections

4. **Rate Limiting**
   - Prevent abuse of Ollama server
   - Implement backoff on errors
   - Monitor usage patterns
## Deployment Considerations

### Environment Variables

```bash
# Required
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_MODEL=phi3:latest
OLLAMA_ENABLED=true

# Optional
OLLAMA_API_KEY=your-api-key
OLLAMA_TIMEOUT=30
```
### Docker Deployment

```yaml
# docker-compose.yml
services:
  crawler:
    build: ./news_crawler
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - OLLAMA_ENABLED=true
    depends_on:
      - ollama
      - mongodb

  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
```
### Monitoring

1. **Metrics to Track**
   - Summarization success rate
   - Average summarization time
   - Ollama server uptime
   - Error frequency by type

2. **Logging**
   - Log all summarization attempts
   - Log errors with context
   - Log performance metrics

3. **Alerts**
   - Alert if Ollama is down >5 minutes
   - Alert if success rate <80%
   - Alert if average time >20 seconds
## Migration Plan

### Phase 1: Add Ollama Client (Week 1)
- Create ollama_client.py
- Add configuration
- Write unit tests
- Test with sample articles

### Phase 2: Integrate with Crawler (Week 1)
- Modify crawler_service.py
- Add summarization step
- Update database schema
- Test end-to-end

### Phase 3: Update Backend API (Week 2)
- Update news routes
- Add summary fields to responses
- Update frontend to display summaries
- Deploy to production

### Phase 4: Monitor and Optimize (Ongoing)
- Monitor performance
- Tune prompts for better summaries
- Optimize rate limiting
- Add batch processing if needed
## Rollback Plan

If issues arise:

1. **Immediate**: Set `OLLAMA_ENABLED=false`
2. **Short-term**: Revert crawler code changes
3. **Long-term**: Remove Ollama integration

The system will continue to work with original content if Ollama is disabled.

## Success Metrics

- ✅ 95%+ of articles successfully summarized
- ✅ Average summarization time <15 seconds
- ✅ Zero data loss (all articles stored even if summarization fails)
- ✅ Ollama uptime >99%
- ✅ Summary quality: readable and accurate (manual review)
## Future Enhancements

1. **Multi-language Support**
   - Detect article language
   - Use appropriate model
   - Translate summaries

2. **Custom Summary Lengths**
   - Allow configuration per feed
   - Support different lengths for different use cases

3. **Sentiment Analysis**
   - Add sentiment score
   - Categorize as positive/negative/neutral

4. **Keyword Extraction**
   - Extract key topics
   - Enable better search

5. **Batch Processing**
   - Queue articles
   - Process in parallel
   - Use Celery for async processing

6. **Caching**
   - Cache summaries
   - Avoid re-processing
   - Use Redis for cache
@@ -1,164 +0,0 @@ (deleted file)

# Requirements Document

## Introduction

This feature integrates Ollama AI into the news crawler to automatically summarize articles before storing them in the database. Instead of storing full article content, the system will generate concise 150-word summaries using AI, making the content more digestible for newsletter readers and reducing storage requirements.

## Glossary

- **Crawler Service**: The standalone microservice that fetches and processes article content from RSS feeds
- **Ollama Server**: The AI inference server that provides text summarization capabilities
- **Article Content**: The full text extracted from a news article webpage
- **Summary**: A concise AI-generated version of the article content (max 150 words)
- **MongoDB**: The database where articles and summaries are stored

## Requirements

### Requirement 1: Ollama Integration in Crawler

**User Story:** As a system administrator, I want the crawler to use Ollama for summarization, so that articles are automatically condensed before storage.

#### Acceptance Criteria

1. WHEN the crawler extracts article content, THE Crawler Service SHALL send the content to the Ollama Server for summarization
2. WHEN sending content to Ollama, THE Crawler Service SHALL include a prompt requesting a summary of 150 words or less
3. WHEN Ollama returns a summary, THE Crawler Service SHALL validate that the summary is not empty
4. IF the Ollama Server is unavailable, THEN THE Crawler Service SHALL store the original content without summarization and log a warning
5. WHEN summarization fails, THE Crawler Service SHALL continue processing other articles without stopping

### Requirement 2: Configuration Management

**User Story:** As a system administrator, I want to configure Ollama settings, so that I can control the summarization behavior.

#### Acceptance Criteria

1. THE Crawler Service SHALL read Ollama configuration from environment variables
2. THE Crawler Service SHALL support the following configuration options:
   - OLLAMA_BASE_URL (server URL)
   - OLLAMA_MODEL (model name)
   - OLLAMA_ENABLED (enable/disable flag)
   - OLLAMA_API_KEY (optional authentication)
3. WHERE OLLAMA_ENABLED is false, THE Crawler Service SHALL store original content without summarization
4. WHERE OLLAMA_ENABLED is true AND Ollama is unreachable, THE Crawler Service SHALL log an error and store original content

### Requirement 3: Summary Storage

**User Story:** As a developer, I want summaries stored in the database, so that the frontend can display concise article previews.

#### Acceptance Criteria

1. WHEN a summary is generated, THE Crawler Service SHALL store it in the `summary` field in MongoDB
2. WHEN storing an article, THE Crawler Service SHALL include both the original content and the AI summary
3. THE Crawler Service SHALL store the following fields:
   - `content` (original full text)
   - `summary` (AI-generated, max 150 words)
   - `word_count` (original content word count)
   - `summary_word_count` (summary word count)
   - `summarized_at` (timestamp when summarized)
4. WHEN an article already has a summary, THE Crawler Service SHALL not re-summarize it
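A sketch of the criterion-4 skip check, assuming the pymongo `articles` collection from the design document:

```python
def needs_summary(articles, article_url):
    """Sketch: only summarize articles that don't already carry a summary."""
    existing = articles.find_one({'link': article_url}, {'summary': 1})
    return existing is None or not existing.get('summary')
```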
### Requirement 4: Error Handling and Resilience

**User Story:** As a system administrator, I want the crawler to handle AI failures gracefully, so that the system remains reliable.

#### Acceptance Criteria

1. IF Ollama returns an error, THEN THE Crawler Service SHALL log the error and store the original content
2. IF Ollama times out (>30 seconds), THEN THE Crawler Service SHALL cancel the request and store the original content
3. IF the summary is empty or invalid, THEN THE Crawler Service SHALL store the original content
4. WHEN an error occurs, THE Crawler Service SHALL include an error indicator in the database record
5. THE Crawler Service SHALL continue processing remaining articles after any summarization failure

### Requirement 5: Performance and Rate Limiting

**User Story:** As a system administrator, I want the crawler to respect rate limits, so that it doesn't overwhelm the Ollama server.

#### Acceptance Criteria

1. THE Crawler Service SHALL wait at least 1 second between Ollama API calls
2. THE Crawler Service SHALL set a timeout of 30 seconds for each Ollama request
3. WHEN processing multiple articles, THE Crawler Service SHALL process them sequentially to avoid overloading Ollama
4. THE Crawler Service SHALL log the time taken for each summarization
5. THE Crawler Service SHALL display progress indicators showing summarization status

### Requirement 6: Monitoring and Logging

**User Story:** As a system administrator, I want detailed logs of summarization activity, so that I can monitor and troubleshoot the system.

#### Acceptance Criteria

1. THE Crawler Service SHALL log when summarization starts for each article
2. THE Crawler Service SHALL log the original word count and summary word count
3. THE Crawler Service SHALL log any errors or warnings from Ollama
4. THE Crawler Service SHALL display a summary of total articles summarized at the end
5. THE Crawler Service SHALL include summarization statistics in the final report

### Requirement 7: API Endpoint Updates

**User Story:** As a frontend developer, I want API endpoints to return summaries, so that I can display them to users.

#### Acceptance Criteria

1. WHEN fetching articles via GET /api/news, THE Backend API SHALL include the `summary` field if available
2. WHEN fetching a single article via GET /api/news/<url>, THE Backend API SHALL include both `content` and `summary`
3. THE Backend API SHALL include a `has_summary` boolean field indicating if AI summarization was performed
4. THE Backend API SHALL include `summarized_at` timestamp if available
5. WHERE no summary exists, THE Backend API SHALL return a preview of the original content (first 200 chars)
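Criteria 3-5 could be satisfied with a small serializer along these lines (the function name and any fields beyond the criteria are illustrative):

```python
def serialize_article(doc):
    """Sketch: shape an article document for API responses."""
    summary = doc.get('summary')
    return {
        'title': doc.get('title'),
        'link': doc.get('link'),
        # Fall back to a 200-character content preview when no summary exists.
        'summary': summary or (doc.get('content') or '')[:200],
        'has_summary': bool(summary),
        'summarized_at': doc.get('summarized_at'),
    }
```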
### Requirement 8: Backward Compatibility

**User Story:** As a developer, I want the system to work with existing articles, so that no data migration is required.

#### Acceptance Criteria

1. THE Crawler Service SHALL work with articles that don't have summaries
2. THE Backend API SHALL handle articles with or without summaries gracefully
3. WHERE an article has no summary, THE Backend API SHALL generate a preview from the content field
4. THE Crawler Service SHALL not re-process articles that already have summaries
5. THE system SHALL continue to function if Ollama is disabled or unavailable

## Non-Functional Requirements

### Performance
- Summarization SHALL complete within 30 seconds per article
- The crawler SHALL process at least 10 articles per minute (including summarization)
- Database operations SHALL not be significantly slower with summary storage

### Reliability
- The system SHALL maintain 99% uptime even if Ollama is unavailable
- Failed summarizations SHALL not prevent article storage
- The crawler SHALL recover from Ollama errors without manual intervention

### Security
- Ollama API keys SHALL be stored in environment variables, not in code
- Article content SHALL not be logged, to prevent sensitive data exposure
- API communication with Ollama SHALL support HTTPS

### Scalability
- The system SHALL support multiple Ollama servers for load balancing (future)
- The crawler SHALL handle articles of any length (up to 50,000 words)
- The database schema SHALL support future enhancements (tags, categories, etc.)

## Dependencies

- Ollama server must be running and accessible
- `requests` Python library for HTTP communication
- Environment variables properly configured
- MongoDB with sufficient storage for both content and summaries

## Assumptions

- Ollama server is already set up and configured
- The phi3:latest model (or configured model) supports summarization tasks
- Network connectivity between crawler and Ollama server is reliable
- Articles are in English, or the configured Ollama model supports the article language
@@ -1,92 +0,0 @@ (deleted file)

# Implementation Plan

- [x] 1. Create Ollama client module
  - Create `news_crawler/ollama_client.py` with OllamaClient class
  - Implement `summarize_article()` method with prompt construction and API call
  - Implement `is_available()` method for health checks
  - Implement `test_connection()` method for diagnostics
  - Add timeout handling (30 seconds)
  - Add error handling for connection, timeout, and invalid responses
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 4.1, 4.2, 4.3, 5.2_

- [x] 2. Create configuration module for crawler
  - Create `news_crawler/config.py` with Config class
  - Load environment variables (OLLAMA_BASE_URL, OLLAMA_MODEL, OLLAMA_ENABLED, OLLAMA_API_KEY, OLLAMA_TIMEOUT)
  - Add validation for required configuration
  - Add default values for optional configuration
  - _Requirements: 2.1, 2.2, 2.3, 2.4_

- [x] 3. Integrate Ollama client into crawler service
  - Import OllamaClient in `news_crawler/crawler_service.py`
  - Initialize Ollama client at module level using Config
  - Modify `crawl_rss_feed()` to call summarization after content extraction
  - Add conditional logic to skip summarization if OLLAMA_ENABLED is false
  - Add error handling to continue processing if summarization fails
  - Add logging for summarization start, success, and failure
  - Add rate limiting delay after summarization
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 2.3, 2.4, 4.1, 4.5, 5.1, 5.3, 6.1, 6.2, 6.3_

- [x] 4. Update database schema and storage
  - Modify article document structure in `crawl_rss_feed()` to include:
    - `summary` field (AI-generated summary)
    - `summary_word_count` field
    - `summarized_at` field (timestamp)
  - Update MongoDB upsert logic to handle new fields
  - Add check to skip re-summarization if article already has summary
  - _Requirements: 3.1, 3.2, 3.3, 3.4, 8.4_

- [x] 5. Update backend API to return summaries
  - Modify `backend/routes/news_routes.py` GET /api/news endpoint
  - Add `summary`, `summary_word_count`, `summarized_at` fields to response
  - Add `has_summary` boolean field to indicate if AI summarization was performed
  - Modify GET /api/news/<url> endpoint to include summary fields
  - Add fallback to content preview if no summary exists
  - _Requirements: 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3_

- [x] 6. Update database schema documentation
  - Update `backend/DATABASE_SCHEMA.md` with new summary fields
  - Add example document showing summary fields
  - Document the summarization workflow
  - _Requirements: 3.1, 3.2, 3.3_

- [x] 7. Add environment variable configuration
  - Update `backend/env.template` with Ollama configuration
  - Add comments explaining each Ollama setting
  - Document default values
  - _Requirements: 2.1, 2.2_

- [x] 8. Create test script for Ollama integration
  - Create `news_crawler/test_ollama.py` to test Ollama connection
  - Test summarization with sample article
  - Test error handling (timeout, connection failure)
  - Display configuration and connection status
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 4.1, 4.2_

- [x] 9. Update crawler statistics and logging
  - Add summarization statistics to final report in `crawl_all_feeds()`
  - Track total articles summarized vs. failed
  - Log average summarization time
  - Display progress indicators during summarization
  - _Requirements: 5.4, 6.1, 6.2, 6.3, 6.4, 6.5_

- [x] 10. Create documentation for AI summarization
  - Create `news_crawler/AI_SUMMARIZATION.md` explaining the feature
  - Document configuration options
  - Provide troubleshooting guide
  - Add examples of usage
  - _Requirements: 2.1, 2.2, 2.3, 2.4, 6.1, 6.2, 6.3_

- [x] 11. Update main README with AI summarization info
  - Add section about AI summarization feature
  - Document Ollama setup requirements
  - Add configuration examples
  - Update API endpoint documentation
  - _Requirements: 2.1, 2.2, 7.1, 7.2_

- [x] 12. Test end-to-end workflow
  - Run crawler with Ollama enabled
  - Verify articles are summarized correctly
  - Check database contains all expected fields
  - Test API endpoints return summaries
  - Verify error handling when Ollama is disabled/unavailable
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5, 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3, 8.4, 8.5_
@@ -1,328 +0,0 @@ (deleted file)

# Design Document: Article Title Translation

## Overview

This feature extends the existing Ollama AI integration to translate German article titles to English during the crawling process. The translation will be performed immediately after article content extraction and before AI summarization. Both the original German title and the English translation will be stored in the MongoDB article document, and the newsletter template will be updated to display the English title prominently with the original as a subtitle.

The design leverages the existing Ollama infrastructure (same server, configuration, and error handling patterns) to minimize complexity and maintain consistency with the current summarization feature.

## Architecture

### Component Interaction Flow

```
RSS Feed Entry
      ↓
Crawler Service (extract_article_content)
      ↓
Article Data (with German title)
      ↓
Ollama Client (translate_title)   ← New Method
      ↓
Translation Result
      ↓
Crawler Service (prepare article_doc)
      ↓
MongoDB (articles collection with title + title_en)
      ↓
Newsletter Service (fetch articles)
      ↓
Newsletter Template (display English title + German subtitle)
      ↓
Email to Subscribers
```

### Integration Points

1. **Ollama Client** - Add new `translate_title()` method alongside existing `summarize_article()` method
2. **Crawler Service** - Call translation after content extraction, before summarization
3. **Article Document Schema** - Add `title_en` and `translated_at` fields
4. **Newsletter Template** - Update title display logic to show English/German titles
## Components and Interfaces

### 1. Ollama Client Extension

**New Method: `translate_title(title, target_language='English')`**

```python
def translate_title(self, title, target_language='English'):
    """
    Translate article title to target language

    Args:
        title (str): Original German title
        target_language (str): Target language (default: 'English')

    Returns:
        dict: {
            'success': bool,
            'translated_title': str or None,
            'error': str or None,
            'duration': float
        }
    """
```
**Implementation Details:**

- **Prompt Engineering**: Clear, concise prompt instructing the model to translate only the headline, without explanations
- **Temperature**: 0.3 (lower than summarization's 0.7) for more consistent, deterministic translations
- **Token Limit**: 100 tokens (sufficient for title-length outputs)
- **Response Cleaning**:
  - Remove surrounding quotes (single and double)
  - Extract first line only (ignore any extra text)
  - Trim whitespace
- **Error Handling**: Same pattern as `summarize_article()` - catch timeouts, connection errors, HTTP errors
- **Validation**: Check for empty title input before making API call (a sketch of the full method follows this list)
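Putting these details together, a sketch of the method body, shown as a standalone function; it assumes the same Ollama `/api/generate` endpoint used for summarization, and the prompt wording is illustrative:

```python
import time

import requests


def translate_title(base_url, model, title, target_language="English", timeout=30):
    """Sketch: translate a German headline and clean the model output."""
    if not title or not title.strip():
        return {"success": False, "translated_title": None,
                "error": "empty title", "duration": 0.0}

    prompt = (f"Translate this German news headline to {target_language}. "
              f"Reply with the translation only, no explanations:\n\n{title}")
    payload = {"model": model, "prompt": prompt, "stream": False,
               "options": {"temperature": 0.3, "num_predict": 100}}
    start = time.time()
    try:
        resp = requests.post(f"{base_url}/api/generate", json=payload, timeout=timeout)
        resp.raise_for_status()
        raw = resp.json().get("response", "").strip()
        # Cleaning: first line only, trimmed, with surrounding quotes removed.
        cleaned = raw.splitlines()[0].strip().strip('"\'') if raw else ""
        if not cleaned:
            raise ValueError("empty translation returned")
        return {"success": True, "translated_title": cleaned,
                "error": None, "duration": time.time() - start}
    except (requests.RequestException, ValueError) as exc:
        return {"success": False, "translated_title": None,
                "error": str(exc), "duration": time.time() - start}
```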
### 2. Crawler Service Integration

**Location**: In `crawl_rss_feed()` function, after content extraction

**Execution Order**:
1. Extract article content (existing)
2. **Translate title** (new)
3. Summarize article (existing)
4. Save to database (modified)

**Implementation Pattern**:

```python
# After article_data extraction
translation_result = None
original_title = article_data.get('title') or entry.get('title', '')

if Config.OLLAMA_ENABLED:
    # Translate title
    print("  🌐 Translating title...")
    translation_result = ollama_client.translate_title(original_title)

    if translation_result and translation_result['success']:
        print(f"  ✓ Title translated ({translation_result['duration']:.1f}s)")
    else:
        print(f"  ⚠ Translation failed: {translation_result['error']}")

# Then summarize (existing code)
...
```

**Console Output Format**:
- Success: `✓ Title translated (0.8s)`
- Failure: `⚠ Translation failed: Request timed out`
### 3. Data Models

**MongoDB Article Document Schema Extension**:

```javascript
{
  // Existing fields
  title: String,          // Original German title
  author: String,
  link: String,
  content: String,
  summary: String,
  word_count: Number,
  summary_word_count: Number,
  source: String,
  category: String,
  published_at: Date,
  crawled_at: Date,
  summarized_at: Date,
  created_at: Date,

  // New fields
  title_en: String,       // English translation of title (nullable)
  translated_at: Date     // Timestamp when translation completed (nullable)
}
```

**Field Behavior**:
- `title_en`: NULL if translation fails or Ollama is disabled
- `translated_at`: NULL if translation fails, set to `datetime.utcnow()` on success
### 4. Newsletter Template Updates

**Current Title Display**:
```html
<h2 style="...">
  {{ article.title }}
</h2>
```

**New Title Display Logic**:
```html
<!-- Primary title: English if available, otherwise German -->
<h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
  {{ article.title_en if article.title_en else article.title }}
</h2>

<!-- Subtitle: Original German title (only if English translation exists and differs) -->
{% if article.title_en and article.title_en != article.title %}
<p style="margin: 0 0 12px 0; font-size: 13px; color: #999999; font-style: italic;">
  Original: {{ article.title }}
</p>
{% endif %}
```

**Display Rules**:
1. If `title_en` exists and differs from `title`: show English as primary, German as subtitle
2. If `title_en` is NULL or the same as `title`: show only the original title
3. Subtitle styling: smaller font (13px), gray color (#999999), italic
## Error Handling

### Translation Failure Scenarios

| Scenario | Behavior | User Impact |
|----------|----------|-------------|
| Ollama server unavailable | Skip translation, continue with summarization | Newsletter shows German title only |
| Translation timeout | Log error, store NULL in title_en | Newsletter shows German title only |
| Empty title input | Return error immediately, skip API call | Newsletter shows German title only |
| Ollama disabled in config | Skip translation entirely | Newsletter shows German title only |
| Network error | Catch exception, log error, continue | Newsletter shows German title only |

### Error Handling Principles

1. **Non-blocking**: Translation failures never prevent article processing
2. **Graceful degradation**: Fall back to original German title
3. **Consistent logging**: All errors logged with descriptive messages
4. **No retry logic**: Single attempt per article (same as summarization)
5. **Silent failures**: Newsletter displays seamlessly regardless of translation status

### Console Output Examples

**Success Case**:
```
🔍 Crawling: Neuer U-Bahn-Ausbau in München geplant...
  🌐 Translating title...
  ✓ Title translated (0.8s)
  🤖 Summarizing with AI...
  ✓ Summary: 45 words (from 320 words, 2.3s)
  ✓ Saved (320 words)
```

**Translation Failure Case**:
```
🔍 Crawling: Neuer U-Bahn-Ausbau in München geplant...
  🌐 Translating title...
  ⚠ Translation failed: Request timed out after 30 seconds
  🤖 Summarizing with AI...
  ✓ Summary: 45 words (from 320 words, 2.3s)
  ✓ Saved (320 words)
```
## Testing Strategy

### Unit Testing

**Ollama Client Tests** (`test_ollama_client.py`):
1. Test successful translation with valid German title
2. Test empty title input handling
3. Test timeout handling
4. Test connection error handling
5. Test response cleaning (quotes, newlines, whitespace)
6. Test translation with special characters
7. Test translation with very long titles

**Test Data Examples**:
- Simple: "München plant neue U-Bahn-Linie"
- With quotes: "\"Historischer Tag\" für München"
- With special chars: "Oktoberfest 2024: 7,5 Millionen Besucher"
- Long: "Stadtrat beschließt umfassende Maßnahmen zur Verbesserung der Verkehrsinfrastruktur..."

### Integration Testing

**Crawler Service Tests**:
1. Test article processing with translation enabled
2. Test article processing with translation disabled
3. Test article processing when translation fails
4. Test database document structure includes new fields
5. Test console output formatting

### Manual Testing

**End-to-End Workflow**:
1. Enable Ollama in configuration
2. Trigger crawl with `max_articles=2`
3. Verify console shows translation status
4. Check MongoDB for `title_en` and `translated_at` fields
5. Send test newsletter
6. Verify email displays English title with German subtitle

**Test Scenarios**:
- Fresh crawl with Ollama enabled
- Re-crawl existing articles (should skip translation)
- Crawl with Ollama disabled
- Crawl with Ollama server stopped (simulate failure)

### Performance Testing

**Metrics to Monitor**:
- Translation duration per article (target: < 2 seconds)
- Impact on total crawl time (translation + summarization)
- Ollama server resource usage

**Expected Performance**:
- Translation: ~0.5-1.5 seconds per title
- Total per article: ~3-5 seconds (translation + summarization)
- Acceptable for batch processing during scheduled crawls
## Configuration

### No New Configuration Required

The translation feature uses the existing Ollama configuration:

```python
# From config.py (existing)
OLLAMA_ENABLED = True   # or False
OLLAMA_BASE_URL = "http://ollama:11434"
OLLAMA_MODEL = "phi3:latest"
OLLAMA_TIMEOUT = 30
```

**Rationale**: Simplifies deployment and maintains consistency. Translation is automatically enabled/disabled with the existing `OLLAMA_ENABLED` flag.
## Deployment Considerations

### Docker Container Updates

**Affected Services**:
- `crawler` service: Needs rebuild to include new translation code
- `sender` service: Needs rebuild to include updated newsletter template

**Deployment Steps**:
1. Update code in `news_crawler/ollama_client.py`
2. Update code in `news_crawler/crawler_service.py`
3. Update template in `news_sender/newsletter_template.html`
4. Rebuild containers: `docker-compose up -d --build crawler sender`
5. No database migration needed (new fields are nullable)

### Backward Compatibility

**Existing Articles**: Articles without `title_en` will display the German title only (graceful fallback)

**No Breaking Changes**: Newsletter template handles NULL `title_en` values

### Rollback Plan

If issues arise:
1. Revert code changes
2. Rebuild containers
3. Existing articles with `title_en` will continue to work
4. New articles will only have German titles

## Future Enhancements

### Potential Improvements (Out of Scope)

1. **Batch Translation**: Translate multiple titles in a single API call for efficiency
2. **Translation Caching**: Cache common phrases/words to reduce API calls
3. **Multi-language Support**: Add configuration for target language selection
4. **Translation Quality Metrics**: Track and log translation quality scores
5. **Retry Logic**: Implement retry with exponential backoff for failed translations
6. **Admin API**: Add endpoint to re-translate existing articles

These enhancements are not included in the current implementation, to maintain simplicity and focus on core functionality.
@@ -1,75 +0,0 @@ (deleted file)

# Requirements Document

## Introduction

This feature adds automatic translation of German article titles to English using the Ollama AI service. The translation will occur during the article crawling process, and both the original German title and the English translation will be stored in the database. The newsletter will display the English title prominently, with the original German title as a subtitle when available.

## Glossary

- **Crawler Service**: The Python service that fetches articles from RSS feeds and processes them
- **Ollama Client**: The Python client that communicates with the Ollama AI server for text processing
- **Article Document**: The MongoDB document structure that stores article data
- **Newsletter Template**: The HTML template used to render the email newsletter sent to subscribers
- **Translation Result**: The response object returned by the Ollama translation function, containing the translated title and metadata

## Requirements

### Requirement 1

**User Story:** As a newsletter subscriber, I want to see article titles in English, so that I can quickly understand the content without knowing German

#### Acceptance Criteria

1. WHEN the Crawler Service processes an article, THE Ollama Client SHALL translate the German title to English
2. THE Article Document SHALL store both the original German title and the English translation
3. THE Newsletter Template SHALL display the English title as the primary heading
4. WHERE an English translation exists, THE Newsletter Template SHALL display the original German title as a subtitle
5. IF the translation fails, THEN THE Newsletter Template SHALL display the original German title as the primary heading

### Requirement 2

**User Story:** As a system administrator, I want translation to be integrated with the existing Ollama service, so that I don't need to configure additional services

#### Acceptance Criteria

1. THE Ollama Client SHALL provide a translate_title method that accepts a German title and returns an English translation
2. THE translate_title method SHALL use the same Ollama server configuration as the existing summarization feature
3. THE translate_title method SHALL use a temperature setting of 0.3 for consistent translations
4. THE translate_title method SHALL limit the response to 100 tokens maximum for title-length outputs
5. THE translate_title method SHALL return a Translation Result containing success status, translated title, error message, and duration

### Requirement 3

**User Story:** As a developer, I want translation errors to be handled gracefully, so that article processing continues even when translation fails

#### Acceptance Criteria

1. IF the Ollama server is unavailable, THEN THE Crawler Service SHALL continue processing articles without translations
2. IF a translation request times out, THEN THE Crawler Service SHALL log the error and store the article with only the original title
3. THE Crawler Service SHALL display translation status in the console output during crawling
4. THE Article Document SHALL include a translated_at timestamp field when translation succeeds
5. THE Article Document SHALL store NULL in the title_en field when translation fails

### Requirement 4

**User Story:** As a newsletter subscriber, I want translations to be accurate and natural, so that the English titles read fluently

#### Acceptance Criteria

1. THE Ollama Client SHALL provide a clear prompt instructing the model to translate German news headlines to English
2. THE Ollama Client SHALL instruct the model to provide only the translation, without explanations
3. THE Ollama Client SHALL clean the translation output by removing quotes and extra text
4. THE Ollama Client SHALL extract only the first line of the translation response
5. THE Ollama Client SHALL trim whitespace from the translated title

### Requirement 5

**User Story:** As a system operator, I want to see translation performance metrics, so that I can monitor the translation feature's effectiveness

#### Acceptance Criteria

1. THE Crawler Service SHALL log the translation duration for each article
2. THE Crawler Service SHALL display a success indicator when translation completes
3. THE Crawler Service SHALL display an error message when translation fails
4. THE Translation Result SHALL include the duration in seconds
5. THE Article Document SHALL store the translated_at timestamp for successful translations
@@ -1,47 +0,0 @@ (deleted file)

# Implementation Plan

- [x] 1. Add translate_title method to Ollama client
  - Create the `translate_title()` method in `news_crawler/ollama_client.py` that accepts a title string and target language parameter
  - Implement the translation prompt that instructs the model to translate German headlines to English without explanations
  - Configure Ollama API call with temperature=0.3 and num_predict=100 for consistent title-length translations
  - Implement response cleaning logic to remove quotes, extract first line only, and trim whitespace
  - Add error handling for timeout, connection errors, HTTP errors, and empty title input
  - Return a dictionary with success status, translated_title, error message, and duration fields
  - _Requirements: 2.1, 2.2, 2.3, 2.4, 2.5, 4.1, 4.2, 4.3, 4.4, 4.5_

- [x] 2. Integrate translation into crawler service
  - [x] 2.1 Add translation call in crawl_rss_feed function
    - Locate the article processing section in `news_crawler/crawler_service.py` after content extraction
    - Store the original title from article_data or entry
    - Add conditional check for Config.OLLAMA_ENABLED before calling translation
    - Call `ollama_client.translate_title()` with the original title
    - Store the translation_result for later use in article document
    - _Requirements: 1.1, 2.1_

  - [x] 2.2 Add console logging for translation status
    - Add "🌐 Translating title..." message before translation call
    - Add success message with duration: "✓ Title translated (X.Xs)"
    - Add failure message with error: "⚠ Translation failed: {error}"
    - _Requirements: 5.1, 5.2, 5.3_

  - [x] 2.3 Update article document structure
    - Modify the article_doc dictionary to include `title_en` field with translated title or None
    - Add `translated_at` field set to datetime.utcnow() on success or None on failure
    - Ensure the original `title` field still contains the German title
    - _Requirements: 1.2, 3.5_

- [x] 3. Update newsletter template for bilingual title display
  - Modify `news_sender/newsletter_template.html` to display English title as primary heading when available
  - Add conditional logic to show original German title as subtitle only when English translation exists and differs
  - Style the subtitle with smaller font (13px), gray color (#999999), and italic formatting
  - Ensure fallback to German title when title_en is NULL or missing
  - _Requirements: 1.3, 1.4, 1.5_

- [x] 4. Test the translation feature end-to-end
  - Rebuild the crawler Docker container with the new translation code
  - Clear existing articles from the database for clean testing
  - Trigger a test crawl with max_articles=2 to process fresh articles
  - Verify console output shows translation status messages
  - Check MongoDB to confirm title_en and translated_at fields are populated
  - Send a test newsletter email to verify English titles display correctly with German subtitles
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 5.1, 5.2, 5.4, 5.5_
@@ -1,407 +0,0 @@ (deleted file)

# Email Tracking System Design

## Overview

The email tracking system enables Munich News Daily to measure subscriber engagement through email opens and link clicks. The system uses industry-standard techniques (tracking pixels and redirect URLs) while maintaining privacy compliance and performance.

## Architecture

### High-Level Components

```
┌───────────────────────────────────────────────┐
│               Newsletter System               │
│                                               │
│   ┌──────────────┐      ┌──────────────┐      │
│   │    Sender    │─────▶│   Tracking   │      │
│   │   Service    │      │  Generator   │      │
│   └──────────────┘      └──────────────┘      │
│          │                     │              │
│          │                     ▼              │
│          │              ┌──────────────┐      │
│          │              │   MongoDB    │      │
│          │              │  (tracking)  │      │
│          │              └──────────────┘      │
│          ▼                                    │
│   ┌──────────────┐                            │
│   │    Email     │                            │
│   │    Client    │                            │
│   └──────────────┘                            │
└───────────────────────────────────────────────┘
           │                    ▲
           │                    │
           ▼                    │
┌───────────────────────────────────────────────┐
│              Backend API Server               │
│                                               │
│   ┌──────────────┐      ┌──────────────┐      │
│   │    Pixel     │      │     Link     │      │
│   │   Endpoint   │      │   Redirect   │      │
│   └──────────────┘      └──────────────┘      │
│          │                     │              │
│          └──────────┬──────────┘              │
│                     ▼                         │
│              ┌──────────────┐                 │
│              │   MongoDB    │                 │
│              │  (tracking)  │                 │
│              └──────────────┘                 │
└───────────────────────────────────────────────┘
```

### Technology Stack

- **Backend**: Flask (Python) - existing backend server
- **Database**: MongoDB - existing database with new collections
- **Email**: SMTP (existing sender service)
- **Tracking**: UUID-based unique identifiers
- **Image**: 1x1 transparent PNG (base64 encoded)
## Components and Interfaces
|
|
||||||
|
|
||||||
### 1. Tracking ID Generator
|
|
||||||
|
|
||||||
**Purpose**: Generate unique tracking identifiers for emails and links
|
|
||||||
|
|
||||||
**Module**: `backend/services/tracking_service.py`
|
|
||||||
|
|
||||||
**Functions**:
|
|
||||||
```python
|
|
||||||
def generate_tracking_id() -> str:
|
|
||||||
"""Generate a unique tracking ID using UUID4"""
|
|
||||||
return str(uuid.uuid4())
|
|
||||||
|
|
||||||
def create_newsletter_tracking(newsletter_id: str, subscriber_email: str) -> dict:
|
|
||||||
"""Create tracking record for a newsletter send"""
|
|
||||||
# Returns tracking document with IDs for pixel and links
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Tracking Pixel Endpoint

**Purpose**: Serve 1x1 transparent PNG and log email opens

**Endpoint**: `GET /api/track/pixel/<tracking_id>`

**Flow**:
1. Receive request with tracking_id
2. Look up tracking record in database
3. Log open event (email, timestamp, user-agent)
4. Return 1x1 transparent PNG image
5. Handle multiple opens (update last_opened_at)

**Response**:
- Status: 200 OK
- Content-Type: image/png
- Body: 1x1 transparent PNG (43 bytes)
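
A minimal Flask sketch of this flow, assuming the `newsletter_sends` collection described under Data Models is exported by the existing `database` module and that a 1x1 transparent PNG ships as a static asset next to the module (both are assumptions, not existing code):

```python
from datetime import datetime
from pathlib import Path

from flask import Blueprint, Response, request

from database import newsletter_sends_collection  # assumed export

# Assumption: a 1x1 transparent PNG bundled alongside this module.
PIXEL_BYTES = (Path(__file__).parent / "pixel.png").read_bytes()

tracking_bp = Blueprint("tracking", __name__)

@tracking_bp.route("/api/track/pixel/<tracking_id>")
def track_pixel(tracking_id):
    try:
        now = datetime.utcnow()
        # Log the open; repeated opens only update last_opened_at / open_count.
        newsletter_sends_collection.update_one(
            {"tracking_id": tracking_id},
            {
                "$set": {"opened": True, "last_opened_at": now,
                         "user_agent": request.headers.get("User-Agent")},
                "$min": {"first_opened_at": now},  # earliest open wins
                "$inc": {"open_count": 1},
            },
        )
    except Exception:
        pass  # fail silently: never break email rendering
    return Response(PIXEL_BYTES, mimetype="image/png")
```
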
### 3. Link Tracking Endpoint

**Purpose**: Track link clicks and redirect to original URL

**Endpoint**: `GET /api/track/click/<tracking_id>`

**Flow**:
1. Receive request with tracking_id
2. Look up tracking record and original URL
3. Log click event (email, article_url, timestamp, user-agent)
4. Redirect to original article URL (302 redirect)
5. Handle errors gracefully (redirect to homepage if invalid)

**Response**:
- Status: 302 Found
- Location: Original article URL
- Performance: < 200ms redirect time
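
Continuing the blueprint from the pixel sketch above, the redirect flow could look like this (the `link_clicks_collection` export and homepage fallback URL are assumptions; per-link tracking records are created before sending, as described in the integration tasks below):

```python
from datetime import datetime

from flask import redirect, request

from database import link_clicks_collection  # assumed export

HOMEPAGE_URL = "http://localhost:3000"  # assumption: taken from WEBSITE_URL in config

@tracking_bp.route("/api/track/click/<tracking_id>")
def track_click(tracking_id):
    try:
        record = link_clicks_collection.find_one({"tracking_id": tracking_id})
        if not record or not record.get("article_url"):
            return redirect(HOMEPAGE_URL, code=302)  # invalid id: homepage
        # Log the click, then send the reader on their way.
        link_clicks_collection.update_one(
            {"tracking_id": tracking_id},
            {"$set": {"clicked_at": datetime.utcnow(),
                      "user_agent": request.headers.get("User-Agent")}},
        )
        return redirect(record["article_url"], code=302)
    except Exception:
        return redirect(HOMEPAGE_URL, code=302)  # never strand the reader
```
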
### 4. Newsletter Template Modifier

**Purpose**: Inject tracking pixel and replace article links

**Module**: `news_sender/tracking_integration.py`

**Functions**:
```python
def inject_tracking_pixel(html: str, tracking_id: str, api_url: str) -> str:
    """Inject tracking pixel before closing </body> tag"""
    pixel_url = f"{api_url}/api/track/pixel/{tracking_id}"
    pixel_html = f'<img src="{pixel_url}" width="1" height="1" alt="" />'
    return html.replace('</body>', f'{pixel_html}</body>')

def replace_article_links(html: str, articles: list, tracking_map: dict, api_url: str) -> str:
    """Replace article links with tracking URLs"""
    # For each article link, replace with tracking URL
```
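
One possible body for the `replace_article_links()` stub above, assuming `tracking_map` maps each article's original URL to its tracking ID (a sketch under those assumptions, not the shipped implementation):

```python
def replace_article_links(html: str, articles: list, tracking_map: dict, api_url: str) -> str:
    """Replace each article link with its tracking redirect URL."""
    for article in articles:
        original_url = article.get("link", "")
        tracking_id = tracking_map.get(original_url)
        if original_url and tracking_id:
            tracking_url = f"{api_url}/api/track/click/{tracking_id}"
            # Only rewrite href attributes so any visible URL text stays intact.
            html = html.replace(f'href="{original_url}"', f'href="{tracking_url}"')
    return html
```
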
### 5. Analytics Service

**Purpose**: Calculate engagement metrics and identify active users

**Module**: `backend/services/analytics_service.py`

**Functions**:
```python
def get_open_rate(newsletter_id: str) -> float:
    """Calculate percentage of subscribers who opened"""

def get_click_rate(article_url: str) -> float:
    """Calculate percentage of subscribers who clicked"""

def get_subscriber_activity_status(email: str) -> str:
    """Return 'active', 'inactive', or 'dormant'"""

def update_subscriber_activity_statuses():
    """Batch update all subscriber activity statuses"""
```
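
As an illustration, `get_open_rate()` can be computed directly from the `newsletter_sends` collection defined below (collection handle assumed, as in the sketches above):

```python
def get_open_rate(newsletter_id: str) -> float:
    """Percentage of sends of this newsletter that were opened."""
    sent = newsletter_sends_collection.count_documents({"newsletter_id": newsletter_id})
    if sent == 0:
        return 0.0
    opened = newsletter_sends_collection.count_documents(
        {"newsletter_id": newsletter_id, "opened": True}
    )
    return round(100.0 * opened / sent, 2)
```
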
## Data Models

### Newsletter Sends Collection (`newsletter_sends`)

Tracks each newsletter sent to each subscriber.

```javascript
{
  _id: ObjectId,
  newsletter_id: String,       // Unique ID for this newsletter batch (date-based)
  subscriber_email: String,    // Recipient email
  tracking_id: String,         // Unique tracking ID for this send (UUID)
  sent_at: DateTime,           // When email was sent
  opened: Boolean,             // Whether email was opened
  first_opened_at: DateTime,   // First open timestamp (null if not opened)
  last_opened_at: DateTime,    // Most recent open timestamp
  open_count: Number,          // Number of times opened
  created_at: DateTime         // Record creation time
}
```

**Indexes**:
- `tracking_id` (unique) - Fast lookup for pixel requests
- `newsletter_id` - Analytics queries
- `subscriber_email` - User activity queries
- `sent_at` - Time-based queries

### Link Clicks Collection (`link_clicks`)

Tracks individual link clicks.

```javascript
{
  _id: ObjectId,
  tracking_id: String,         // Unique tracking ID for this link (UUID)
  newsletter_id: String,       // Which newsletter this link was in
  subscriber_email: String,    // Who clicked
  article_url: String,         // Original article URL
  article_title: String,       // Article title for reporting
  clicked_at: DateTime,        // When link was clicked
  user_agent: String,          // Browser/client info
  created_at: DateTime         // Record creation time
}
```

**Indexes**:
- `tracking_id` (unique) - Fast lookup for redirect requests
- `newsletter_id` - Analytics queries
- `article_url` - Article performance queries
- `subscriber_email` - User activity queries

### Subscriber Activity Collection (`subscriber_activity`)

Aggregated activity status for each subscriber.

```javascript
{
  _id: ObjectId,
  email: String,                 // Subscriber email (unique)
  status: String,                // 'active', 'inactive', or 'dormant'
  last_opened_at: DateTime,      // Most recent email open
  last_clicked_at: DateTime,     // Most recent link click
  total_opens: Number,           // Lifetime open count
  total_clicks: Number,          // Lifetime click count
  newsletters_received: Number,  // Total newsletters sent
  newsletters_opened: Number,    // Total newsletters opened
  updated_at: DateTime           // Last status update
}
```

**Indexes**:
- `email` (unique) - Fast lookup
- `status` - Filter by activity level
- `last_opened_at` - Time-based queries
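
The index lists above translate directly into pymongo calls; a sketch (where `db` is a pymongo `Database` handle):

```python
def create_tracking_indexes(db):
    """Create the indexes listed above on the three tracking collections."""
    db["newsletter_sends"].create_index("tracking_id", unique=True)
    db["newsletter_sends"].create_index("newsletter_id")
    db["newsletter_sends"].create_index("subscriber_email")
    db["newsletter_sends"].create_index("sent_at")

    db["link_clicks"].create_index("tracking_id", unique=True)
    db["link_clicks"].create_index("newsletter_id")
    db["link_clicks"].create_index("article_url")
    db["link_clicks"].create_index("subscriber_email")

    db["subscriber_activity"].create_index("email", unique=True)
    db["subscriber_activity"].create_index("status")
    db["subscriber_activity"].create_index("last_opened_at")
```
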
## Error Handling

### Tracking Pixel Failures

- **Invalid tracking_id**: Return 1x1 transparent PNG anyway (don't break email rendering)
- **Database error**: Log error, return pixel (fail silently)
- **Multiple opens**: Update existing record, don't create duplicate

### Link Redirect Failures

- **Invalid tracking_id**: Redirect to website homepage
- **Database error**: Log error, redirect to homepage
- **Missing original URL**: Redirect to homepage

### Privacy Compliance

- **Data retention**: Anonymize tracking data after 90 days
  - Remove email addresses
  - Keep aggregated metrics
- **Opt-out**: Check subscriber preferences before tracking
- **GDPR deletion**: Provide endpoint to delete all tracking data for a user
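
A sketch of the 90-day anonymization pass (retention window per the configuration below; replacing the email with the fixed `'anonymized'` marker rather than deleting rows is what lets aggregate metrics survive):

```python
from datetime import datetime, timedelta

def anonymize_old_tracking_data(db, retention_days: int = 90) -> int:
    """Strip email addresses from tracking records older than the retention window."""
    cutoff = datetime.utcnow() - timedelta(days=retention_days)
    anonymized = 0
    for name, date_field in [("newsletter_sends", "sent_at"), ("link_clicks", "clicked_at")]:
        result = db[name].update_many(
            {date_field: {"$lt": cutoff}, "subscriber_email": {"$ne": "anonymized"}},
            {"$set": {"subscriber_email": "anonymized"}},
        )
        anonymized += result.modified_count
    return anonymized
```
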
## Testing Strategy

### Unit Tests

1. **Tracking ID Generation**
   - Test UUID format
   - Test uniqueness

2. **Pixel Endpoint**
   - Test valid tracking_id returns PNG
   - Test invalid tracking_id returns PNG
   - Test database logging

3. **Link Redirect**
   - Test valid tracking_id redirects correctly
   - Test invalid tracking_id redirects to homepage
   - Test click logging

4. **Analytics Calculations**
   - Test open rate calculation
   - Test click rate calculation
   - Test activity status classification

### Integration Tests

1. **End-to-End Newsletter Flow**
   - Send newsletter with tracking
   - Simulate email open (pixel request)
   - Simulate link click
   - Verify database records

2. **Privacy Compliance**
   - Test data anonymization
   - Test user data deletion
   - Test opt-out handling

### Performance Tests

1. **Redirect Speed**
   - Measure redirect time (target: < 200ms)
   - Test under load (100 concurrent requests)

2. **Pixel Serving**
   - Test pixel response time
   - Test caching headers

## API Endpoints

### Tracking Endpoints

```
GET /api/track/pixel/<tracking_id>
- Returns: 1x1 transparent PNG
- Logs: Email open event

GET /api/track/click/<tracking_id>
- Returns: 302 redirect to article URL
- Logs: Link click event
```

### Analytics Endpoints

```
GET /api/analytics/newsletter/<newsletter_id>
- Returns: Open rate, click rate, engagement metrics

GET /api/analytics/article/<article_id>
- Returns: Click count, click rate for specific article

GET /api/analytics/subscriber/<email>
- Returns: Activity status, engagement history

POST /api/analytics/update-activity
- Triggers: Batch update of subscriber activity statuses
- Returns: Update count
```
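
For example, pulling newsletter metrics from a local backend could look like this (the port follows the README examples; the exact JSON shape is assumed from the endpoint list above):

```python
import requests

resp = requests.get("http://localhost:5001/api/analytics/newsletter/2024-06-01")
resp.raise_for_status()
metrics = resp.json()
print(metrics)  # e.g. open rate, click rate, engagement metrics
```
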
### Privacy Endpoints

```
DELETE /api/tracking/subscriber/<email>
- Deletes: All tracking data for subscriber
- Returns: Deletion confirmation

POST /api/tracking/anonymize
- Triggers: Anonymize tracking data older than 90 days
- Returns: Anonymization count
```

## Implementation Phases

### Phase 1: Core Tracking (MVP)
- Tracking ID generation
- Pixel endpoint
- Link redirect endpoint
- Database collections
- Newsletter template integration

### Phase 2: Analytics
- Open rate calculation
- Click rate calculation
- Activity status classification
- Analytics API endpoints

### Phase 3: Privacy & Compliance
- Data anonymization
- User data deletion
- Opt-out handling
- Privacy notices

### Phase 4: Optimization
- Caching for pixel endpoint
- Performance monitoring
- Batch processing for activity updates

## Security Considerations

1. **Rate Limiting**: Prevent abuse of tracking endpoints
2. **Input Validation**: Validate all tracking_ids (UUID format)
3. **Injection Safety**: Use parameterized queries (MongoDB is safe by default)
4. **Privacy**: Don't expose subscriber emails in URLs
5. **HTTPS**: Ensure all tracking URLs use HTTPS in production
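
The UUID check in item 2 is a one-liner with the standard library; a sketch (reject anything that does not parse as a UUID before touching the database):

```python
import uuid

def is_valid_tracking_id(tracking_id: str) -> bool:
    """True if the string parses as a UUID (any version)."""
    try:
        uuid.UUID(tracking_id)
        return True
    except (ValueError, TypeError, AttributeError):
        return False
```
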
## Configuration

Add to `backend/.env`:

```env
# Tracking Configuration
TRACKING_ENABLED=true
TRACKING_API_URL=http://localhost:5000
TRACKING_DATA_RETENTION_DAYS=90
```

## Monitoring and Metrics

### Key Metrics to Track

1. **Email Opens**
   - Overall open rate
   - Open rate by newsletter
   - Time to first open

2. **Link Clicks**
   - Overall click rate
   - Click rate by article
   - Click-through rate (CTR)

3. **Subscriber Engagement**
   - Active subscriber count
   - Inactive subscriber count
   - Dormant subscriber count

4. **System Performance**
   - Pixel response time
   - Redirect response time
   - Database query performance
@@ -1,77 +0,0 @@
# Requirements Document

## Introduction

This document outlines the requirements for implementing email tracking functionality in the Munich News Daily newsletter system. The system will track email opens and link clicks to measure subscriber engagement and identify active users.

## Glossary

- **Newsletter System**: The Munich News Daily email sending service
- **Tracking Pixel**: A 1x1 transparent image embedded in emails to detect opens
- **Tracking Link**: A redirecting URL that logs clicks before sending users to the actual destination
- **Subscriber**: A user who receives the newsletter
- **Email Open**: When a subscriber's email client loads the tracking pixel
- **Link Click**: When a subscriber clicks a tracked link in the newsletter
- **Engagement Metrics**: Data about subscriber interactions with newsletters

## Requirements

### Requirement 1: Track Email Opens

**User Story:** As a newsletter administrator, I want to track when subscribers open emails, so that I can measure engagement and identify active users.

#### Acceptance Criteria

1. WHEN a newsletter is sent, THE Newsletter System SHALL embed a unique tracking pixel in each email
2. WHEN a subscriber opens the email, THE Newsletter System SHALL record the open event with timestamp
3. THE Newsletter System SHALL store the subscriber email, newsletter ID, and open timestamp in the database
4. THE Newsletter System SHALL serve the tracking pixel as a 1x1 transparent PNG image
5. THE Newsletter System SHALL handle multiple opens from the same subscriber for the same newsletter

### Requirement 2: Track Link Clicks

**User Story:** As a newsletter administrator, I want to track when subscribers click on article links, so that I can understand which content is most engaging.

#### Acceptance Criteria

1. WHEN a newsletter is generated, THE Newsletter System SHALL replace all article links with unique tracking URLs
2. WHEN a subscriber clicks a tracking URL, THE Newsletter System SHALL record the click event with timestamp
3. WHEN a click is recorded, THE Newsletter System SHALL redirect the subscriber to the original article URL
4. THE Newsletter System SHALL store the subscriber email, article link, and click timestamp in the database
5. THE Newsletter System SHALL complete the redirect within 200 milliseconds

### Requirement 3: Generate Engagement Reports

**User Story:** As a newsletter administrator, I want to view engagement metrics, so that I can understand subscriber behavior and content performance.

#### Acceptance Criteria

1. THE Newsletter System SHALL provide an API endpoint to retrieve open rates by newsletter
2. THE Newsletter System SHALL provide an API endpoint to retrieve click rates by article
3. THE Newsletter System SHALL calculate the percentage of subscribers who opened each newsletter
4. THE Newsletter System SHALL calculate the percentage of subscribers who clicked each article link
5. THE Newsletter System SHALL identify subscribers who have not opened emails in the last 30 days

### Requirement 4: Privacy and Compliance

**User Story:** As a newsletter administrator, I want to respect subscriber privacy, so that the system complies with privacy regulations.

#### Acceptance Criteria

1. THE Newsletter System SHALL include a privacy notice in the newsletter footer explaining tracking
2. THE Newsletter System SHALL anonymize tracking data after 90 days by removing email addresses
3. THE Newsletter System SHALL provide an API endpoint to delete all tracking data for a specific subscriber
4. THE Newsletter System SHALL not track subscribers who have opted out of tracking
5. WHERE a subscriber unsubscribes, THE Newsletter System SHALL delete all their tracking data

### Requirement 5: Identify Active Users

**User Story:** As a newsletter administrator, I want to identify active subscribers, so that I can segment my audience and improve targeting.

#### Acceptance Criteria

1. THE Newsletter System SHALL classify a subscriber as "active" if they opened an email in the last 30 days
2. THE Newsletter System SHALL classify a subscriber as "inactive" if they have not opened an email in 30-60 days
3. THE Newsletter System SHALL classify a subscriber as "dormant" if they have not opened an email in over 60 days
4. THE Newsletter System SHALL provide an API endpoint to retrieve subscriber activity status
5. THE Newsletter System SHALL update subscriber activity status daily
@@ -1,170 +0,0 @@
# Implementation Plan

## Phase 1: Core Tracking Infrastructure

- [x] 1. Set up database collections and indexes
  - Create MongoDB collections: `newsletter_sends`, `link_clicks`, `subscriber_activity`
  - Add indexes for performance: `tracking_id` (unique), `newsletter_id`, `subscriber_email`, `sent_at`
  - Write database initialization script
  - _Requirements: 1.3, 2.4_

- [x] 2. Implement tracking service
- [x] 2.1 Create tracking ID generator
  - Write `generate_tracking_id()` function using UUID4
  - Write `create_newsletter_tracking()` function to create tracking records
  - Add configuration for tracking enable/disable
  - _Requirements: 1.1, 2.1_

- [x] 2.2 Implement tracking pixel endpoint
  - Create Flask route `GET /api/track/pixel/<tracking_id>`
  - Generate 1x1 transparent PNG (base64 encoded)
  - Log email open event to `newsletter_sends` collection
  - Handle multiple opens (update `last_opened_at` and `open_count`)
  - Return PNG with proper headers (Content-Type: image/png)
  - _Requirements: 1.2, 1.3, 1.4, 1.5_

- [x] 2.3 Implement link redirect endpoint
  - Create Flask route `GET /api/track/click/<tracking_id>`
  - Look up original article URL from tracking record
  - Log click event to `link_clicks` collection
  - Redirect to original URL with 302 status
  - Handle invalid tracking_id (redirect to homepage)
  - Ensure redirect completes within 200ms
  - _Requirements: 2.2, 2.3, 2.4, 2.5_

- [x] 2.4 Write unit tests for tracking endpoints
  - Test pixel endpoint returns PNG for valid tracking_id
  - Test pixel endpoint returns PNG for invalid tracking_id (fail silently)
  - Test link redirect works correctly
  - Test link redirect handles invalid tracking_id
  - Test database logging for opens and clicks
  - _Requirements: 1.2, 1.4, 2.2, 2.3_

## Phase 2: Newsletter Integration

- [x] 3. Integrate tracking into sender service
- [x] 3.1 Create tracking integration module
  - Write `inject_tracking_pixel()` function to add pixel to HTML
  - Write `replace_article_links()` function to replace links with tracking URLs
  - Write `generate_tracking_urls()` function to create tracking records for all links
  - Add tracking configuration to sender service
  - _Requirements: 1.1, 2.1_

- [x] 3.2 Modify newsletter sending flow
  - Update `send_newsletter()` to generate tracking IDs for each subscriber
  - Create tracking records in database before sending
  - Inject tracking pixel into newsletter HTML
  - Replace article links with tracking URLs
  - Store newsletter_id and tracking metadata
  - _Requirements: 1.1, 1.3, 2.1, 2.4_

- [x] 3.3 Update newsletter template
  - Ensure template supports dynamic tracking pixel injection
  - Ensure article links are properly structured for replacement
  - Add privacy notice to footer about tracking
  - _Requirements: 4.1_

- [x] 3.4 Test newsletter with tracking
  - Send test newsletter with tracking enabled
  - Verify tracking pixel is embedded correctly
  - Verify article links are replaced with tracking URLs
  - Test email open tracking works
  - Test link click tracking works
  - _Requirements: 1.1, 1.2, 2.1, 2.2_

## Phase 3: Analytics and Reporting

- [x] 4. Implement analytics service
- [x] 4.1 Create analytics calculation functions
  - Write `get_open_rate(newsletter_id)` function
  - Write `get_click_rate(article_url)` function
  - Write `get_newsletter_metrics(newsletter_id)` function for overall stats
  - Write `get_article_performance(article_url)` function
  - _Requirements: 3.1, 3.2, 3.3, 3.4_

- [x] 4.2 Implement subscriber activity classification (see the sketch after this list)
  - Write `get_subscriber_activity_status(email)` function
  - Classify as 'active' (opened in last 30 days)
  - Classify as 'inactive' (no opens in 30-60 days)
  - Classify as 'dormant' (no opens in 60+ days)
  - Write `update_subscriber_activity_statuses()` batch function
  - _Requirements: 5.1, 5.2, 5.3, 5.4, 5.5_
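
A sketch of the classification rule in 4.2, assuming the most recent open timestamp is read from the subscriber's activity record (thresholds from requirements 5.1-5.3):

```python
from datetime import datetime
from typing import Optional

def classify_activity(last_opened_at: Optional[datetime]) -> str:
    """Map the most recent open time to 'active' / 'inactive' / 'dormant'."""
    if last_opened_at is None:
        return "dormant"  # never opened
    days_since_open = (datetime.utcnow() - last_opened_at).days
    if days_since_open <= 30:
        return "active"
    if days_since_open <= 60:
        return "inactive"
    return "dormant"
```
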
- [x] 4.3 Create analytics API endpoints
  - Create `GET /api/analytics/newsletter/<newsletter_id>` endpoint
  - Create `GET /api/analytics/article/<article_id>` endpoint
  - Create `GET /api/analytics/subscriber/<email>` endpoint
  - Create `POST /api/analytics/update-activity` endpoint
  - Return JSON with engagement metrics
  - _Requirements: 3.1, 3.2, 3.4, 5.4_

- [x] 4.4 Write unit tests for analytics
  - Test open rate calculation
  - Test click rate calculation
  - Test activity status classification
  - Test edge cases (no opens, no clicks)
  - _Requirements: 3.3, 3.4, 5.1, 5.2, 5.3_

## Phase 4: Privacy and Compliance

- [x] 5. Implement privacy features
- [x] 5.1 Create data anonymization function
  - Write function to anonymize tracking data older than 90 days
  - Remove email addresses from old records
  - Keep aggregated metrics
  - Create scheduled task to run daily
  - _Requirements: 4.2_

- [x] 5.2 Implement user data deletion
  - Create `DELETE /api/tracking/subscriber/<email>` endpoint
  - Delete all tracking records for subscriber
  - Delete from `newsletter_sends`, `link_clicks`, `subscriber_activity`
  - Return confirmation response
  - _Requirements: 4.3, 4.5_

- [x] 5.3 Add tracking opt-out support
  - Add `tracking_enabled` field to subscribers collection
  - Check opt-out status before creating tracking records
  - Skip tracking for opted-out subscribers
  - Update newsletter sending to respect opt-out
  - _Requirements: 4.4_

- [x] 5.4 Create anonymization endpoint
  - Create `POST /api/tracking/anonymize` endpoint
  - Trigger anonymization of old data
  - Return count of anonymized records
  - Add authentication/authorization
  - _Requirements: 4.2_

- [x] 5.5 Write privacy compliance tests
  - Test data anonymization works correctly
  - Test user data deletion removes all records
  - Test opt-out prevents tracking
  - Test anonymization preserves aggregated metrics
  - _Requirements: 4.2, 4.3, 4.4, 4.5_

## Phase 5: Configuration and Documentation

- [x] 6. Add configuration and environment setup
  - Add `TRACKING_ENABLED` to environment variables
  - Add `TRACKING_API_URL` configuration
  - Add `TRACKING_DATA_RETENTION_DAYS` configuration
  - Update `.env.template` with tracking variables
  - Update configuration documentation
  - _Requirements: All_

- [x] 7. Update database schema documentation
  - Document `newsletter_sends` collection schema
  - Document `link_clicks` collection schema
  - Document `subscriber_activity` collection schema
  - Add indexes documentation
  - Update DATABASE_SCHEMA.md
  - _Requirements: 1.3, 2.4, 5.4_

- [x] 8. Create tracking usage documentation
  - Document how to enable/disable tracking
  - Document analytics API endpoints
  - Document privacy features
  - Add examples of querying tracking data
  - Create README for tracking system
  - _Requirements: All_
@@ -7,6 +7,7 @@ A fully automated news aggregation and newsletter system that crawls Munich news
- **🤖 AI-Powered Clustering** - Automatically detects duplicate stories from different sources
- **📰 Neutral Summaries** - Combines multiple perspectives into balanced coverage
- **🎯 Smart Prioritization** - Shows most important stories first (multi-source coverage)
- **🎨 Personalized Newsletters** - AI-powered content recommendations based on user interests
- **📊 Engagement Tracking** - Open rates, click tracking, and analytics
- **⚡ GPU Acceleration** - 5-10x faster AI processing with GPU support
- **🔒 GDPR Compliant** - Privacy-first with data retention controls
@@ -365,6 +366,8 @@ curl -X POST http://localhost:5001/api/tracking/subscriber/user@example.com/opt-

### Core Features
- **[docs/AI_NEWS_AGGREGATION.md](docs/AI_NEWS_AGGREGATION.md)** - AI-powered clustering & neutral summaries
- **[docs/PERSONALIZATION.md](docs/PERSONALIZATION.md)** - Personalized newsletter system
- **[docs/PERSONALIZATION_COMPLETE.md](docs/PERSONALIZATION_COMPLETE.md)** - Personalization implementation guide
- **[docs/FEATURES.md](docs/FEATURES.md)** - Complete feature list
- **[docs/API.md](docs/API.md)** - API endpoints reference
@@ -399,6 +402,9 @@ docker-compose exec sender python tests/sender/test_tracking_integration.py

# Run backend tests
docker-compose exec backend python tests/backend/test_tracking.py

# Test personalization system (all 4 phases)
docker exec munich-news-local-backend python test_personalization_system.py
```

## 🚀 Production Deployment
30
backend/.env.local
Normal file
@@ -0,0 +1,30 @@
# Munich News Daily - Local Development Backend Configuration

# MongoDB Configuration
MONGODB_URI=mongodb://admin:changeme@mongodb:27017/

# Email Configuration (use test credentials or disable)
SMTP_SERVER=localhost
SMTP_PORT=587
EMAIL_USER=test@localhost
EMAIL_PASSWORD=test123

# Newsletter Settings
NEWSLETTER_MAX_ARTICLES=5
NEWSLETTER_HOURS_LOOKBACK=24
WEBSITE_URL=http://localhost:3000

# Tracking Configuration
TRACKING_ENABLED=true
TRACKING_API_URL=http://localhost:5001
TRACKING_DATA_RETENTION_DAYS=90

# Ollama Configuration (AI Summarization)
OLLAMA_ENABLED=true
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_MODEL=phi3:latest
OLLAMA_TIMEOUT=120
SUMMARY_MAX_WORDS=150

# Flask Server Configuration
FLASK_PORT=5001
77
backend/add_categories_to_feeds.py
Normal file
@@ -0,0 +1,77 @@
#!/usr/bin/env python
"""
Script to add categories to existing RSS feeds
"""
from pymongo import MongoClient
from config import Config

client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
rss_feeds_collection = db['rss_feeds']

print("\n" + "="*70)
print("📡 Adding Categories to RSS Feeds")
print("="*70)

# Get all feeds
all_feeds = list(rss_feeds_collection.find())
print(f"\nFound {len(all_feeds)} RSS feeds")

# Category mapping based on feed names/URLs
category_mapping = {
    'münchen': 'local',
    'munich': 'local',
    'lokales': 'local',
    'sport': 'sports',
    'fussball': 'sports',
    'fc bayern': 'sports',
    'wissenschaft': 'science',
    'science': 'science',
    'tech': 'science',
    'technologie': 'science',
}

updated = 0
for feed in all_feeds:
    name = feed.get('name', '').lower()
    url = feed.get('url', '').lower()
    current_category = feed.get('category')

    # Skip if already has a category
    if current_category:
        print(f"  ✓ {feed['name']}: Already has category '{current_category}'")
        continue

    # Try to determine category from name or URL
    detected_category = 'general'  # Default
    for keyword, category in category_mapping.items():
        if keyword in name or keyword in url:
            detected_category = category
            break

    # Update the feed
    rss_feeds_collection.update_one(
        {'_id': feed['_id']},
        {'$set': {'category': detected_category}}
    )

    print(f"  ✓ {feed['name']}: Set category to '{detected_category}'")
    updated += 1

print("\n" + "="*70)
print("📊 Summary")
print("="*70)
print(f"✓ Updated: {updated} feeds")
print(f"✓ Already had categories: {len(all_feeds) - updated} feeds")
print("="*70 + "\n")

# Show final category distribution
print("Category distribution:")
categories = rss_feeds_collection.aggregate([
    {'$group': {'_id': '$category', 'count': {'$sum': 1}}}
])
for cat in categories:
    print(f"  {cat['_id']}: {cat['count']} feeds")

print("\n✅ Done! Now run the migration script to update subscriber categories.")
@@ -11,6 +11,8 @@ from routes.tracking_routes import tracking_bp
from routes.analytics_routes import analytics_bp
from routes.admin_routes import admin_bp
from routes.transport_routes import transport_bp
from routes.interests_routes import interests_bp
from routes.personalization_routes import personalization_bp

# Initialize Flask app
app = Flask(__name__)
@@ -29,6 +31,8 @@ app.register_blueprint(tracking_bp)
app.register_blueprint(analytics_bp)
app.register_blueprint(admin_bp)
app.register_blueprint(transport_bp)
app.register_blueprint(interests_bp)
app.register_blueprint(personalization_bp)

# Health check endpoint
@app.route('/health')
60
backend/check_categories.py
Normal file
@@ -0,0 +1,60 @@
#!/usr/bin/env python
"""
Check what categories exist in RSS feeds and articles
"""
from pymongo import MongoClient
from config import Config

client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]

print("\n" + "="*70)
print("📊 Category Analysis")
print("="*70)

# Check RSS feed categories
print("\n🔹 RSS Feed Categories:")
rss_feeds_collection = db['rss_feeds']
feed_categories = rss_feeds_collection.distinct('category')
print(f"  Unique categories: {feed_categories}")
print(f"  Total: {len(feed_categories)} categories")

# Count feeds per category
print("\n  Feeds per category:")
for cat in feed_categories:
    count = rss_feeds_collection.count_documents({'category': cat})
    feeds = list(rss_feeds_collection.find({'category': cat}, {'name': 1, '_id': 0}))
    print(f"    {cat}: {count} feeds")
    for feed in feeds:
        print(f"      - {feed['name']}")

# Check article categories
print("\n🔹 Article Categories:")
articles_collection = db['articles']
article_categories = articles_collection.distinct('category')
print(f"  Unique categories: {article_categories}")
print(f"  Total: {len(article_categories)} categories")

# Count articles per category
print("\n  Articles per category:")
for cat in article_categories:
    count = articles_collection.count_documents({'category': cat})
    with_summary = articles_collection.count_documents({'category': cat, 'summary': {'$exists': True, '$ne': None}})
    print(f"    {cat}: {count} articles ({with_summary} with summaries)")

# Check subscriber categories
print("\n🔹 Subscriber Categories:")
subscribers_collection = db['subscribers']
total_subs = subscribers_collection.count_documents({'status': 'active'})
print(f"  Total active subscribers: {total_subs}")

# Sample a few subscribers to see their categories
sample_subs = list(subscribers_collection.find({'status': 'active'}, {'email': 1, 'categories': 1, '_id': 0}).limit(5))
print("\n  Sample subscriber preferences:")
for sub in sample_subs:
    cats = sub.get('categories', 'None')
    print(f"    {sub['email']}: {cats}")

print("\n" + "="*70)
print("✅ Analysis Complete")
print("="*70 + "\n")
79
backend/migrate_subscriber_categories.py
Normal file
@@ -0,0 +1,79 @@
#!/usr/bin/env python
"""
Migration script to update subscriber categories
Ensures all subscribers have all available categories if they're missing some
"""
from pymongo import MongoClient
from config import Config
from database import subscribers_collection, rss_feeds_collection

def migrate_subscriber_categories():
    """Update all subscribers to have all available categories"""

    print("\n" + "="*70)
    print("📧 Subscriber Categories Migration")
    print("="*70)

    # Get all available categories from RSS feeds
    available_categories = list(rss_feeds_collection.distinct('category'))
    available_categories.sort()

    print(f"\n✓ Available categories: {available_categories}")

    # Get all subscribers
    all_subscribers = list(subscribers_collection.find({}))
    print(f"✓ Found {len(all_subscribers)} total subscribers")

    updated_count = 0
    no_change_count = 0

    for subscriber in all_subscribers:
        email = subscriber['email']
        current_categories = subscriber.get('categories', [])

        # Check if subscriber is missing any categories
        if not current_categories:
            # No categories set - give them all
            print(f"\n  {email}: No categories → Adding all {available_categories}")
            subscribers_collection.update_one(
                {'email': email},
                {'$set': {'categories': available_categories}}
            )
            updated_count += 1
        elif set(current_categories) != set(available_categories):
            # Has some categories but not all
            missing = set(available_categories) - set(current_categories)
            print(f"\n  {email}: {current_categories} → Adding all {available_categories}")
            print(f"    Missing: {list(missing)}")
            subscribers_collection.update_one(
                {'email': email},
                {'$set': {'categories': available_categories}}
            )
            updated_count += 1
        else:
            # Already has all categories
            no_change_count += 1

    print("\n" + "="*70)
    print("📊 Migration Complete")
    print("="*70)
    print(f"✓ Updated: {updated_count} subscribers")
    print(f"✓ No change needed: {no_change_count} subscribers")
    print(f"✓ Total: {len(all_subscribers)} subscribers")
    print("="*70 + "\n")

    return {
        'total': len(all_subscribers),
        'updated': updated_count,
        'no_change': no_change_count
    }


if __name__ == '__main__':
    try:
        result = migrate_subscriber_categories()
        print(f"✅ Migration successful!")
    except Exception as e:
        print(f"\n❌ Migration failed: {e}")
        import traceback
        traceback.print_exc()
239
backend/routes/interests_routes.py
Normal file
@@ -0,0 +1,239 @@
"""
User Interest Profile API routes for Munich News Daily.
Provides endpoints to view and manage user interest profiles.
"""

from flask import Blueprint, request, jsonify
from services.interest_profiling_service import (
    get_user_interests,
    get_top_interests,
    build_interests_from_history,
    decay_user_interests,
    get_interest_statistics,
    delete_user_interests
)

interests_bp = Blueprint('interests', __name__)


@interests_bp.route('/api/interests/<email>', methods=['GET'])
def get_interests(email):
    """
    Get user interest profile.

    Args:
        email: Email address of the user

    Returns:
        JSON response with user interest profile
    """
    try:
        profile = get_user_interests(email)

        if not profile:
            return jsonify({
                'success': False,
                'error': 'User profile not found'
            }), 404

        # Remove MongoDB _id field
        if '_id' in profile:
            del profile['_id']

        return jsonify({
            'success': True,
            'profile': profile
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


@interests_bp.route('/api/interests/<email>/top', methods=['GET'])
def get_top_user_interests(email):
    """
    Get user's top interests sorted by score.

    Query parameters:
        top_n: Number of top interests to return (default: 10)

    Args:
        email: Email address of the user

    Returns:
        JSON response with top categories and keywords
    """
    try:
        top_n = request.args.get('top_n', 10, type=int)

        top_interests = get_top_interests(email, top_n)

        return jsonify({
            'success': True,
            'email': email,
            'top_categories': [
                {'category': cat, 'score': score}
                for cat, score in top_interests['top_categories']
            ],
            'top_keywords': [
                {'keyword': kw, 'score': score}
                for kw, score in top_interests['top_keywords']
            ]
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


@interests_bp.route('/api/interests/<email>/rebuild', methods=['POST'])
def rebuild_interests(email):
    """
    Rebuild user interest profile from click history.

    Request body (optional):
        {
            "days_lookback": 30  // Number of days of history to analyze
        }

    Args:
        email: Email address of the user

    Returns:
        JSON response with rebuilt profile
    """
    try:
        data = request.get_json() or {}
        days_lookback = data.get('days_lookback', 30)

        # Validate days_lookback
        if not isinstance(days_lookback, int) or days_lookback < 1:
            return jsonify({
                'success': False,
                'error': 'days_lookback must be a positive integer'
            }), 400

        profile = build_interests_from_history(email, days_lookback)

        # Remove MongoDB _id field
        if '_id' in profile:
            del profile['_id']

        return jsonify({
            'success': True,
            'message': f'Profile rebuilt from {days_lookback} days of history',
            'profile': profile
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


@interests_bp.route('/api/interests/decay', methods=['POST'])
def decay_interests():
    """
    Decay interest scores for inactive users.

    Request body (optional):
        {
            "decay_factor": 0.95,  // Multiplier for scores (default: 0.95)
            "days_threshold": 7    // Only decay profiles older than N days
        }

    Returns:
        JSON response with decay statistics
    """
    try:
        data = request.get_json() or {}
        decay_factor = data.get('decay_factor', 0.95)
        days_threshold = data.get('days_threshold', 7)

        # Validate parameters
        if not isinstance(decay_factor, (int, float)) or decay_factor <= 0 or decay_factor > 1:
            return jsonify({
                'success': False,
                'error': 'decay_factor must be between 0 and 1'
            }), 400

        if not isinstance(days_threshold, int) or days_threshold < 1:
            return jsonify({
                'success': False,
                'error': 'days_threshold must be a positive integer'
            }), 400

        result = decay_user_interests(decay_factor, days_threshold)

        return jsonify({
            'success': True,
            'message': f'Decayed interests for profiles older than {days_threshold} days',
            'statistics': result
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


@interests_bp.route('/api/interests/statistics', methods=['GET'])
def get_statistics():
    """
    Get statistics about user interests across all users.

    Returns:
        JSON response with interest statistics
    """
    try:
        stats = get_interest_statistics()

        return jsonify({
            'success': True,
            'statistics': stats
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


@interests_bp.route('/api/interests/<email>', methods=['DELETE'])
def delete_interests(email):
    """
    Delete user interest profile (GDPR compliance).

    Args:
        email: Email address of the user

    Returns:
        JSON response with confirmation
    """
    try:
        deleted = delete_user_interests(email)

        if not deleted:
            return jsonify({
                'success': False,
                'error': 'User profile not found'
            }), 404

        return jsonify({
            'success': True,
            'message': f'Interest profile deleted for {email}'
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
135
backend/routes/personalization_routes.py
Normal file
@@ -0,0 +1,135 @@
"""
Personalization API routes for Munich News Daily.
Provides endpoints to test and preview personalized content.
"""

from flask import Blueprint, request, jsonify
from datetime import datetime, timedelta
from database import articles_collection
from services.personalization_service import (
    rank_articles_for_user,
    select_personalized_articles,
    get_personalization_explanation,
    get_personalization_stats
)

personalization_bp = Blueprint('personalization', __name__)


@personalization_bp.route('/api/personalize/preview/<email>', methods=['GET'])
def preview_personalized_newsletter(email):
    """
    Preview personalized newsletter for a user.

    Query parameters:
        max_articles: Maximum articles to return (default: 10)
        hours_lookback: Hours of articles to consider (default: 24)

    Returns:
        JSON with personalized article selection and statistics
    """
    try:
        max_articles = request.args.get('max_articles', 10, type=int)
        hours_lookback = request.args.get('hours_lookback', 24, type=int)

        # Get recent articles
        cutoff_date = datetime.utcnow() - timedelta(hours=hours_lookback)
        articles = list(articles_collection.find({
            'created_at': {'$gte': cutoff_date},
            'summary': {'$exists': True, '$ne': None}
        }).sort('created_at', -1))

        # Select personalized articles
        personalized = select_personalized_articles(
            articles,
            email,
            max_articles=max_articles
        )

        # Get statistics
        stats = get_personalization_stats(personalized, email)

        # Format response
        articles_response = []
        for article in personalized:
            articles_response.append({
                'title': article.get('title', ''),
                'title_en': article.get('title_en'),
                'summary': article.get('summary', ''),
                'link': article.get('link', ''),
                'category': article.get('category', 'general'),
                'keywords': article.get('keywords', []),
                'personalization_score': article.get('personalization_score', 0.0),
                'published_at': article.get('published_at', '')
            })

        return jsonify({
            'success': True,
            'email': email,
            'articles': articles_response,
            'statistics': stats
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


@personalization_bp.route('/api/personalize/explain', methods=['POST'])
def explain_recommendation():
    """
    Explain why an article was recommended to a user.

    Request body:
        {
            "email": "user@example.com",
            "article_id": "article-id-here"
        }

    Returns:
        JSON with explanation of recommendation
    """
    try:
        data = request.get_json()

        if not data or 'email' not in data or 'article_id' not in data:
            return jsonify({
                'success': False,
                'error': 'email and article_id required'
            }), 400

        email = data['email']
        article_id = data['article_id']

        # Get article
        from bson import ObjectId
        article = articles_collection.find_one({'_id': ObjectId(article_id)})

        if not article:
            return jsonify({
                'success': False,
                'error': 'Article not found'
            }), 404

        # Get user interests
        from services.interest_profiling_service import get_user_interests
        user_interests = get_user_interests(email)

        # Generate explanation
        explanation = get_personalization_explanation(article, user_interests)

        return jsonify({
            'success': True,
            'email': email,
            'article_title': article.get('title', ''),
            'explanation': explanation
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
@@ -79,8 +79,8 @@ def track_click(tracking_id):
    """
    Track link clicks and redirect to original article URL.

    Logs the click event, updates user interest profile, and redirects the user
    to the original article URL. Handles invalid tracking_id by redirecting to homepage.
    Ensures redirect completes within 200ms.

    Args:
@@ -115,6 +115,19 @@ def track_click(tracking_id):
                }
            }
        )

        # Update user interest profile (Phase 3)
        subscriber_email = tracking_record.get('subscriber_email')
        keywords = tracking_record.get('keywords', [])
        category = tracking_record.get('category', 'general')

        if subscriber_email and subscriber_email != 'anonymized':
            try:
                from services.interest_profiling_service import update_user_interests
                update_user_interests(subscriber_email, keywords, category)
            except Exception as e:
                # Don't fail the redirect if interest update fails
                print(f"Error updating user interests: {str(e)}")
    except Exception as e:
        # Log error but still redirect
        print(f"Error tracking click for {tracking_id}: {str(e)}")
323
backend/services/interest_profiling_service.py
Normal file
@@ -0,0 +1,323 @@
"""
User Interest Profiling Service for Munich News Daily.
Builds and maintains user interest profiles based on article click behavior.
"""

from datetime import datetime, timedelta
from typing import Dict, List, Optional
from database import link_clicks_collection
from pymongo import MongoClient
from config import Config

# Connect to MongoDB
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
user_interests_collection = db['user_interests']


def update_user_interests(subscriber_email: str, keywords: List[str], category: str) -> Dict:
    """
    Update user interest profile based on a clicked article.

    Increments interest scores for the article's keywords and category.
    Creates a new profile if the user doesn't have one yet.

    Args:
        subscriber_email: Email address of the user
        keywords: List of keywords from the clicked article
        category: Category of the clicked article

    Returns:
        dict: Updated user interest profile
    """
    current_time = datetime.utcnow()

    # Get existing profile or create new one
    profile = user_interests_collection.find_one({'email': subscriber_email})

    if not profile:
        # Create new profile
        profile = {
            'email': subscriber_email,
            'categories': {},
            'keywords': {},
            'total_clicks': 0,
            'last_updated': current_time,
            'created_at': current_time
        }

    # Update category interest (increment by 0.1, max 1.0)
    current_category_score = profile['categories'].get(category, 0.0)
    profile['categories'][category] = min(current_category_score + 0.1, 1.0)

    # Update keyword interests (increment by 0.1, max 1.0)
    for keyword in keywords:
        if keyword:  # Skip empty keywords
            current_keyword_score = profile['keywords'].get(keyword, 0.0)
            profile['keywords'][keyword] = min(current_keyword_score + 0.1, 1.0)

    # Update metadata
    profile['total_clicks'] = profile.get('total_clicks', 0) + 1
    profile['last_updated'] = current_time

    # Upsert profile
    user_interests_collection.update_one(
        {'email': subscriber_email},
        {'$set': profile},
        upsert=True
    )

    return profile


def get_user_interests(subscriber_email: str) -> Optional[Dict]:
    """
    Get user interest profile.

    Args:
        subscriber_email: Email address of the user

    Returns:
        dict: User interest profile or None if not found
    """
    return user_interests_collection.find_one({'email': subscriber_email})


def decay_user_interests(decay_factor: float = 0.95, days_threshold: int = 7) -> Dict[str, int]:
    """
    Decay interest scores for users who haven't clicked recently.

    Reduces interest scores over time to reflect changing interests.
    Only decays profiles that haven't been updated in the last N days.

    Args:
        decay_factor: Multiplier for interest scores (default: 0.95 = 5% decay)
        days_threshold: Only decay profiles older than this many days (default: 7)

    Returns:
        dict: Statistics about the decay operation
            - profiles_decayed: Number of profiles that were decayed
            - profiles_checked: Total number of profiles checked
    """
    cutoff_date = datetime.utcnow() - timedelta(days=days_threshold)

    # Find profiles that haven't been updated recently
    old_profiles = user_interests_collection.find({
        'last_updated': {'$lt': cutoff_date}
    })

    profiles_decayed = 0
    profiles_checked = 0

    for profile in old_profiles:
        profiles_checked += 1

        # Decay category scores
        decayed_categories = {}
        for category, score in profile.get('categories', {}).items():
            new_score = score * decay_factor
            # Remove categories with very low scores (< 0.05)
            if new_score >= 0.05:
                decayed_categories[category] = round(new_score, 3)

        # Decay keyword scores
        decayed_keywords = {}
        for keyword, score in profile.get('keywords', {}).items():
            new_score = score * decay_factor
            # Remove keywords with very low scores (< 0.05)
            if new_score >= 0.05:
                decayed_keywords[keyword] = round(new_score, 3)

        # Update profile with decayed scores
        user_interests_collection.update_one(
            {'email': profile['email']},
            {
                '$set': {
                    'categories': decayed_categories,
                    'keywords': decayed_keywords,
                    'last_decayed': datetime.utcnow()
                }
            }
        )

        profiles_decayed += 1

    return {
        'profiles_decayed': profiles_decayed,
        'profiles_checked': profiles_checked
    }


def get_top_interests(subscriber_email: str, top_n: int = 10) -> Dict[str, List[tuple]]:
    """
    Get user's top interests sorted by score.

    Args:
        subscriber_email: Email address of the user
        top_n: Number of top interests to return (default: 10)

    Returns:
        dict: Top interests containing:
            - top_categories: List of (category, score) tuples
            - top_keywords: List of (keyword, score) tuples
    """
    profile = get_user_interests(subscriber_email)

    if not profile:
        return {
            'top_categories': [],
            'top_keywords': []
        }

    # Sort categories by score
    categories = profile.get('categories', {})
    top_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Sort keywords by score
    keywords = profile.get('keywords', {})
    top_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return {
        'top_categories': top_categories,
        'top_keywords': top_keywords
    }


def build_interests_from_history(subscriber_email: str, days_lookback: int = 30) -> Dict:
    """
    Build or rebuild user interest profile from click history.

    Useful for:
    - Initializing profiles for existing users
    - Rebuilding profiles after algorithm changes
    - Backfilling data

    Args:
        subscriber_email: Email address of the user
        days_lookback: Number of days of history to analyze (default: 30)

    Returns:
        dict: Newly built interest profile
    """
    cutoff_date = datetime.utcnow() - timedelta(days=days_lookback)

    # Get all clicks from this user in the lookback period
    clicks = link_clicks_collection.find({
        'subscriber_email': subscriber_email,
        'clicked': True,
        'clicked_at': {'$gte': cutoff_date}
    })

    # Initialize profile
    profile = {
        'email': subscriber_email,
        'categories': {},
        'keywords': {},
        'total_clicks': 0,
        'last_updated': datetime.utcnow(),
        'created_at': datetime.utcnow()
    }

    # Process each click
    for click in clicks:
        category = click.get('category', 'general')
        keywords = click.get('keywords', [])

        # Update category score
        profile['categories'][category] = profile['categories'].get(category, 0.0) + 0.1

        # Update keyword scores
        for keyword in keywords:
            if keyword:
                profile['keywords'][keyword] = profile['keywords'].get(keyword, 0.0) + 0.1

        profile['total_clicks'] += 1

    # Cap scores at 1.0
    for category in profile['categories']:
        profile['categories'][category] = min(profile['categories'][category], 1.0)

    for keyword in profile['keywords']:
        profile['keywords'][keyword] = min(profile['keywords'][keyword], 1.0)

    # Save profile
    if profile['total_clicks'] > 0:
        user_interests_collection.update_one(
            {'email': subscriber_email},
            {'$set': profile},
            upsert=True
        )

    return profile


def get_interest_statistics() -> Dict:
    """
    Get statistics about user interests across all users.

    Returns:
        dict: Statistics containing:
            - total_users: Total number of users with profiles
            - avg_clicks_per_user: Average number of clicks per user
            - most_popular_categories: Top categories across all users
            - most_popular_keywords: Top keywords across all users
    """
    total_users = user_interests_collection.count_documents({})

    if total_users == 0:
        return {
            'total_users': 0,
            'avg_clicks_per_user': 0,
            'most_popular_categories': [],
            'most_popular_keywords': []
        }

    # Calculate average clicks
    pipeline = [
        {
            '$group': {
                '_id': None,
                'total_clicks': {'$sum': '$total_clicks'}
            }
        }
    ]

    result = list(user_interests_collection.aggregate(pipeline))
    total_clicks = result[0]['total_clicks'] if result else 0
    avg_clicks = total_clicks / total_users if total_users > 0 else 0

    # Get most popular categories and keywords
    category_counts = {}
    keyword_counts = {}

    for profile in user_interests_collection.find({}):
        for category, score in profile.get('categories', {}).items():
            category_counts[category] = category_counts.get(category, 0) + score

        for keyword, score in profile.get('keywords', {}).items():
            keyword_counts[keyword] = keyword_counts.get(keyword, 0) + score

    # Sort and get top 10
    top_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10]

    return {
        'total_users': total_users,
        'avg_clicks_per_user': round(avg_clicks, 2),
        'most_popular_categories': top_categories,
        'most_popular_keywords': top_keywords
    }


def delete_user_interests(subscriber_email: str) -> bool:
    """
    Delete user interest profile (for GDPR compliance).

    Args:
        subscriber_email: Email address of the user

    Returns:
        bool: True if profile was deleted, False if not found
    """
    result = user_interests_collection.delete_one({'email': subscriber_email})
    return result.deleted_count > 0
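As a quick illustration of the scoring rules above (a standalone sketch, not part of the diff): each click adds 0.1 to the clicked category and keywords, capped at 1.0, and a decay pass multiplies surviving scores by the decay factor.

# Sketch of the +0.1/cap-1.0 update and the 0.95 decay implemented above.
def bump(scores: dict, key: str) -> None:
    scores[key] = min(scores.get(key, 0.0) + 0.1, 1.0)

categories: dict = {}
for _ in range(3):             # three clicks on sports articles
    bump(categories, 'sports')
print(categories['sports'])    # ~0.3 (float artifact: 0.30000000000000004)

# One decay pass at the default factor, as in decay_user_interests():
print(round(categories['sports'] * 0.95, 3))  # 0.285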
295  backend/services/personalization_service.py  Normal file
@@ -0,0 +1,295 @@
"""
Newsletter Personalization Service for Munich News Daily.
Ranks and selects articles based on user interest profiles.
"""

from typing import Any, Dict, List, Optional
from datetime import datetime, timedelta
from services.interest_profiling_service import get_user_interests


def calculate_article_score(
    article: Dict,
    user_interests: Optional[Dict],
    category_weight: float = 0.4,
    keyword_weight: float = 0.6
) -> float:
    """
    Calculate personalization score for an article based on user interests.

    Score is calculated as:
    - Category match: 0-1.0 based on user's interest in the category
    - Keyword match: Average of user's interest in article keywords
    - Final score: (category_score * 0.4) + (keyword_score * 0.6)

    Args:
        article: Article dictionary with 'category' and 'keywords' fields
        user_interests: User interest profile (None for non-personalized)
        category_weight: Weight for category matching (default: 0.4)
        keyword_weight: Weight for keyword matching (default: 0.6)

    Returns:
        float: Personalization score between 0.0 and 1.0
    """
    # If no user interests, return neutral score
    if not user_interests:
        return 0.5

    # Get article metadata
    article_category = article.get('category', 'general')
    article_keywords = article.get('keywords', [])

    # Calculate category score
    user_categories = user_interests.get('categories', {})
    category_score = user_categories.get(article_category, 0.0)

    # Calculate keyword score (average of all matching keywords)
    user_keywords = user_interests.get('keywords', {})
    keyword_scores = []

    for keyword in article_keywords:
        if keyword in user_keywords:
            keyword_scores.append(user_keywords[keyword])

    # Average keyword score (0.0 if no matches)
    keyword_score = sum(keyword_scores) / len(keyword_scores) if keyword_scores else 0.0

    # Weighted final score
    final_score = (category_score * category_weight) + (keyword_score * keyword_weight)

    return round(final_score, 3)


def rank_articles_for_user(
    articles: List[Dict],
    subscriber_email: str,
    personalization_ratio: float = 0.7
) -> List[Dict]:
    """
    Rank articles for a specific user based on their interests.

    Mixes personalized content with trending content to avoid filter bubbles.

    Args:
        articles: List of article dictionaries
        subscriber_email: Email address of the user
        personalization_ratio: Ratio of personalized vs trending (default: 0.7 = 70% personalized)

    Returns:
        list: Articles sorted by personalization score with score added
    """
    # Get user interests
    user_interests = get_user_interests(subscriber_email)

    # Calculate score for each article
    scored_articles = []
    for article in articles:
        score = calculate_article_score(article, user_interests)

        # Add score to article (don't modify original)
        article_with_score = article.copy()
        article_with_score['personalization_score'] = score
        scored_articles.append(article_with_score)

    # Sort by score (highest first)
    scored_articles.sort(key=lambda x: x['personalization_score'], reverse=True)

    return scored_articles


def select_personalized_articles(
    articles: List[Dict],
    subscriber_email: str,
    max_articles: int = 10,
    personalization_ratio: float = 0.7,
    min_score_threshold: float = 0.1
) -> List[Dict]:
    """
    Select and rank articles for a personalized newsletter.

    Strategy:
    - Top N * personalization_ratio articles: Highest scoring (personalized)
    - Remaining articles: Most recent (trending/diverse content)
    - Ensures mix of personalized + diverse content

    Args:
        articles: List of available articles
        subscriber_email: Email address of the user
        max_articles: Maximum number of articles to include (default: 10)
        personalization_ratio: Ratio of personalized content (default: 0.7)
        min_score_threshold: Minimum score to consider personalized (default: 0.1)

    Returns:
        list: Selected articles with personalization scores
    """
    if not articles:
        return []

    # Rank all articles
    ranked_articles = rank_articles_for_user(articles, subscriber_email, personalization_ratio)

    # Calculate split
    num_personalized = int(max_articles * personalization_ratio)
    num_trending = max_articles - num_personalized

    # Get personalized articles (high scoring)
    personalized = [
        a for a in ranked_articles
        if a['personalization_score'] >= min_score_threshold
    ][:num_personalized]

    # Get trending articles (most recent, not already selected)
    personalized_ids = {a.get('_id') for a in personalized}
    trending = [
        a for a in ranked_articles
        if a.get('_id') not in personalized_ids
    ][:num_trending]

    # Combine: personalized first, then trending
    selected = personalized + trending

    # Ensure we don't exceed max_articles
    return selected[:max_articles]


def get_personalization_explanation(
    article: Dict,
    user_interests: Optional[Dict]
) -> Dict[str, Any]:
    """
    Generate explanation for why an article was recommended.

    Useful for transparency and debugging.

    Args:
        article: Article dictionary
        user_interests: User interest profile

    Returns:
        dict: Explanation containing:
            - score: Overall personalization score
            - category_match: Category score
            - keyword_matches: List of matching keywords with scores
            - reason: Human-readable explanation
    """
    if not user_interests:
        return {
            'score': 0.5,
            'category_match': 0.0,
            'keyword_matches': [],
            'reason': 'No personalization data available'
        }

    article_category = article.get('category', 'general')
    article_keywords = article.get('keywords', [])

    user_categories = user_interests.get('categories', {})
    user_keywords = user_interests.get('keywords', {})

    # Category match
    category_score = user_categories.get(article_category, 0.0)

    # Keyword matches
    keyword_matches = []
    for keyword in article_keywords:
        if keyword in user_keywords:
            keyword_matches.append({
                'keyword': keyword,
                'score': user_keywords[keyword]
            })

    # Calculate overall score
    overall_score = calculate_article_score(article, user_interests)

    # Generate reason
    if overall_score >= 0.5:
        reason = f"High match with your interests in {article_category}"
        if keyword_matches:
            top_keywords = [m['keyword'] for m in keyword_matches[:2]]
            reason += f" and topics like {', '.join(top_keywords)}"
    elif overall_score >= 0.3:
        reason = "Moderate match with your interests"
    else:
        reason = "Trending article for diverse content"

    return {
        'score': overall_score,
        'category_match': category_score,
        'keyword_matches': keyword_matches,
        'reason': reason
    }


def get_personalization_stats(
    selected_articles: List[Dict],
    subscriber_email: str
) -> Dict[str, Any]:
    """
    Get statistics about personalization for a newsletter.

    Args:
        selected_articles: Articles selected for the newsletter
        subscriber_email: Email address of the user

    Returns:
        dict: Statistics containing:
            - total_articles: Number of articles
            - avg_score: Average personalization score
            - highly_personalized: Number of articles with score >= 0.5
            - moderately_personalized: Number with score 0.3-0.5
            - trending: Number with score < 0.3
    """
    if not selected_articles:
        return {
            'total_articles': 0,
            'avg_score': 0.0,
            'highly_personalized': 0,
            'moderately_personalized': 0,
            'trending': 0
        }

    scores = [a.get('personalization_score', 0.0) for a in selected_articles]
    avg_score = sum(scores) / len(scores)

    highly_personalized = sum(1 for s in scores if s >= 0.5)
    moderately_personalized = sum(1 for s in scores if 0.3 <= s < 0.5)
    trending = sum(1 for s in scores if s < 0.3)

    return {
        'total_articles': len(selected_articles),
        'avg_score': round(avg_score, 3),
        'highly_personalized': highly_personalized,
        'moderately_personalized': moderately_personalized,
        'trending': trending
    }


def batch_personalize_newsletters(
    articles: List[Dict],
    subscribers: List[str],
    max_articles_per_user: int = 10
) -> Dict[str, List[Dict]]:
    """
    Generate personalized article selections for multiple subscribers.

    Useful for batch newsletter generation.

    Args:
        articles: List of available articles
        subscribers: List of subscriber email addresses
        max_articles_per_user: Max articles per newsletter (default: 10)

    Returns:
        dict: Mapping of email -> personalized article list
    """
    personalized_newsletters = {}

    for subscriber_email in subscribers:
        personalized_articles = select_personalized_articles(
            articles,
            subscriber_email,
            max_articles=max_articles_per_user
        )
        personalized_newsletters[subscriber_email] = personalized_articles

    return personalized_newsletters
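To make the weighting concrete, a standalone worked example mirroring calculate_article_score() above (same numbers as the example in docs/PERSONALIZATION.md below):

# 40% category match + 60% average score of the matching article keywords.
user_interests = {'categories': {'sports': 0.8},
                  'keywords': {'Bayern Munich': 0.9}}
article = {'category': 'sports', 'keywords': ['Bayern Munich', 'Football']}

category_score = user_interests['categories'].get(article['category'], 0.0)
matches = [user_interests['keywords'][k]
           for k in article['keywords'] if k in user_interests['keywords']]
keyword_score = sum(matches) / len(matches) if matches else 0.0

print(round(category_score * 0.4 + keyword_score * 0.6, 3))  # 0.86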
@@ -80,6 +80,9 @@ def create_newsletter_tracking(
     link_tracking_map = {}

     if article_links:
+        # Import here to avoid circular dependency
+        from database import articles_collection
+
         for article in article_links:
             article_url = article.get('url')
             article_title = article.get('title', '')
@@ -87,13 +90,22 @@ def create_newsletter_tracking(
             if article_url:
                 link_tracking_id = generate_tracking_id()

-                # Create link click tracking record
+                # Look up article metadata from database for personalization
+                article_doc = articles_collection.find_one({'link': article_url})
+                article_id = str(article_doc['_id']) if article_doc else None
+                category = article_doc.get('category', 'general') if article_doc else 'general'
+                keywords = article_doc.get('keywords', []) if article_doc else []
+
+                # Create link click tracking record with metadata
                 link_click_doc = {
                     'tracking_id': link_tracking_id,
                     'newsletter_id': newsletter_id,
                     'subscriber_email': subscriber_email,
                     'article_url': article_url,
                     'article_title': article_title,
+                    'article_id': article_id,  # NEW: Article database ID
+                    'category': category,  # NEW: Article category
+                    'keywords': keywords,  # NEW: Article keywords for personalization
                     'clicked': False,
                     'clicked_at': None,
                     'user_agent': None,
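A note on the fallback in the hunk above (illustrative sketch): if the article URL is not found in the database, the tracking record is still created, just with neutral defaults instead of metadata.

# Simulate a miss on articles_collection.find_one(): every metadata
# field falls back to a neutral value, so tracking never blocks sending.
article_doc = None
article_id = str(article_doc['_id']) if article_doc else None
category = article_doc.get('category', 'general') if article_doc else 'general'
keywords = article_doc.get('keywords', []) if article_doc else []
print(article_id, category, keywords)  # None general []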
221  backend/test_personalization_system.py  Normal file
@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Comprehensive test suite for the personalization system.
Tests all 4 phases: keyword extraction, click tracking, interest profiling, and personalization.
"""

import sys
from pymongo import MongoClient
from datetime import datetime

# Import services
from services.tracking_service import create_newsletter_tracking
from services.interest_profiling_service import (
    update_user_interests,
    get_user_interests,
    get_top_interests,
    build_interests_from_history
)
from services.personalization_service import (
    calculate_article_score,
    rank_articles_for_user,
    select_personalized_articles,
    get_personalization_stats
)
from config import Config

# Connect to MongoDB
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]

articles_collection = db['articles']
link_clicks_collection = db['link_clicks']
user_interests_collection = db['user_interests']


def test_phase1_keywords():
    """Phase 1: Verify articles have keywords extracted"""
    print("\n" + "="*60)
    print("Phase 1: Keyword Extraction")
    print("="*60)

    articles_with_keywords = articles_collection.count_documents({
        'keywords': {'$exists': True, '$ne': []}
    })

    if articles_with_keywords == 0:
        print("❌ No articles with keywords found")
        print("   Run a crawl first to extract keywords")
        return False

    sample = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
    print(f"✓ Found {articles_with_keywords} articles with keywords")
    print(f"  Sample: {sample.get('title', 'N/A')[:50]}...")
    print(f"  Keywords: {sample.get('keywords', [])[:3]}")
    return True


def test_phase2_tracking():
    """Phase 2: Verify tracking includes keywords and metadata"""
    print("\n" + "="*60)
    print("Phase 2: Click Tracking Enhancement")
    print("="*60)

    test_email = 'test-phase2@example.com'

    # Clean up
    link_clicks_collection.delete_many({'subscriber_email': test_email})

    # Get article with keywords
    article = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})

    if not article:
        print("❌ No articles found")
        return False

    # Create tracking
    tracking_data = create_newsletter_tracking(
        newsletter_id='test-phase2',
        subscriber_email=test_email,
        article_links=[{
            'url': article['link'],
            'title': article.get('title', '')
        }]
    )

    # Verify tracking record
    tracking_id = list(tracking_data['link_tracking_map'].values())[0]
    tracking_record = link_clicks_collection.find_one({'tracking_id': tracking_id})

    has_metadata = (
        tracking_record.get('article_id') is not None and
        tracking_record.get('category') is not None and
        len(tracking_record.get('keywords', [])) > 0
    )

    # Clean up
    link_clicks_collection.delete_many({'subscriber_email': test_email})
    db['newsletter_sends'].delete_many({'subscriber_email': test_email})

    if has_metadata:
        print("✓ Tracking records include metadata")
        print(f"  Article ID: {tracking_record.get('article_id')}")
        print(f"  Category: {tracking_record.get('category')}")
        print(f"  Keywords: {len(tracking_record.get('keywords', []))} keywords")
        return True
    else:
        print("❌ Tracking records missing metadata")
        return False


def test_phase3_profiling():
    """Phase 3: Verify interest profiles are built from clicks"""
    print("\n" + "="*60)
    print("Phase 3: User Interest Profiling")
    print("="*60)

    test_email = 'test-phase3@example.com'

    # Clean up
    user_interests_collection.delete_many({'email': test_email})

    # Create profile
    update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
    update_user_interests(test_email, ['Transportation', 'Munich'], 'local')

    # Verify profile
    profile = get_user_interests(test_email)

    # Clean up
    user_interests_collection.delete_many({'email': test_email})

    if profile and profile['total_clicks'] == 2:
        print("✓ Interest profile created")
        print(f"  Total clicks: {profile['total_clicks']}")
        print(f"  Categories: {len(profile.get('categories', {}))}")
        print(f"  Keywords: {len(profile.get('keywords', {}))}")
        return True
    else:
        print("❌ Interest profile not created correctly")
        return False


def test_phase4_personalization():
    """Phase 4: Verify articles are ranked by user interests"""
    print("\n" + "="*60)
    print("Phase 4: Personalized Newsletter Generation")
    print("="*60)

    test_email = 'test-phase4@example.com'

    # Clean up
    user_interests_collection.delete_many({'email': test_email})

    # Get articles
    articles = list(articles_collection.find(
        {'keywords': {'$exists': True, '$ne': []}},
        limit=5
    ))

    if len(articles) < 3:
        print("❌ Not enough articles found")
        return False

    # Create profile
    update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')

    # Rank articles
    ranked = rank_articles_for_user(articles, test_email)

    # Select personalized
    selected = select_personalized_articles(articles, test_email, max_articles=3)

    # Clean up
    user_interests_collection.delete_many({'email': test_email})

    has_scores = all('personalization_score' in a for a in selected)

    if has_scores and len(selected) > 0:
        print("✓ Articles ranked and selected")
        print(f"  Total ranked: {len(ranked)}")
        print(f"  Selected: {len(selected)}")
        print(f"  Top score: {selected[0].get('personalization_score', 0):.3f}")
        return True
    else:
        print("❌ Personalization failed")
        return False


def main():
    """Run all personalization tests"""
    print("\n" + "="*60)
    print("PERSONALIZATION SYSTEM TEST SUITE")
    print("="*60)

    results = {
        'Phase 1: Keyword Extraction': test_phase1_keywords(),
        'Phase 2: Click Tracking': test_phase2_tracking(),
        'Phase 3: Interest Profiling': test_phase3_profiling(),
        'Phase 4: Personalization': test_phase4_personalization()
    }

    print("\n" + "="*60)
    print("TEST RESULTS")
    print("="*60)

    for phase, passed in results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"{status} - {phase}")

    all_passed = all(results.values())

    if all_passed:
        print("\n🎉 All personalization tests PASSED!")
        return 0
    else:
        print("\n❌ Some tests FAILED")
        return 1


if __name__ == '__main__':
    sys.exit(main())
35  debug_categories.py  Normal file
@@ -0,0 +1,35 @@
from pymongo import MongoClient
from datetime import datetime, timedelta
import os

# Connect to MongoDB
mongo_uri = os.getenv('MONGODB_URI', 'mongodb://mongodb:27017/')
client = MongoClient(mongo_uri)
db = client['munich_news']
articles = db['articles']
subscribers = db['subscribers']

print("--- Distinct Categories in Articles Collection ---")
categories = articles.distinct('category')
print(categories)

print("\n--- Recent Article Counts by Category (Last 24h) ---")
yesterday = datetime.utcnow() - timedelta(hours=24)
recent_articles = articles.find({'created_at': {'$gte': yesterday}})
category_counts = {}
for art in recent_articles:
    cat = art.get('category', 'unknown')
    category_counts[cat] = category_counts.get(cat, 0) + 1

for cat, count in category_counts.items():
    print(f"{cat}: {count}")

print("\n--- Subscriber Preferences ---")
for sub in subscribers.find():
    print(f"Email: {sub.get('email')}, Categories: {sub.get('categories')}")

print("\n--- RSS Feeds ---")
rss_feeds = db['rss_feeds']
for feed in rss_feeds.find():
    print(f"Name: {feed.get('name')}, URL: {feed.get('url')}, Category: {feed.get('category')}, Active: {feed.get('active')}")
225  docker-compose.local.yml  Normal file
@@ -0,0 +1,225 @@
services:
  # Ollama AI Service (Exposed for local testing)
  ollama:
    image: ollama/ollama:latest
    container_name: munich-news-local-ollama
    restart: unless-stopped
    ports:
      - "11434:11434"  # Exposed for local testing
    volumes:
      - ollama_data_local:/root/.ollama
    networks:
      - munich-news-network
    dns:
      - 8.8.8.8
      - 1.1.1.1
    # GPU support (uncomment if you have NVIDIA GPU)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "ollama list || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  # Ollama Model Loader - Pulls phi3:latest (smaller model for local dev)
  ollama-setup:
    image: curlimages/curl:latest
    container_name: munich-news-local-ollama-setup
    depends_on:
      ollama:
        condition: service_healthy
    networks:
      - munich-news-network
    env_file:
      - backend/.env.local
    volumes:
      - ./scripts/setup-ollama-model.sh:/setup-ollama-model.sh:ro
    dns:
      - 8.8.8.8
      - 1.1.1.1
    command: sh /setup-ollama-model.sh
    restart: on-failure

  # Redis - Message queue for async tasks (Internal only - not exposed to host)
  redis:
    image: redis:7-alpine
    container_name: munich-news-local-redis
    restart: unless-stopped
    # No ports exposed - only accessible within Docker network
    networks:
      - munich-news-network
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 30s
      timeout: 10s
      retries: 3

  # MongoDB Database (Exposed for local debugging)
  mongodb:
    image: mongo:latest
    container_name: munich-news-local-mongodb
    restart: unless-stopped
    ports:
      - "27017:27017"  # Exposed for local debugging
    environment:
      # For production, set MONGO_PASSWORD environment variable
      MONGO_INITDB_ROOT_USERNAME: ${MONGO_USERNAME:-admin}
      MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme}
      MONGO_INITDB_DATABASE: munich_news
    volumes:
      - mongodb_data_local:/data/db
      - mongodb_config_local:/data/configdb
    networks:
      - munich-news-network
    command: mongod --bind_ip_all ${MONGO_AUTH:---auth}
    healthcheck:
      test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
      interval: 30s
      timeout: 10s
      retries: 3

  # News Crawler - Runs at 6 AM Berlin time
  crawler:
    build:
      context: .
      dockerfile: news_crawler/Dockerfile
    container_name: munich-news-local-crawler
    restart: unless-stopped
    depends_on:
      - mongodb
      - ollama
      - redis
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
      - REDIS_URL=redis://redis:6379
      - TZ=Europe/Berlin
    volumes:
      - ./backend/.env.local:/app/.env:ro
    networks:
      - munich-news-network
    healthcheck:
      test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
      interval: 1m
      timeout: 10s
      retries: 3

  # Backend API - Tracking and analytics
  backend:
    build:
      context: ./backend
      dockerfile: Dockerfile
    container_name: munich-news-local-backend
    restart: unless-stopped
    depends_on:
      - mongodb
      - redis
    ports:
      - "5001:5001"
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
      - REDIS_URL=redis://redis:6379
      - FLASK_PORT=5001
      - TZ=Europe/Berlin
    volumes:
      - ./backend/.env.local:/app/.env:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    networks:
      - munich-news-network
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # Transport Crawler - API service for MVG disruptions (Internal only - not exposed to host)
  transport-crawler:
    build:
      context: ./transport_crawler
      dockerfile: Dockerfile
    container_name: munich-news-local-transport-crawler
    restart: unless-stopped
    depends_on:
      - mongodb
      - redis
    # No ports exposed - only accessible within Docker network
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
      - REDIS_URL=redis://redis:6379
      - TZ=Europe/Berlin
    volumes:
      - ./backend/.env.local:/app/.env:ro
    networks:
      - munich-news-network
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # Newsletter Sender - Runs at 7 AM Berlin time
  sender:
    build:
      context: .
      dockerfile: news_sender/Dockerfile
    container_name: munich-news-local-sender
    restart: unless-stopped
    depends_on:
      - mongodb
      - backend
      - crawler
      - transport-crawler
    environment:
      - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/
      - TZ=Europe/Berlin
    volumes:
      - ./backend/.env.local:/app/.env:ro
    networks:
      - munich-news-network
    healthcheck:
      test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
      interval: 1m
      timeout: 10s
      retries: 3

  # Frontend Web Interface
  frontend:
    build: ./frontend
    container_name: munich-news-local-frontend
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      - API_URL=http://backend:5001
      - PORT=3000
    depends_on:
      - backend
    networks:
      - munich-news-network
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  mongodb_data_local:
    driver: local
  mongodb_config_local:
    driver: local
  ollama_data_local:
    driver: local

networks:
  munich-news-network:
    internal: false
@@ -26,6 +26,9 @@ services:
       - ollama_data:/root/.ollama
     networks:
       - munich-news-network
+    dns:
+      - 8.8.8.8
+      - 1.1.1.1
     # GPU support (uncomment if you have NVIDIA GPU)
     # deploy:
     #   resources:
@@ -54,6 +57,9 @@ services:
       - backend/.env
     volumes:
       - ./scripts/setup-ollama-model.sh:/setup-ollama-model.sh:ro
+    dns:
+      - 8.8.8.8
+      - 1.1.1.1
     command: sh /setup-ollama-model.sh
     restart: on-failure

@@ -261,6 +267,6 @@ volumes:

 networks:
   munich-news-network:
-    internal: true
+    internal: false
   proxy:
     external: true
167  docs/LOCAL_DEVELOPMENT.md  Normal file
@@ -0,0 +1,167 @@
# Local Development Setup

This guide helps you run Munich News Daily locally for development and testing.

## Quick Start

```bash
# 1. Copy local environment files
cp .env.local .env
cp backend/.env.local backend/.env

# 2. Start services with local configuration
docker-compose -f docker-compose.local.yml up -d

# 3. Check logs
docker-compose -f docker-compose.local.yml logs -f

# 4. Access services
# - Frontend: http://localhost:3000
# - Backend API: http://localhost:5001
# - MongoDB: localhost:27017
# - Ollama: http://localhost:11434
```

## Differences from Production

| Feature | Production | Local Development |
|---------|-----------|-------------------|
| Ollama Model | `gemma3:12b` (large) | `phi3:latest` (small, fast) |
| MongoDB Port | Internal only | Exposed on 27017 |
| Ollama Port | Internal only | Exposed on 11434 |
| Container Names | `munich-news-*` | `munich-news-local-*` |
| Volumes | `*_data` | `*_data_local` |
| Email | Production SMTP | Test/disabled |

## Useful Commands

### Start/Stop Services
```bash
# Start all services
docker-compose -f docker-compose.local.yml up -d

# Stop all services
docker-compose -f docker-compose.local.yml down

# Restart a specific service
docker-compose -f docker-compose.local.yml restart backend

# View logs
docker-compose -f docker-compose.local.yml logs -f crawler
```

### Testing

```bash
# Trigger a news crawl (2 articles for quick testing)
curl -X POST http://localhost:5001/api/admin/trigger-crawl \
  -H "Content-Type: application/json" \
  -d '{"max_articles": 2}'

# Trigger transport crawl
curl -X POST http://localhost:5001/api/transport/crawl

# Check articles in MongoDB
docker exec munich-news-local-mongodb mongosh munich_news \
  --eval "db.articles.find({}, {title: 1, keywords: 1, category: 1}).limit(3)"

# Check transport disruptions
curl http://localhost:5001/api/transport/disruptions
```

### Database Access

```bash
# Connect to MongoDB
docker exec -it munich-news-local-mongodb mongosh munich_news

# Or from host (if you have mongosh installed)
mongosh "mongodb://admin:local123@localhost:27017/munich_news"

# Useful queries
db.articles.countDocuments()
db.articles.find({keywords: {$exists: true}}).limit(5)
db.subscribers.find()
db.transport_alerts.find()
```

### Ollama Testing

```bash
# List models
curl http://localhost:11434/api/tags

# Test generation
curl http://localhost:11434/api/generate -d '{
  "model": "phi3:latest",
  "prompt": "Summarize: Munich opens new U-Bahn line",
  "stream": false
}'
```

## Cleanup

```bash
# Stop and remove containers
docker-compose -f docker-compose.local.yml down

# Remove volumes (WARNING: deletes all data)
docker-compose -f docker-compose.local.yml down -v

# Remove local volumes specifically
docker volume rm munich-news_mongodb_data_local
docker volume rm munich-news_mongodb_config_local
docker volume rm munich-news_ollama_data_local
```

## Switching Between Local and Production

```bash
# Switch to local
cp .env.local .env
cp backend/.env.local backend/.env
docker-compose -f docker-compose.local.yml up -d

# Switch to production
cp .env.production .env  # (if you have one)
cp backend/.env.production backend/.env
docker-compose up -d
```

## Troubleshooting

### Ollama model not downloading
```bash
# Pull model manually
docker exec munich-news-local-ollama ollama pull phi3:latest
```

### MongoDB connection refused
```bash
# Check if MongoDB is running
docker-compose -f docker-compose.local.yml ps mongodb

# Check logs
docker-compose -f docker-compose.local.yml logs mongodb
```

### Port already in use
```bash
# Check what's using the port
lsof -i :5001  # or :3000, :27017, etc.

# Stop the conflicting service or change the port in docker-compose.local.yml
```

## Tips

1. **Use phi3 for speed** - It's much faster than gemma3 for local testing
2. **Limit articles** - Use `max_articles: 2` for quick crawl tests
3. **Watch logs** - Keep logs open to see what's happening
4. **Separate volumes** - Local and production use different volumes, so they don't interfere

## Next Steps

- See `docs/PERSONALIZATION.md` for personalization feature development
- See `docs/OLLAMA_SETUP.md` for AI configuration
- See main `README.md` for general documentation
217  docs/PERSONALIZATION.md  Normal file
@@ -0,0 +1,217 @@
# Newsletter Personalization Implementation

## Overview
Personalized newsletters based on user click behavior, using keywords and categories to build interest profiles.

## Implementation Phases

### ✅ Phase 1: Keyword Extraction (COMPLETED)
**Status:** Implemented

**Files Modified:**
- `news_crawler/ollama_client.py` - Added `extract_keywords()` method
- `news_crawler/crawler_service.py` - Integrated keyword extraction into the crawl process

**What it does:**
- Extracts 5 keywords from each article using Ollama AI
- Keywords stored in `articles` collection: `keywords: ["Bayern Munich", "Football", ...]`
- Runs automatically during news crawling

**Test it:**
```bash
# Trigger a crawl
curl -X POST http://localhost:5001/api/admin/trigger-crawl -d '{"max_articles": 2}'

# Check articles have keywords
docker exec munich-news-mongodb mongosh munich_news --eval "db.articles.findOne({}, {title: 1, keywords: 1})"
```

---

### ✅ Phase 2: Click Tracking Enhancement (COMPLETED)
**Status:** Implemented
**Goal:** Track clicks with keyword metadata

**Files Modified:**
- `backend/services/tracking_service.py` - Enhanced `create_newsletter_tracking()` to look up article metadata

**What it does:**
- When creating tracking links, looks up the article in the database
- Stores article ID, category, and keywords in the tracking record
- Enables building user interest profiles from click behavior

**Database Schema:**
```javascript
// link_clicks collection
{
  tracking_id: "uuid",
  newsletter_id: "2024-11-18",
  subscriber_email: "user@example.com",
  article_url: "https://...",
  article_title: "Article Title",
  article_id: "673abc123...",  // NEW: Article database ID
  category: "sports",  // NEW: Article category
  keywords: ["Bayern Munich", "Bundesliga"],  // NEW: Keywords for personalization
  clicked: false,
  clicked_at: null,
  user_agent: null,
  created_at: ISODate()
}
```

**Test it:**
```bash
# Send a test newsletter
curl -X POST http://localhost:5001/api/admin/send-newsletter

# Check tracking records have keywords
docker exec munich-news-mongodb mongosh munich_news --eval "db.link_clicks.findOne({}, {article_title: 1, keywords: 1, category: 1})"
```

---

### ✅ Phase 3: User Interest Profiling (COMPLETED)
**Status:** Implemented
**Goal:** Build user interest profiles from click history

**Files Created:**
- `backend/services/interest_profiling_service.py` - Core profiling logic
- `backend/routes/interests_routes.py` - API endpoints for interest management

**Files Modified:**
- `backend/routes/tracking_routes.py` - Auto-update interests on click
- `backend/app.py` - Register interests routes

**What it does:**
- Automatically builds interest profiles when users click articles
- Tracks interest scores for categories and keywords (0.0 to 1.0)
- Increments scores by 0.1 per click, capped at 1.0
- Provides a decay mechanism for old interests (see the worked example below)
- Supports rebuilding profiles from click history

**Database Schema:**
```javascript
// user_interests collection
{
  email: "user@example.com",
  categories: {
    sports: 0.8,
    local: 0.5,
    science: 0.2
  },
  keywords: {
    "Bayern Munich": 0.9,
    "Oktoberfest": 0.7,
    "AI": 0.3
  },
  total_clicks: 15,
  last_updated: ISODate(),
  created_at: ISODate()
}
```

**API Endpoints:**
```bash
# Get user interests
GET /api/interests/<email>

# Get top interests
GET /api/interests/<email>/top?top_n=10

# Rebuild from history
POST /api/interests/<email>/rebuild
Body: {"days_lookback": 30}

# Decay old interests
POST /api/interests/decay
Body: {"decay_factor": 0.95, "days_threshold": 7}

# Get statistics
GET /api/interests/statistics

# Delete profile (GDPR)
DELETE /api/interests/<email>
```

**Test it:**
```bash
# Run test script
docker exec munich-news-local-backend python test_interest_profiling.py

# View a user's interests
curl http://localhost:5001/api/interests/user@example.com

# Get statistics
curl http://localhost:5001/api/interests/statistics
```
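For intuition, a worked decay example (an illustrative sketch of the default parameters above, mirroring `decay_user_interests()`):

```python
# Each decay pass multiplies a score by 0.95 and rounds to 3 places;
# scores that fall below 0.05 are dropped from the profile entirely.
score = 0.8
for week in range(1, 4):
    score = round(score * 0.95, 3)
    print(week, score)  # 1 0.76  /  2 0.722  /  3 0.686
```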
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ✅ Phase 4: Personalized Newsletter (COMPLETED)
|
||||||
|
**Status:** Implemented
|
||||||
|
**Goal:** Rank and select articles based on user interests
|
||||||
|
|
||||||
|
**Files Created:**
|
||||||
|
- `backend/services/personalization_service.py` - Core personalization logic
|
||||||
|
- `backend/routes/personalization_routes.py` - API endpoints for testing
|
||||||
|
|
||||||
|
**Files Modified:**
|
||||||
|
- `backend/app.py` - Register personalization routes
|
||||||
|
|
||||||
|
**What it does:**
|
||||||
|
- Scores articles based on user's category and keyword interests
|
||||||
|
- Ranks articles by personalization score (0.0 to 1.0)
|
||||||
|
- Selects mix of personalized (70%) + trending (30%) content
|
||||||
|
- Provides explanations for recommendations
|
||||||
|
|
||||||
|
**Algorithm:**
|
||||||
|
```python
|
||||||
|
score = (category_match * 0.4) + (keyword_match * 0.6)
|
||||||
|
|
||||||
|
# Example:
|
||||||
|
# User interests: sports=0.8, "Bayern Munich"=0.9
|
||||||
|
# Article: sports category, keywords=["Bayern Munich", "Football"]
|
||||||
|
# Score = (0.8 * 0.4) + (0.9 * 0.6) = 0.32 + 0.54 = 0.86
|
||||||
|
```
|
||||||
|
|
||||||
|
**API Endpoints:**

```bash
# Preview personalized newsletter
GET /api/personalize/preview/<email>?max_articles=10&hours_lookback=24

# Explain recommendation
POST /api/personalize/explain
Body: {"email": "user@example.com", "article_id": "..."}
```

**Test it:**

```bash
# Run test script
docker exec munich-news-local-backend python test_personalization.py

# Preview personalized newsletter
curl "http://localhost:5001/api/personalize/preview/demo@example.com?max_articles=5"
```

---

## ✅ All Phases Complete!

1. ~~**Phase 1:** Keyword extraction from articles~~ ✅ DONE
2. ~~**Phase 2:** Click tracking with keywords~~ ✅ DONE
3. ~~**Phase 3:** User interest profiling~~ ✅ DONE
4. ~~**Phase 4:** Personalized newsletter generation~~ ✅ DONE

## Next Steps for Production

1. **Integrate with newsletter sender** - Modify `news_sender/sender_service.py` to use personalization
2. **A/B testing** - Compare personalized vs non-personalized engagement
3. **Tune parameters** - Adjust personalization_ratio, weights, decay rates
4. **Monitor metrics** - Track click-through rates and open rates by personalization score
5. **User controls** - Add UI for users to view/edit their interests

## Configuration

No configuration needed yet. Keyword extraction uses existing Ollama settings from `backend/.env`:

- `OLLAMA_ENABLED=true`
- `OLLAMA_MODEL=gemma3:12b`
- `OLLAMA_BASE_URL=http://ollama:11434`
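For reference, a minimal sketch of reading these settings in Python; the fallback defaults shown are assumptions, not the crawler's actual ones:

```python
import os

OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'gemma3:12b')
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://ollama:11434')
```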
195 docs/PERSONALIZATION_COMPLETE.md Normal file

@@ -0,0 +1,195 @@
# 🎉 Newsletter Personalization System - Complete!

All 4 phases of the personalization system have been successfully implemented and tested.

## ✅ What Was Built

### Phase 1: Keyword Extraction
- AI-powered keyword extraction from articles using Ollama
- 5 keywords per article automatically extracted during crawling
- Keywords stored in database for personalization

### Phase 2: Click Tracking Enhancement
- Enhanced tracking to capture article keywords and category
- Tracking records now include metadata for building interest profiles
- Privacy-compliant with opt-out and GDPR support

### Phase 3: User Interest Profiling
- Automatic profile building from click behavior
- Interest scores (0.0-1.0) for categories and keywords
- Decay mechanism for old interests
- API endpoints for viewing and managing profiles

### Phase 4: Personalized Newsletter Generation
- Article scoring based on user interests
- Smart ranking algorithm (40% category + 60% keywords)
- Mix of personalized (70%) + trending (30%) content
- Explanation system for recommendations

## 📊 How It Works

```
1. User clicks article in newsletter
        ↓
2. System records: keywords + category
        ↓
3. Interest profile updates automatically
        ↓
4. Next newsletter: articles ranked by interests
        ↓
5. User receives personalized content
```

## 🧪 Testing

All phases have been tested and verified:

```bash
# Run comprehensive test suite (tests all 4 phases)
docker exec munich-news-local-backend python test_personalization_system.py

# Or test keyword extraction separately
docker exec munich-news-local-crawler python -c "from crawler_service import crawl_all_feeds; crawl_all_feeds(max_articles_per_feed=2)"
```

## 🔌 API Endpoints

### Interest Management
```bash
GET    /api/interests/<email>           # View profile
GET    /api/interests/<email>/top       # Top interests
POST   /api/interests/<email>/rebuild   # Rebuild from history
GET    /api/interests/statistics        # Platform stats
DELETE /api/interests/<email>           # Delete (GDPR)
```

### Personalization
```bash
GET  /api/personalize/preview/<email>   # Preview personalized newsletter
POST /api/personalize/explain           # Explain recommendation
```

## 📈 Example Results

### User Profile
```json
{
  "email": "user@example.com",
  "categories": {
    "sports": 0.30,
    "local": 0.10
  },
  "keywords": {
    "Bayern Munich": 0.30,
    "Football": 0.20,
    "Transportation": 0.10
  },
  "total_clicks": 5
}
```

### Personalized Newsletter
```json
{
  "articles": [
    {
      "title": "Bayern Munich wins championship",
      "personalization_score": 0.86,
      "category": "sports",
      "keywords": ["Bayern Munich", "Football"]
    },
    {
      "title": "New S-Bahn line opens",
      "personalization_score": 0.42,
      "category": "local",
      "keywords": ["Transportation", "Munich"]
    }
  ],
  "statistics": {
    "highly_personalized": 1,
    "moderately_personalized": 1,
    "trending": 0
  }
}
```

## 🎯 Scoring Algorithm

```python
# Article score calculation
category_score = user_interests.categories[article.category]
keyword_score = average(user_interests.keywords[kw] for kw in article.keywords)

final_score = (category_score * 0.4) + (keyword_score * 0.6)
```

**Example:**
- User: sports=0.8, "Bayern Munich"=0.9
- Article: sports category, keywords=["Bayern Munich", "Football"]
- Score = (0.8 × 0.4) + (0.9 × 0.6) = 0.32 + 0.54 = **0.86**

## 🚀 Production Integration

To integrate with the newsletter sender:

1. **Modify `news_sender/sender_service.py`:**
```python
from services.personalization_service import select_personalized_articles

# For each subscriber
personalized_articles = select_personalized_articles(
    all_articles,
    subscriber_email,
    max_articles=10
)
```

2. **Enable personalization flag in config** (see the sketch after this list):
```env
PERSONALIZATION_ENABLED=true
PERSONALIZATION_RATIO=0.7  # 70% personalized, 30% trending
```

3. **Monitor metrics:**
- Click-through rate by personalization score
- Open rates for personalized vs non-personalized
- User engagement over time
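A minimal sketch of how the sender could branch on these flags; `select_personalized_articles` is the real service function, while `pick_articles_for` and the recency fallback are illustrative:

```python
import os

from services.personalization_service import select_personalized_articles

PERSONALIZATION_ENABLED = os.getenv('PERSONALIZATION_ENABLED', 'false').lower() == 'true'

def pick_articles_for(subscriber_email, all_articles, max_articles=10):
    """Personalized selection when enabled, newest articles otherwise."""
    if PERSONALIZATION_ENABLED:
        return select_personalized_articles(all_articles, subscriber_email,
                                            max_articles=max_articles)
    # Fallback: no personalization, take the most recent articles as-is
    return all_articles[:max_articles]
```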
## 🔐 Privacy & Compliance
|
||||||
|
|
||||||
|
- ✅ Users can opt out of tracking
|
||||||
|
- ✅ Interest profiles can be deleted (GDPR)
|
||||||
|
- ✅ Automatic anonymization after 90 days
|
||||||
|
- ✅ No PII beyond email address
|
||||||
|
- ✅ Transparent recommendation explanations
|
||||||
|
|
||||||
|
## 📁 Files Created/Modified
|
||||||
|
|
||||||
|
### New Files
|
||||||
|
- `backend/services/interest_profiling_service.py`
|
||||||
|
- `backend/services/personalization_service.py`
|
||||||
|
- `backend/routes/interests_routes.py`
|
||||||
|
- `backend/routes/personalization_routes.py`
|
||||||
|
- `backend/test_tracking_phase2.py`
|
||||||
|
- `backend/test_interest_profiling.py`
|
||||||
|
- `backend/test_personalization.py`
|
||||||
|
- `docs/PERSONALIZATION.md`
|
||||||
|
|
||||||
|
### Modified Files
|
||||||
|
- `news_crawler/ollama_client.py` - Added keyword extraction
|
||||||
|
- `news_crawler/crawler_service.py` - Integrated keyword extraction
|
||||||
|
- `backend/services/tracking_service.py` - Enhanced with metadata
|
||||||
|
- `backend/routes/tracking_routes.py` - Auto-update interests
|
||||||
|
- `backend/app.py` - Registered new routes
|
||||||
|
|
||||||
|
## 🎓 Key Learnings
|
||||||
|
|
||||||
|
1. **Incremental scoring works well** - 0.1 per click prevents over-weighting
|
||||||
|
2. **Mix is important** - 70/30 personalized/trending avoids filter bubbles
|
||||||
|
3. **Keywords > Categories** - 60/40 weight reflects keyword importance
|
||||||
|
4. **Decay is essential** - Prevents stale interests from dominating
|
||||||
|
5. **Transparency matters** - Explanation API helps users understand recommendations
|
||||||
|
|
||||||
|
## 🎉 Status: COMPLETE
|
||||||
|
|
||||||
|
All 4 phases implemented, tested, and documented. The personalization system is ready for production integration!
|
||||||
@@ -27,7 +27,16 @@ function refreshAll() {
 // Load system statistics
 async function loadSystemStats() {
     try {
-        const response = await fetch(`${API_BASE}/api/stats`);
+        const controller = new AbortController();
+        const timeoutId = setTimeout(() => controller.abort(), 10000); // 10 second timeout
+
+        const response = await fetch(`${API_BASE}/api/stats`, { signal: controller.signal });
+        clearTimeout(timeoutId);
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+        }
+
         const data = await response.json();

         const html = `
@@ -59,14 +68,25 @@ async function loadSystemStats() {

         document.getElementById('systemStats').innerHTML = html;
     } catch (error) {
-        document.getElementById('systemStats').innerHTML = `<div class="error">Error loading stats: ${error.message}</div>`;
+        console.error('Error loading system stats:', error);
+        const errorMsg = error.name === 'AbortError' ? 'Request timeout' : error.message;
+        document.getElementById('systemStats').innerHTML = `<div class="error">Error: ${errorMsg}</div>`;
     }
 }

 // Load Ollama status
 async function loadOllamaStatus() {
     try {
-        const response = await fetch(`${API_BASE}/api/ollama/ping`);
+        const controller = new AbortController();
+        const timeoutId = setTimeout(() => controller.abort(), 10000);
+
+        const response = await fetch(`${API_BASE}/api/ollama/ping`, { signal: controller.signal });
+        clearTimeout(timeoutId);
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+        }
+
         const data = await response.json();

         const isActive = data.status === 'success';
@@ -102,25 +122,37 @@ async function loadOllamaStatus() {

         document.getElementById('ollamaStatus').innerHTML = html;
     } catch (error) {
-        document.getElementById('ollamaStatus').innerHTML = `<div class="error">Error: ${error.message}</div>`;
+        console.error('Error loading Ollama status:', error);
+        const errorMsg = error.name === 'AbortError' ? 'Request timeout' : error.message;
+        document.getElementById('ollamaStatus').innerHTML = `<div class="error">Error: ${errorMsg}</div>`;
     }
 }

 // Load GPU status
 async function loadGPUStatus() {
     try {
-        const response = await fetch(`${API_BASE}/api/ollama/gpu-status`);
+        const controller = new AbortController();
+        const timeoutId = setTimeout(() => controller.abort(), 10000);
+
+        const response = await fetch(`${API_BASE}/api/ollama/gpu-status`, { signal: controller.signal });
+        clearTimeout(timeoutId);
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+        }
+
         const data = await response.json();

-        const gpuActive = data.gpu_in_use;
+        const gpuActive = data.gpu_in_use || false;
+        const gpuAvailable = data.gpu_available || false;
         const statusClass = gpuActive ? 'status-active' : 'status-warning';

         const html = `
             <div class="stat-row">
                 <span class="stat-label">GPU Available</span>
                 <span class="stat-value">
-                    <span class="status-indicator ${data.gpu_available ? 'status-active' : 'status-inactive'}"></span>
-                    ${data.gpu_available ? 'Yes' : 'No'}
+                    <span class="status-indicator ${gpuAvailable ? 'status-active' : 'status-inactive'}"></span>
+                    ${gpuAvailable ? 'Yes' : 'No'}
                 </span>
             </div>
             <div class="stat-row">
@@ -134,16 +166,6 @@ async function loadGPUStatus() {
                 <span class="stat-label">Models Loaded</span>
                 <span class="stat-value">${data.models_loaded || 0}</span>
             </div>
-            ${data.gpu_details ? `
-            <div class="stat-row">
-                <span class="stat-label">GPU Model</span>
-                <span class="stat-value">${data.gpu_details.model}</span>
-            </div>
-            <div class="stat-row">
-                <span class="stat-label">GPU Layers</span>
-                <span class="stat-value">${data.gpu_details.gpu_layers}</span>
-            </div>
-            ` : ''}
             ${!gpuActive ? `
             <div style="margin-top: 10px; padding: 10px; background: #fef3c7; border-radius: 5px; font-size: 12px;">
                 💡 Enable GPU for 5-10x faster processing
@@ -153,7 +175,9 @@ async function loadGPUStatus() {

         document.getElementById('gpuStatus').innerHTML = html;
     } catch (error) {
-        document.getElementById('gpuStatus').innerHTML = `<div class="error">Error: ${error.message}</div>`;
+        console.error('Error loading GPU status:', error);
+        const errorMsg = error.name === 'AbortError' ? 'Request timeout' : error.message;
+        document.getElementById('gpuStatus').innerHTML = `<div class="error">Error: ${errorMsg}</div>`;
     }
 }

@@ -204,7 +228,16 @@ async function runPerformanceTest() {
 // Load available models
 async function loadModels() {
     try {
-        const response = await fetch(`${API_BASE}/api/ollama/models`);
+        const controller = new AbortController();
+        const timeoutId = setTimeout(() => controller.abort(), 10000);
+
+        const response = await fetch(`${API_BASE}/api/ollama/models`, { signal: controller.signal });
+        clearTimeout(timeoutId);
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+        }
+
         const data = await response.json();

         if (data.models && data.models.length > 0) {
@@ -227,14 +260,25 @@ async function loadModels() {
             document.getElementById('modelsList').innerHTML = '<div>No models found</div>';
         }
     } catch (error) {
-        document.getElementById('modelsList').innerHTML = `<div class="error">Error: ${error.message}</div>`;
+        console.error('Error loading models:', error);
+        const errorMsg = error.name === 'AbortError' ? 'Request timeout' : error.message;
+        document.getElementById('modelsList').innerHTML = `<div class="error">Error: ${errorMsg}</div>`;
     }
 }

 // Load configuration
 async function loadConfig() {
     try {
-        const response = await fetch(`${API_BASE}/api/ollama/config`);
+        const controller = new AbortController();
+        const timeoutId = setTimeout(() => controller.abort(), 10000);
+
+        const response = await fetch(`${API_BASE}/api/ollama/config`, { signal: controller.signal });
+        clearTimeout(timeoutId);
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+        }
+
         const data = await response.json();

         const html = `
@@ -262,14 +306,25 @@ async function loadConfig() {

         document.getElementById('configInfo').innerHTML = html;
     } catch (error) {
-        document.getElementById('configInfo').innerHTML = `<div class="error">Error: ${error.message}</div>`;
+        console.error('Error loading config:', error);
+        const errorMsg = error.name === 'AbortError' ? 'Request timeout' : error.message;
+        document.getElementById('configInfo').innerHTML = `<div class="error">Error: ${errorMsg}</div>`;
     }
 }

 // Load clustering statistics
 async function loadClusteringStats() {
     try {
-        const response = await fetch(`${API_BASE}/api/stats`);
+        const controller = new AbortController();
+        const timeoutId = setTimeout(() => controller.abort(), 10000);
+
+        const response = await fetch(`${API_BASE}/api/stats`, { signal: controller.signal });
+        clearTimeout(timeoutId);
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+        }
+
         const data = await response.json();

         const clusteringRate = data.clustered_articles > 0
@@ -310,7 +365,9 @@ async function loadClusteringStats() {

         document.getElementById('clusteringStats').innerHTML = html;
     } catch (error) {
-        document.getElementById('clusteringStats').innerHTML = `<div class="error">Error: ${error.message}</div>`;
+        console.error('Error loading clustering stats:', error);
+        const errorMsg = error.name === 'AbortError' ? 'Request timeout' : error.message;
+        document.getElementById('clusteringStats').innerHTML = `<div class="error">Error: ${errorMsg}</div>`;
     }
 }

@@ -512,7 +569,16 @@ async function loadRecentArticles() {
     const container = document.getElementById('recentArticles');

     try {
-        const response = await fetch('/api/admin/recent-articles');
+        const controller = new AbortController();
+        const timeoutId = setTimeout(() => controller.abort(), 10000);
+
+        const response = await fetch('/api/admin/recent-articles', { signal: controller.signal });
+        clearTimeout(timeoutId);
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}`);
+        }
+
         const data = await response.json();

         if (data.articles && data.articles.length > 0) {
@@ -548,7 +614,9 @@ async function loadRecentArticles() {
             container.innerHTML = '<p style="color: #666;">No summarized articles found.</p>';
         }
     } catch (error) {
-        container.innerHTML = '<p style="color: red;">Failed to load recent articles</p>';
+        console.error('Error loading recent articles:', error);
+        const errorMsg = error.name === 'AbortError' ? 'Request timeout' : error.message;
+        container.innerHTML = `<p style="color: red;">Error: ${errorMsg}</p>`;
     }
 }
@@ -70,6 +70,17 @@ app.post('/api/unsubscribe', async (req, res) => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
app.get('/api/subscribers', async (req, res) => {
|
||||||
|
try {
|
||||||
|
const response = await axios.get(`${API_URL}/api/subscribers`);
|
||||||
|
res.json(response.data);
|
||||||
|
} catch (error) {
|
||||||
|
res.status(error.response?.status || 500).json(
|
||||||
|
error.response?.data || { error: 'Failed to get subscribers' }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
app.get('/api/subscribers/:email', async (req, res) => {
|
app.get('/api/subscribers/:email', async (req, res) => {
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(`${API_URL}/api/subscribers/${req.params.email}`);
|
const response = await axios.get(`${API_URL}/api/subscribers/${req.params.email}`);
|
||||||
|
|||||||
@@ -388,6 +388,21 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
                     print(f"   ⚠ Summarization failed: {summary_result['error']}")
                     failed_summaries += 1

+                # Extract keywords for personalization
+                keywords_result = None
+                if Config.OLLAMA_ENABLED and summary_result and summary_result['success']:
+                    print(f"   🔑 Extracting keywords...")
+                    keywords_result = ollama_client.extract_keywords(
+                        original_title,
+                        summary_result['summary'],
+                        max_keywords=5
+                    )
+
+                    if keywords_result['success']:
+                        print(f"   ✓ Keywords: {', '.join(keywords_result['keywords'])} ({keywords_result['duration']:.1f}s)")
+                    else:
+                        print(f"   ⚠ Keyword extraction failed: {keywords_result['error']}")
+
                 # Prepare document
                 article_doc = {
                     'title': original_title,
@@ -396,6 +411,7 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
                     'link': article_url,
                     'content': article_data.get('content', ''),  # Full article content
                     'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
+                    'keywords': keywords_result['keywords'] if keywords_result and keywords_result['success'] else [],
                     'word_count': article_data.get('word_count', 0),
                     'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
                     'source': feed_name,
@@ -446,7 +462,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10

     except Exception as e:
         print(f"   ✗ Error processing feed {feed_name}: {e}")
-        return 0
+        return {
+            'crawled': 0,
+            'summarized': 0,
+            'failed_summaries': 0
+        }


 def crawl_all_feeds(max_articles_per_feed=10):
|
|||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
|
|
||||||
|
def _chat_request(self, messages, options=None):
|
||||||
|
"""
|
||||||
|
Helper to make chat requests to Ollama
|
||||||
|
|
||||||
|
Args:
|
||||||
|
messages: List of message dicts [{'role': 'user', 'content': '...'}]
|
||||||
|
options: Optional dict of model parameters
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Generated text content
|
||||||
|
"""
|
||||||
|
if options is None:
|
||||||
|
options = {}
|
||||||
|
|
||||||
|
url = f"{self.base_url}/api/chat"
|
||||||
|
headers = {'Content-Type': 'application/json'}
|
||||||
|
if self.api_key:
|
||||||
|
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
'model': self.model,
|
||||||
|
'messages': messages,
|
||||||
|
'stream': False,
|
||||||
|
'options': options
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
url,
|
||||||
|
json=payload,
|
||||||
|
headers=headers,
|
||||||
|
timeout=self.timeout
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
return result.get('message', {}).get('content', '').strip()
|
||||||
|
|
||||||
def summarize_article(self, content, max_words=150):
|
def summarize_article(self, content, max_words=150):
|
||||||
"""
|
"""
|
||||||
Summarize article content using Ollama
|
Summarize article content using Ollama
|
||||||
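For context, the `/api/chat` endpoint this helper targets can be exercised standalone; a minimal sketch, assuming an Ollama server reachable at `http://ollama:11434` and the `gemma3:12b` model from the config above:

```python
import requests

resp = requests.post(
    'http://ollama:11434/api/chat',
    json={
        'model': 'gemma3:12b',
        'messages': [{'role': 'user', 'content': 'Reply with one word: ready?'}],
        'stream': False,  # return a single JSON object instead of a stream
    },
    timeout=60,
)
resp.raise_for_status()
# Non-streaming responses carry the generated text under message.content
print(resp.json().get('message', {}).get('content', '').strip())
```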
@@ -70,37 +107,26 @@
         start_time = time.time()

         try:
-            # Construct prompt
-            prompt = self._build_summarization_prompt(content, max_words)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.7,
-                    'num_predict': 250  # Limit response length
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            summary = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a skilled journalist writing for The New York Times. Summarize the provided article in English in {max_words} words or less.\n\nWrite in the clear, engaging, and authoritative style of New York Times Magazine:\n- Lead with the most newsworthy information\n- Use active voice and vivid language\n- Make it accessible and easy to read\n- Focus on what matters to readers\n- Even if the source is in German or another language, write your summary entirely in English\n\nIMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose."
+                },
+                {
+                    'role': 'user',
+                    'content': f"Summarize this article:\n\n{content}"
+                }
+            ]
+
+            # Make request using chat endpoint
+            summary = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.5,
+                    'num_predict': 350
+                }
+            )

             if not summary:
                 return {
@@ -198,37 +224,26 @@
         start_time = time.time()

         try:
-            # Construct prompt
-            prompt = self._build_translation_prompt(title, target_language)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.3,  # Lower temperature for consistent translations
-                    'num_predict': 100  # Limit response length for title-length outputs
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            translated_title = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a professional translator. Translate the following German news headline to {target_language}.\n\nIMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline."
+                },
+                {
+                    'role': 'user',
+                    'content': title
+                }
+            ]
+
+            # Make request using chat endpoint
+            translated_title = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.1,  # Low temperature for consistent translations
+                    'num_predict': 100  # Limit response length
+                }
+            )

             if not translated_title:
                 return {
@@ -241,6 +256,13 @@
             # Clean the translation output
             translated_title = self._clean_translation(translated_title)

+            # Validate translation (if it's same as original, it might have failed)
+            if translated_title.lower() == title.lower() and target_language == 'English':
+                # Retry with more forceful prompt
+                messages[0]['content'] += " If the text is already English, just output it as is."
+                translated_title = self._chat_request(messages, options={'temperature': 0.1})
+                translated_title = self._clean_translation(translated_title)
+
             return {
                 'success': True,
                 'translated_title': translated_title,
@@ -277,19 +299,6 @@
             'duration': time.time() - start_time
         }

-    def _build_translation_prompt(self, title, target_language):
-        """Build prompt for title translation"""
-        prompt = f"""Translate the following German news headline to {target_language}.
-
-IMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline.
-
-German headline:
-{title}
-
-{target_language} translation:"""
-
-        return prompt
-
     def _clean_translation(self, translation):
         """Clean translation output by removing quotes and extra text"""
         # Extract first line only
@@ -335,31 +344,6 @@ German headline:

         return text

-    def _build_summarization_prompt(self, content, max_words):
-        """Build prompt for article summarization"""
-        # Truncate content if too long (keep first 5000 words)
-        words = content.split()
-        if len(words) > 5000:
-            content = ' '.join(words[:5000]) + '...'
-
-        prompt = f"""You are a skilled journalist writing for The New York Times. Summarize the following article in English in {max_words} words or less.
-
-Write in the clear, engaging, and authoritative style of New York Times Magazine:
-- Lead with the most newsworthy information
-- Use active voice and vivid language
-- Make it accessible and easy to read
-- Focus on what matters to readers
-- Even if the source is in German or another language, write your summary entirely in English
-
-IMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose.
-
-Article:
-{content}
-
-New York Times-style summary (max {max_words} words):"""
-
-        return prompt
-
     def is_available(self):
         """
         Check if Ollama server is reachable
@@ -462,37 +446,24 @@ New York Times-style summary (max {max_words} words):"""
         start_time = time.time()

         try:
-            response = requests.post(
-                f"{self.base_url}/api/generate",
-                json={
-                    "model": self.model,
-                    "prompt": prompt,
-                    "stream": False,
-                    "options": {
-                        "num_predict": max_tokens,
-                        "temperature": 0.1  # Low temperature for consistent answers
-                    }
-                },
-                timeout=self.timeout
-            )
+            messages = [{'role': 'user', 'content': prompt}]
+
+            text = self._chat_request(
+                messages,
+                options={
+                    "num_predict": max_tokens,
+                    "temperature": 0.1
+                }
+            )

             duration = time.time() - start_time

-            if response.status_code == 200:
-                result = response.json()
-                return {
-                    'text': result.get('response', '').strip(),
-                    'success': True,
-                    'error': None,
-                    'duration': duration
-                }
-            else:
-                return {
-                    'text': '',
-                    'success': False,
-                    'error': f"HTTP {response.status_code}: {response.text}",
-                    'duration': duration
-                }
+            return {
+                'text': text,
+                'success': True,
+                'error': None,
+                'duration': duration
+            }

         except requests.exceptions.Timeout:
             return {
@@ -509,6 +480,89 @@ New York Times-style summary (max {max_words} words):"""
                 'duration': time.time() - start_time
             }

+    def extract_keywords(self, title, summary, max_keywords=5):
+        """
+        Extract keywords/topics from article for personalization
+
+        Args:
+            title: Article title
+            summary: Article summary
+            max_keywords: Maximum number of keywords to extract (default 5)
+
+        Returns:
+            {
+                'keywords': list,       # List of extracted keywords
+                'success': bool,        # Whether extraction succeeded
+                'error': str or None,   # Error message if failed
+                'duration': float       # Time taken in seconds
+            }
+        """
+        if not self.enabled:
+            return {
+                'keywords': [],
+                'success': False,
+                'error': 'Ollama is disabled',
+                'duration': 0
+            }
+
+        start_time = time.time()
+
+        try:
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"Extract {max_keywords} key topics or keywords from the article.\n\nReturn ONLY the keywords separated by commas, nothing else. Focus on:\n- Main topics (e.g., 'Bayern Munich', 'Oktoberfest', 'City Council')\n- Locations (e.g., 'Marienplatz', 'Airport')\n- Events or themes (e.g., 'Transportation', 'Housing', 'Technology')"
+                },
+                {
+                    'role': 'user',
+                    'content': f"Title: {title}\nSummary: {summary}"
+                }
+            ]
+
+            # Make request
+            keywords_text = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.2,
+                    'num_predict': 100
+                }
+            )
+
+            if not keywords_text:
+                return {
+                    'keywords': [],
+                    'success': False,
+                    'error': 'Ollama returned empty response',
+                    'duration': time.time() - start_time
+                }
+
+            # Parse keywords from response
+            keywords = [k.strip() for k in keywords_text.split(',')]
+            keywords = [k for k in keywords if k and len(k) > 2][:max_keywords]
+
+            return {
+                'keywords': keywords,
+                'success': True,
+                'error': None,
+                'duration': time.time() - start_time
+            }
+
+        except requests.exceptions.Timeout:
+            return {
+                'keywords': [],
+                'success': False,
+                'error': f"Request timed out after {self.timeout}s",
+                'duration': time.time() - start_time
+            }
+        except Exception as e:
+            return {
+                'keywords': [],
+                'success': False,
+                'error': str(e),
+                'duration': time.time() - start_time
+            }
+
 if __name__ == '__main__':
     # Quick test
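A quick usage sketch for the new `extract_keywords` method; the constructor arguments and sample strings are assumptions for illustration:

```python
from ollama_client import OllamaClient

client = OllamaClient(
    base_url='http://ollama:11434',
    model='gemma3:12b',
    enabled=True,
)

result = client.extract_keywords(
    title='Bayern Munich wins championship',
    summary='Bayern Munich secured the league title on Saturday after a 3-1 win.',
    max_keywords=5,
)
if result['success']:
    print(result['keywords'])  # e.g. ['Bayern Munich', 'Football', 'Bundesliga']
else:
    print('Extraction failed:', result['error'])
```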
@@ -5,119 +5,113 @@
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <meta http-equiv="X-UA-Compatible" content="IE=edge">
     <title>Munich News Daily</title>
-    <!--[if mso]>
     <style type="text/css">
-        body, table, td {font-family: Arial, Helvetica, sans-serif !important;}
-    </style>
-    <![endif]-->
-</head>
-<body style="margin: 0; padding: 0; background-color: #f4f4f4; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;">
-    <!-- Wrapper Table -->
-    <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f4f4f4;">
-        <tr>
-            <td align="center" style="padding: 20px 0;">
-                <!-- Main Container -->
-                <table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="background-color: #ffffff; max-width: 600px;">
-
-                    <!-- Header -->
+        /* Client-specific resets */
+        body, table, td, a { -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
+        table, td { mso-table-lspace: 0pt; mso-table-rspace: 0pt; }
+        img { -ms-interpolation-mode: bicubic; border: 0; height: auto; line-height: 100%; outline: none; text-decoration: none; }
+
+        /* General styles */
+        body { height: 100% !important; margin: 0 !important; padding: 0 !important; width: 100% !important; }
+
+        /* Hover effects */
+        .hover-opacity:hover { opacity: 0.8 !important; }
+        .read-more-btn:hover { background-color: #556cd6 !important; }
+    </style>
+</head>
+<body style="margin: 0; padding: 0; background-color: #f0f2f5; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif; color: #1a1a1a;">
+
+    <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f0f2f5;">
         <tr>
-            <td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
-                <h1 style="margin: 0 0 8px 0; font-size: 28px; font-weight: 700; color: #ffffff; letter-spacing: -0.5px;">
-                    Munich News Daily
-                </h1>
-                <p style="margin: 0; font-size: 14px; color: #999999; letter-spacing: 0.5px;">
+            <td align="center" style="padding: 30px 10px;">
+                <table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="max-width: 600px; width: 100%;">
+                    <tr>
+                        <td align="center" style="padding-bottom: 25px;">
+                            <p style="margin: 0 0 5px 0; font-size: 12px; font-weight: 700; letter-spacing: 1.5px; text-transform: uppercase; color: #667eea;">
                     {{ date }}
                 </p>
+                            <h1 style="margin: 0; font-size: 32px; font-weight: 800; letter-spacing: -1px; color: #1a1a1a;">
+                                Munich News Daily
+                            </h1>
             </td>
         </tr>

-        <!-- Greeting -->
         <tr>
-            <td style="padding: 30px 40px 20px 40px;">
-                <p style="margin: 0; font-size: 16px; line-height: 1.5; color: #333333;">
+            <td style="background-color: #ffffff; border-radius: 16px; padding: 30px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
+                <p style="margin: 0 0 10px 0; font-size: 18px; font-weight: 600; color: #1a1a1a;">
                     Good morning ☀️
                 </p>
-                <p style="margin: 15px 0 0 0; font-size: 15px; line-height: 1.6; color: #666666; text-align: justify;">
-                    Here's what's happening in Munich today. We've summarized {{ article_count }} stories using AI so you can stay informed in under 5 minutes.
+                <p style="margin: 0 0 25px 0; font-size: 16px; line-height: 1.6; color: #555555;">
+                    Here is your AI-curated briefing. We've summarized <strong>{{ article_count }} stories</strong> to get you up to speed in under 5 minutes.
                 </p>

                 {% if weather and weather.success %}
-                <!-- Weather Widget -->
-                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="margin-top: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 12px; overflow: hidden;">
-                    <tr>
-                        <td style="padding: 20px;">
-                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
-                                <tr>
-                                    <td style="width: 60%; vertical-align: middle;">
-                                        <p style="margin: 0 0 5px 0; font-size: 13px; color: rgba(255,255,255,0.9); font-weight: 600;">
-                                            TODAY'S WEATHER
-                                        </p>
-                                        <p style="margin: 0; font-size: 32px; color: #ffffff; font-weight: 700; line-height: 1;">
-                                            {{ weather.icon }} {{ weather.temperature }}°C
-                                        </p>
-                                        <p style="margin: 5px 0 0 0; font-size: 14px; color: rgba(255,255,255,0.9);">
-                                            {{ weather.condition }}
-                                        </p>
-                                    </td>
-                                    <td style="width: 40%; text-align: right; vertical-align: middle;">
-                                        <p style="margin: 0; font-size: 14px; color: rgba(255,255,255,0.9);">
-                                            High: <strong style="color: #ffffff;">{{ weather.high }}°C</strong>
-                                        </p>
-                                        <p style="margin: 5px 0 0 0; font-size: 14px; color: rgba(255,255,255,0.9);">
-                                            Low: <strong style="color: #ffffff;">{{ weather.low }}°C</strong>
-                                        </p>
-                                    </td>
-                                </tr>
-                            </table>
-                        </td>
-                    </tr>
-                </table>
-                {% endif %}
-            </td>
-        </tr>
-
-        <!-- Divider -->
-        <tr>
-            <td style="padding: 0 40px;">
-                <div style="height: 1px; background-color: #e0e0e0;"></div>
-            </td>
-        </tr>
-
-        <!-- TL;DR Section -->
-        <tr>
-            <td style="padding: 30px 40px;">
-                <h2 style="margin: 0 0 8px 0; font-size: 22px; font-weight: 700; color: #1a1a1a;">
-                    📋 TL;DR - Quick Summary
-                </h2>
-                <p style="margin: 0 0 20px 0; font-size: 13px; color: #999999;">
-                    Scan through today's top stories in seconds
-                </p>
-
-                <!-- TL;DR Box -->
-                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #667eea;">
+                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 12px; overflow: hidden;">
                     <tr>
                         <td style="padding: 20px 25px;">
+                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
+                                <tr>
+                                    <td style="vertical-align: middle;">
+                                        <div style="font-size: 36px; font-weight: 700; color: #ffffff; line-height: 1;">
+                                            {{ weather.icon }} {{ weather.temperature }}°
+                                        </div>
+                                        <div style="font-size: 14px; color: rgba(255,255,255,0.9); margin-top: 5px; font-weight: 500;">
+                                            {{ weather.condition }}
+                                        </div>
+                                    </td>
+                                    <td align="right" style="vertical-align: middle;">
+                                        <table role="presentation" cellpadding="0" cellspacing="0" border="0">
+                                            <tr>
+                                                <td style="padding-right: 15px; border-right: 1px solid rgba(255,255,255,0.3);">
+                                                    <div style="font-size: 12px; color: rgba(255,255,255,0.8); text-transform: uppercase;">High</div>
+                                                    <div style="font-size: 16px; font-weight: 700; color: #ffffff;">{{ weather.high }}°</div>
+                                                </td>
+                                                <td style="padding-left: 15px;">
+                                                    <div style="font-size: 12px; color: rgba(255,255,255,0.8); text-transform: uppercase;">Low</div>
+                                                    <div style="font-size: 16px; font-weight: 700; color: #ffffff;">{{ weather.low }}°</div>
+                                                </td>
+                                            </tr>
+                                        </table>
+                                    </td>
+                                </tr>
+                            </table>
+                        </td>
+                    </tr>
+                </table>
+                {% endif %}
+            </td>
+        </tr>
+
+        <tr><td style="height: 20px;"></td></tr>
+
+        <tr>
+            <td style="background-color: #ffffff; border-radius: 16px; padding: 30px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
+                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
+                    <tr>
+                        <td style="padding-bottom: 20px; border-bottom: 2px solid #f0f2f5;">
+                            <h2 style="margin: 0; font-size: 20px; font-weight: 700; color: #1a1a1a;">
+                                ⚡️ Quick Summary
+                            </h2>
+                        </td>
+                    </tr>
+                    <tr>
+                        <td style="padding-top: 20px;">
                             {% for section in category_sections %}
                             {% for article in section.articles %}
-                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="margin-bottom: 15px;">
+                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="margin-bottom: 12px;">
                                 <tr>
-                                    <td style="width: 30px; vertical-align: top; padding-top: 2px;">
-                                        <span style="display: inline-block; background-color: #667eea; color: #ffffff; width: 22px; height: 22px; line-height: 22px; text-align: center; border-radius: 50%; font-size: 11px; font-weight: 700;">
-                                            {{ loop.index }}
-                                        </span>
+                                    <td style="width: 24px; vertical-align: top; padding-top: 4px;">
+                                        <div style="height: 6px; width: 6px; background-color: #667eea; border-radius: 50%;"></div>
                                     </td>
                                     <td style="vertical-align: top;">
-                                        <p style="margin: 0; font-size: 14px; line-height: 1.6; color: #333333;">
-                                            <strong style="color: #1a1a1a;">{{ article.title_en if article.title_en else article.title }}</strong>
-                                            <br>
-                                            <span style="color: #666666;">{{ article.summary.split('.')[0] }}.</span>
+                                        <p style="margin: 0; font-size: 15px; line-height: 1.5; color: #4a4a4a;">
+                                            <strong style="color: #1a1a1a;">{{ article.title_en if article.title_en else article.title }}</strong> — {{ article.summary.split('.')[0] }}.
                                         </p>
                                     </td>
                                 </tr>
                             </table>
-                            {% if not loop.last %}
-                            <div style="height: 1px; background-color: #e0e0e0; margin: 10px 0;"></div>
-                            {% endif %}
                             {% endfor %}
                             {% endfor %}
                         </td>
@@ -127,88 +121,48 @@
         </tr>

         {% if transport_disruptions and transport_disruptions|length > 0 %}
-        <!-- Divider -->
+        <tr><td style="height: 20px;"></td></tr>
         <tr>
-            <td style="padding: 0 40px;">
-                <div style="height: 2px; background-color: #e0e0e0;"></div>
-            </td>
-        </tr>
-
-        <!-- Transport Disruptions Section -->
-        <tr>
-            <td style="padding: 30px 40px;">
-                <h2 style="margin: 0 0 20px 0; font-size: 22px; font-weight: 700; color: #1a1a1a;">
-                    🚆 S-Bahn Disruptions Today
+            <td style="background-color: #ffffff; border-radius: 16px; padding: 30px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
+                <h2 style="margin: 0 0 15px 0; font-size: 20px; font-weight: 700; color: #1a1a1a;">
+                    🚆 S-Bahn Updates
                 </h2>
-                <p style="margin: 0 0 20px 0; font-size: 14px; color: #666666;">
-                    Current service disruptions affecting Munich S-Bahn:
-                </p>

                 {% for disruption in transport_disruptions %}
-                <!-- Disruption Card -->
-                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="margin-bottom: 15px; background-color: #fff8f0; border-left: 4px solid #ff9800; border-radius: 4px;">
+                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #fff8f0; border-radius: 8px; border: 1px solid #ffeeba; margin-bottom: 10px;">
                     <tr>
-                        <td style="padding: 15px 20px;">
-                            <!-- Severity and Lines -->
-                            <p style="margin: 0 0 8px 0; font-size: 13px; color: #666666;">
-                                {{ disruption.severity_icon }} <strong style="color: #000000;">{{ disruption.lines_str }}</strong>
+                        <td style="padding: 15px;">
+                            <p style="margin: 0 0 5px 0; font-size: 12px; font-weight: 700; color: #c05621; text-transform: uppercase;">
+                                {{ disruption.severity_icon }} {{ disruption.lines_str }}
                             </p>
-
-                            <!-- Title -->
-                            <p style="margin: 0 0 8px 0; font-size: 15px; font-weight: 700; color: #1a1a1a; line-height: 1.4;">
+                            <p style="margin: 0 0 5px 0; font-size: 15px; font-weight: 700; color: #2d3748;">
                                 {{ disruption.title }}
                             </p>

-                            <!-- Description -->
                             {% if disruption.description %}
-                            <p style="margin: 0 0 8px 0; font-size: 14px; color: #333333; line-height: 1.5;">
+                            <p style="margin: 0; font-size: 14px; color: #4a5568; line-height: 1.4;">
                                 {{ disruption.description }}
                             </p>
                             {% endif %}

-                            <!-- Time -->
-                            {% if disruption.start_time_str or disruption.end_time_str %}
-                            <p style="margin: 0; font-size: 13px; color: #666666;">
-                                ⏰
-                                {% if disruption.start_time_str %}
-                                From {{ disruption.start_time_str }}
-                                {% endif %}
-                                {% if disruption.end_time_str %}
-                                until {{ disruption.end_time_str }}
-                                {% endif %}
-                            </p>
-                            {% endif %}
                         </td>
                     </tr>
                 </table>
                 {% endfor %}
-                <p style="margin: 15px 0 0 0; font-size: 12px; color: #999999; font-style: italic;">
-                    💡 Plan your commute accordingly. Check <a href="https://www.mvg.de" style="color: #667eea; text-decoration: none;">MVG.de</a> for real-time updates.
+                <p style="margin: 10px 0 0 0; font-size: 13px; color: #718096;">
+                    Check <a href="https://www.mvg.de" style="color: #667eea; text-decoration: underline;">MVG.de</a> for live times.
                 </p>
             </td>
         </tr>
         {% endif %}

-        <!-- Divider -->
-        <tr>
-            <td style="padding: 0 40px;">
-                <div style="height: 2px; background-color: #e0e0e0;"></div>
-            </td>
-        </tr>
-
-        <!-- Category Sections -->
         {% for section in category_sections %}

         <tr>
-            <td style="padding: 30px 40px 15px 40px;">
-                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
+            <td align="center" style="padding: 40px 0 20px 0;">
+                <table role="presentation" border="0" cellspacing="0" cellpadding="0">
                     <tr>
-                        <td>
-                            <h2 style="margin: 0; font-size: 22px; font-weight: 700; color: #1a1a1a;">
+                        <td align="center" style="background-color: #e2e8f0; color: #4a5568; border-radius: 20px; padding: 8px 20px;">
+                            <p style="margin: 0; font-size: 13px; font-weight: 700; letter-spacing: 0.5px; text-transform: uppercase;">
                                 {{ section.icon }} {{ section.name }}
-                            </h2>
-                            <p style="margin: 8px 0 0 0; font-size: 13px; color: #666666;">
-                                Top stories in {{ section.name.lower() }}
                             </p>
                         </td>
                     </tr>
@@ -216,155 +170,99 @@
                 </td>
             </tr>

-        <!-- Category Articles -->
         {% for article in section.articles %}
         <tr>
-            <td style="padding: 25px 40px;">
-                <!-- Article Number Badge -->
-                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
-                    <tr>
-                        <td>
-                            <span style="display: inline-block; background-color: #000000; color: #ffffff; width: 24px; height: 24px; line-height: 24px; text-align: center; border-radius: 50%; font-size: 12px; font-weight: 600;">
-                                {{ loop.index }}
-                            </span>
-                        </td>
-                    </tr>
-                </table>
-
-                <!-- Article Title -->
-                <h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
-                    {{ article.title_en if article.title_en else article.title }}
-                </h2>
-
-                <!-- Original German Title (subtitle) -->
-                {% if article.title_en and article.title_en != article.title %}
-                <p style="margin: 0 0 12px 0; font-size: 13px; color: #999999; font-style: italic;">
-                    Original: {{ article.title }}
-                </p>
-                {% endif %}
-
-                <!-- Article Meta -->
-                <p style="margin: 0 0 12px 0; font-size: 13px; color: #999999;">
+            <td style="background-color: #ffffff; border-radius: 16px; padding: 30px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
+                <p style="margin: 0 0 12px 0; font-size: 12px; color: #718096; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">
                     {% if article.is_clustered %}
-                    <span style="color: #000000; font-weight: 600;">Multiple sources</span>
+                    Multiple Sources
                     {% else %}
-                    <span style="color: #000000; font-weight: 600;">{{ article.source }}</span>
-                    {% if article.author %}
-                    <span> • {{ article.author }}</span>
-                    {% endif %}
+                    {{ article.source }}
                     {% endif %}
                 </p>

-                <!-- Article Summary -->
-                <p style="margin: 0 0 15px 0; font-size: 15px; line-height: 1.6; color: #333333; text-align: justify;">
+                <h3 style="margin: 0 0 10px 0; font-size: 22px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
+                    <a href="{{ article.link }}" style="color: #1a1a1a; text-decoration: none;">
+                        {{ article.title_en if article.title_en else article.title }}
+                    </a>
+                </h3>
+
+                {% if article.title_en and article.title_en != article.title %}
+                <p style="margin: 0 0 15px 0; font-size: 13px; color: #a0aec0; font-style: italic;">
+                    "{{ article.title }}"
+                </p>
+                {% endif %}
+
+                <p style="margin: 0 0 20px 0; font-size: 16px; line-height: 1.6; color: #4a5568;">
                     {{ article.summary }}
                 </p>

-                <!-- Read More Links -->
+                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
+                    <tr>
+                        <td>
                 {% if article.is_clustered and article.sources %}
-                <!-- Multiple sources -->
-                <p style="margin: 0 0 8px 0; font-size: 13px; color: #666666;">
-                    📰 Covered by {{ article.article_count }} sources:
-                </p>
-                <div style="margin: 0;">
+                            <p style="margin: 0; font-size: 13px; color: #718096;">
+                                <strong>Read full coverage:</strong><br>
                     {% for source in article.sources %}
-                    <a href="{{ source.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 13px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px; margin-right: 15px; margin-bottom: 8px;">
-                        {{ source.name }} →
+                                <a href="{{ source.link }}" style="color: #667eea; text-decoration: none; margin-right: 10px; display: inline-block; margin-top: 5px;">
+                                    {{ source.name }} ↗
                     </a>
                     {% endfor %}
-                </div>
-                {% else %}
-                <!-- Single source -->
-                <a href="{{ article.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 14px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px;">
-                    Read more →
-                </a>
-                {% endif %}
-            </td>
-        </tr>
-
-        <!-- Article Divider -->
-        {% if not loop.last %}
-        <tr>
-            <td style="padding: 0 40px;">
-                <div style="height: 1px; background-color: #f0f0f0;"></div>
-            </td>
-        </tr>
-        {% endif %}
-        {% endfor %}
-
-        <!-- Category Section Divider -->
-        {% if not loop.last %}
-        <tr>
-            <td style="padding: 25px 40px;">
-                <div style="height: 2px; background-color: #e0e0e0;"></div>
-            </td>
-        </tr>
-        {% endif %}
-        {% endfor %}
-
-        <!-- Bottom Divider -->
-        <tr>
-            <td style="padding: 25px 40px 0 40px;">
-                <div style="height: 1px; background-color: #e0e0e0;"></div>
-            </td>
-        </tr>
-
-            <td style="padding: 30px 40px;">
-                <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f8f8; border-radius: 8px;">
-                    <tr>
-                        <td style="padding: 25px; text-align: center;">
|
|
||||||
<p style="margin: 0 0 8px 0; font-size: 13px; color: #666666; text-transform: uppercase; letter-spacing: 1px; font-weight: 600;">
|
|
||||||
Today's Digest
|
|
||||||
</p>
|
</p>
|
||||||
<p style="margin: 0; font-size: 36px; font-weight: 700; color: #000000;">
|
{% else %}
|
||||||
|
<table role="presentation" cellpadding="0" cellspacing="0" border="0">
|
||||||
|
<tr>
|
||||||
|
<td style="background-color: #f0f2f5; border-radius: 8px; padding: 10px 20px;">
|
||||||
|
<a href="{{ article.link }}" style="color: #1a1a1a; font-size: 14px; font-weight: 600; text-decoration: none; display: block;">
|
||||||
|
Read full story →
|
||||||
|
</a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr><td style="height: 20px;"></td></tr>
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
<tr>
|
||||||
|
<td align="center" style="padding: 40px 0;">
|
||||||
|
<p style="margin: 0 0 5px 0; font-size: 48px; font-weight: 800; color: #cbd5e0; line-height: 1;">
|
||||||
{{ article_count }}
|
{{ article_count }}
|
||||||
</p>
|
</p>
|
||||||
<p style="margin: 8px 0 0 0; font-size: 14px; color: #666666;">
|
<p style="margin: 0; font-size: 14px; font-weight: 600; color: #a0aec0; text-transform: uppercase; letter-spacing: 1px;">
|
||||||
stories • AI-summarized • 5 min read
|
Stories Summarized
|
||||||
</p>
|
</p>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
</table>
|
|
||||||
</td>
|
|
||||||
<!-- Footer -->
|
|
||||||
<tr>
|
<tr>
|
||||||
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
|
<td style="text-align: center; padding: 0 20px 40px 20px;">
|
||||||
<p style="margin: 0 0 15px 0; font-size: 14px; color: #ffffff; font-weight: 600;">
|
<p style="margin: 0 0 20px 0; font-size: 13px; line-height: 1.5; color: #718096;">
|
||||||
Munich News Daily
|
<strong>Munich News Daily</strong><br>
|
||||||
</p>
|
AI-powered news for busy locals.
|
||||||
<p style="margin: 0 0 20px 0; font-size: 13px; color: #999999; line-height: 1.5;">
|
|
||||||
AI-powered news summaries for busy people.<br>
|
|
||||||
Delivered daily to your inbox.
|
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<!-- Footer Links -->
|
<p style="margin: 0 0 20px 0; font-size: 12px; color: #a0aec0;">
|
||||||
<p style="margin: 0; font-size: 12px; color: #666666;">
|
<a href="{{ website_link }}" style="color: #718096; text-decoration: underline;">Website</a> •
|
||||||
<a href="{{ website_link }}" style="color: #999999; text-decoration: none;">Visit Website</a>
|
<a href="{{ preferences_link }}" style="color: #718096; text-decoration: underline;">Preferences</a> •
|
||||||
<span style="color: #444444;"> • </span>
|
<a href="{{ unsubscribe_link }}" style="color: #718096; text-decoration: underline;">Unsubscribe</a>
|
||||||
<a href="{{ preferences_link }}" style="color: #999999; text-decoration: none;">Manage Preferences</a>
|
|
||||||
<span style="color: #444444;"> • </span>
|
|
||||||
<a href="{{ unsubscribe_link }}" style="color: #999999; text-decoration: none;">Unsubscribe</a>
|
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
{% if tracking_enabled %}
|
<p style="margin: 0; font-size: 11px; color: #cbd5e0;">
|
||||||
<!-- Privacy Notice -->
|
© {{ year }} Munich News Daily. Made with 🥨 in Bavaria.
|
||||||
<p style="margin: 20px 0 0 0; font-size: 11px; color: #666666; line-height: 1.4;">
|
|
||||||
This email contains tracking to measure engagement and improve our content.<br>
|
|
||||||
We respect your privacy and anonymize data after 90 days.
|
|
||||||
</p>
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
<p style="margin: 20px 0 0 0; font-size: 11px; color: #666666;">
|
|
||||||
© {{ year }} Munich News Daily. All rights reserved.
|
|
||||||
</p>
|
</p>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
||||||
</table>
|
</table>
|
||||||
<!-- End Main Container -->
|
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
</table>
|
</table>
|
||||||
<!-- End Wrapper Table -->
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
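For orientation, a minimal sketch of how the redesigned template might be rendered with Jinja2. The template path, loader setup, and all sample values are assumptions for illustration; the context keys (`category_sections`, `article_count`, `website_link`, `preferences_link`, `unsubscribe_link`, `year`) are taken from the variables the template itself references:

```python
from datetime import datetime
from jinja2 import Environment, FileSystemLoader

# Hypothetical template location; the repository's actual path may differ
env = Environment(loader=FileSystemLoader('templates'))
template = env.get_template('newsletter_template.html')

html = template.render(
    category_sections=[{
        'icon': '⚽',
        'name': 'Sports',
        'articles': [{
            'title': 'FC Bayern gewinnt',            # original (German) title
            'title_en': 'FC Bayern wins',             # optional translated title
            'link': 'https://example.com/article',
            'summary': 'AI-generated summary text ...',
            'source': 'Example News',
            'is_clustered': False,                    # clustered articles also carry 'sources'
        }],
    }],
    article_count=1,
    website_link='https://example.com',
    preferences_link='https://example.com/preferences',
    unsubscribe_link='https://example.com/unsubscribe',
    year=datetime.now().year,
)
print(html[:200])  # inspect the rendered email fragment
```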
@@ -166,8 +166,12 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
     # Get cluster summaries collection
     cluster_summaries_collection = db['cluster_summaries']

-    # If no categories specified, get all available categories
+    # If no categories specified, get all available categories from database
     if categories is None:
+        # Dynamically get categories from articles collection
+        categories = list(articles_collection.distinct('category'))
+        if not categories:
+            # Fallback to default categories if no articles exist yet
             categories = ['general', 'local', 'sports', 'science']

     articles = []
@@ -176,6 +180,7 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
     # Fetch articles for each category separately
     for category in categories:
         # Query for articles in this category from today
+        # Fetch more than needed to allow for source diversity filtering
         cursor = articles_collection.find({
             'summary': {'$exists': True, '$ne': None},
             'category': category,
@@ -183,9 +188,10 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
                 {'published_at': {'$gte': today_start}},
                 {'created_at': {'$gte': today_start}}
             ]
-        }).sort('created_at', -1).limit(articles_per_category)
+        }).sort('created_at', -1).limit(articles_per_category * 3)  # Fetch 3x to allow diversity

         category_articles = []
+        source_count = {}  # Track how many articles from each source

         for doc in cursor:
             # Double-check the date to ensure it's from today
@@ -268,15 +274,71 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
                 'is_clustered': False
             })

+        # Diversify sources: prioritize articles from different sources
+        # Sort by: clustered first, then by source diversity (fewer articles from same source)
+        diversified_articles = []
+        source_usage = {}
+
+        # First pass: add clustered articles (they represent multiple sources)
+        for article in category_articles:
+            if article.get('is_clustered'):
+                diversified_articles.append(article)
+
+        # Second pass: add non-clustered articles with source diversity
+        for article in category_articles:
+            if not article.get('is_clustered'):
+                source = article.get('source', 'unknown')
+                # Prefer sources we haven't used much yet
+                if source not in source_usage:
+                    source_usage[source] = 0
+
+                # Add article and track source usage
+                diversified_articles.append(article)
+                source_usage[source] += 1
+
+        # Sort by source diversity: clustered first, then by how many times we've used this source
+        diversified_articles.sort(key=lambda x: (
+            0 if x.get('is_clustered') else 1,  # Clustered first
+            -x.get('article_count', 1),  # More sources in cluster = higher priority
+            source_usage.get(x.get('source', 'unknown'), 0)  # Fewer from same source = higher priority
+        ))
+
+        # Take only the requested number per category
+        category_articles = diversified_articles[:articles_per_category]

         # Add this category's articles to the main list
         articles.extend(category_articles)

-    # Sort articles: clustered articles first (by source count), then by recency
-    # This prioritizes stories covered by multiple sources
-    articles.sort(key=lambda x: (
-        -1 if x.get('is_clustered') else 0,  # Clustered first
-        -x.get('article_count', 1),  # More sources = higher priority
-    ), reverse=True)
+    # Final sort with source diversity across all categories
+    # Prioritize: 1) Clustered articles, 2) Source diversity, 3) Recency
+    import random
+
+    # Group by clustered vs non-clustered
+    clustered = [a for a in articles if a.get('is_clustered')]
+    non_clustered = [a for a in articles if not a.get('is_clustered')]
+
+    # Sort clustered by article count (more sources = more important)
+    clustered.sort(key=lambda x: -x.get('article_count', 1))
+
+    # For non-clustered, shuffle within each category to add variety
+    # This prevents the same sources from always appearing first
+    from collections import defaultdict
+    by_category = defaultdict(list)
+    for article in non_clustered:
+        by_category[article.get('category', 'general')].append(article)
+
+    # Shuffle each category's articles to mix sources
+    for cat_articles in by_category.values():
+        random.shuffle(cat_articles)
+
+    # Reconstruct non-clustered list with shuffled articles
+    non_clustered = []
+    # Use all categories that exist in the articles
+    for cat in sorted(by_category.keys()):
+        non_clustered.extend(by_category[cat])
+
+    # Combine: clustered first, then shuffled non-clustered
+    articles = clustered + non_clustered

     return articles

@@ -345,7 +407,8 @@ def render_newsletter_html(articles, subscriber_categories=None, tracking_enable
         'general': {'name': 'Top Trending', 'icon': '🔥'},
         'local': {'name': 'Local Events', 'icon': '🏛️'},
         'sports': {'name': 'Sports', 'icon': '⚽'},
-        'science': {'name': 'Science & Tech', 'icon': '🔬'}
+        'science': {'name': 'Science & Tech', 'icon': '🔬'},
+        'business': {'name': 'Business', 'icon': '💼'}
     }

     for category, category_articles in sorted(articles_by_category.items()):
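A standalone sketch of the final ordering step above on toy data, to make the behavior concrete; the article dicts are hypothetical and carry only the fields the ordering actually reads (`is_clustered`, `article_count`, `source`, `category`):

```python
import random
from collections import defaultdict

# Hypothetical articles: two clustered stories and two single-source stories
articles = [
    {'title': 'A', 'source': 'Merkur', 'category': 'sports', 'is_clustered': False},
    {'title': 'B', 'source': 'TZ', 'category': 'sports', 'is_clustered': False},
    {'title': 'C', 'category': 'local', 'is_clustered': True, 'article_count': 3},
    {'title': 'D', 'category': 'local', 'is_clustered': True, 'article_count': 5},
]

# Clustered stories lead, ordered by how many sources cover them
clustered = [a for a in articles if a.get('is_clustered')]
non_clustered = [a for a in articles if not a.get('is_clustered')]
clustered.sort(key=lambda x: -x.get('article_count', 1))

# Single-source stories are shuffled within their category to vary which source leads
by_category = defaultdict(list)
for a in non_clustered:
    by_category[a.get('category', 'general')].append(a)
for cat_articles in by_category.values():
    random.shuffle(cat_articles)

ordered = clustered + [a for cat in sorted(by_category) for a in by_category[cat]]
print([a['title'] for a in ordered])  # e.g. ['D', 'C', 'A', 'B'] or ['D', 'C', 'B', 'A']
```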
tests/backend/test_personalization_system.py (new file, 221 lines)
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+"""
+Comprehensive test suite for the personalization system.
+Tests all 4 phases: keyword extraction, click tracking, interest profiling, and personalization.
+"""
+
+import sys
+from pymongo import MongoClient
+from datetime import datetime
+
+# Import services
+from services.tracking_service import create_newsletter_tracking
+from services.interest_profiling_service import (
+    update_user_interests,
+    get_user_interests,
+    get_top_interests,
+    build_interests_from_history
+)
+from services.personalization_service import (
+    calculate_article_score,
+    rank_articles_for_user,
+    select_personalized_articles,
+    get_personalization_stats
+)
+from config import Config
+
+# Connect to MongoDB
+client = MongoClient(Config.MONGODB_URI)
+db = client[Config.DB_NAME]
+
+articles_collection = db['articles']
+link_clicks_collection = db['link_clicks']
+user_interests_collection = db['user_interests']
+
+
+def test_phase1_keywords():
+    """Phase 1: Verify articles have keywords extracted"""
+    print("\n" + "="*60)
+    print("Phase 1: Keyword Extraction")
+    print("="*60)
+
+    articles_with_keywords = articles_collection.count_documents({
+        'keywords': {'$exists': True, '$ne': []}
+    })
+
+    if articles_with_keywords == 0:
+        print("❌ No articles with keywords found")
+        print("   Run a crawl first to extract keywords")
+        return False
+
+    sample = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
+    print(f"✓ Found {articles_with_keywords} articles with keywords")
+    print(f"  Sample: {sample.get('title', 'N/A')[:50]}...")
+    print(f"  Keywords: {sample.get('keywords', [])[:3]}")
+    return True
+
+
+def test_phase2_tracking():
+    """Phase 2: Verify tracking includes keywords and metadata"""
+    print("\n" + "="*60)
+    print("Phase 2: Click Tracking Enhancement")
+    print("="*60)
+
+    test_email = 'test-phase2@example.com'
+
+    # Clean up
+    link_clicks_collection.delete_many({'subscriber_email': test_email})
+
+    # Get article with keywords
+    article = articles_collection.find_one({'keywords': {'$exists': True, '$ne': []}})
+
+    if not article:
+        print("❌ No articles found")
+        return False
+
+    # Create tracking
+    tracking_data = create_newsletter_tracking(
+        newsletter_id='test-phase2',
+        subscriber_email=test_email,
+        article_links=[{
+            'url': article['link'],
+            'title': article.get('title', '')
+        }]
+    )
+
+    # Verify tracking record
+    tracking_id = list(tracking_data['link_tracking_map'].values())[0]
+    tracking_record = link_clicks_collection.find_one({'tracking_id': tracking_id})
+
+    has_metadata = (
+        tracking_record.get('article_id') is not None and
+        tracking_record.get('category') is not None and
+        len(tracking_record.get('keywords', [])) > 0
+    )
+
+    # Clean up
+    link_clicks_collection.delete_many({'subscriber_email': test_email})
+    db['newsletter_sends'].delete_many({'subscriber_email': test_email})
+
+    if has_metadata:
+        print(f"✓ Tracking records include metadata")
+        print(f"  Article ID: {tracking_record.get('article_id')}")
+        print(f"  Category: {tracking_record.get('category')}")
+        print(f"  Keywords: {len(tracking_record.get('keywords', []))} keywords")
+        return True
+    else:
+        print("❌ Tracking records missing metadata")
+        return False
+
+
+def test_phase3_profiling():
+    """Phase 3: Verify interest profiles are built from clicks"""
+    print("\n" + "="*60)
+    print("Phase 3: User Interest Profiling")
+    print("="*60)
+
+    test_email = 'test-phase3@example.com'
+
+    # Clean up
+    user_interests_collection.delete_many({'email': test_email})
+
+    # Create profile
+    update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
+    update_user_interests(test_email, ['Transportation', 'Munich'], 'local')
+
+    # Verify profile
+    profile = get_user_interests(test_email)
+
+    # Clean up
+    user_interests_collection.delete_many({'email': test_email})
+
+    if profile and profile['total_clicks'] == 2:
+        print(f"✓ Interest profile created")
+        print(f"  Total clicks: {profile['total_clicks']}")
+        print(f"  Categories: {len(profile.get('categories', {}))}")
+        print(f"  Keywords: {len(profile.get('keywords', {}))}")
+        return True
+    else:
+        print("❌ Interest profile not created correctly")
+        return False
+
+
+def test_phase4_personalization():
+    """Phase 4: Verify articles are ranked by user interests"""
+    print("\n" + "="*60)
+    print("Phase 4: Personalized Newsletter Generation")
+    print("="*60)
+
+    test_email = 'test-phase4@example.com'
+
+    # Clean up
+    user_interests_collection.delete_many({'email': test_email})
+
+    # Get articles
+    articles = list(articles_collection.find(
+        {'keywords': {'$exists': True, '$ne': []}},
+        limit=5
+    ))
+
+    if len(articles) < 3:
+        print("❌ Not enough articles found")
+        return False
+
+    # Create profile
+    update_user_interests(test_email, ['Bayern Munich', 'Football'], 'sports')
+
+    # Rank articles
+    ranked = rank_articles_for_user(articles, test_email)
+
+    # Select personalized
+    selected = select_personalized_articles(articles, test_email, max_articles=3)
+
+    # Clean up
+    user_interests_collection.delete_many({'email': test_email})
+
+    has_scores = all('personalization_score' in a for a in selected)
+
+    if has_scores and len(selected) > 0:
+        print(f"✓ Articles ranked and selected")
+        print(f"  Total ranked: {len(ranked)}")
+        print(f"  Selected: {len(selected)}")
+        print(f"  Top score: {selected[0].get('personalization_score', 0):.3f}")
+        return True
+    else:
+        print("❌ Personalization failed")
+        return False
+
+
+def main():
+    """Run all personalization tests"""
+    print("\n" + "="*60)
+    print("PERSONALIZATION SYSTEM TEST SUITE")
+    print("="*60)
+
+    results = {
+        'Phase 1: Keyword Extraction': test_phase1_keywords(),
+        'Phase 2: Click Tracking': test_phase2_tracking(),
+        'Phase 3: Interest Profiling': test_phase3_profiling(),
+        'Phase 4: Personalization': test_phase4_personalization()
+    }
+
+    print("\n" + "="*60)
+    print("TEST RESULTS")
+    print("="*60)
+
+    for phase, passed in results.items():
+        status = "✅ PASS" if passed else "❌ FAIL"
+        print(f"{status} - {phase}")
+
+    all_passed = all(results.values())
+
+    if all_passed:
+        print("\n🎉 All personalization tests PASSED!")
+        return 0
+    else:
+        print("\n❌ Some tests FAILED")
+        return 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())
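Note that this suite exercises the live database through `Config.MONGODB_URI` rather than mocks, so it presumably has to be run directly (e.g. `python tests/backend/test_personalization_system.py`) from a working directory where the `services` and `config` imports resolve, with MongoDB up and at least one crawl completed so that keyword-bearing articles exist; each phase cleans up its own test subscribers afterwards.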