update
This commit is contained in:
28
backend/.env.example
Normal file
28
backend/.env.example
Normal file
@@ -0,0 +1,28 @@
|
||||
# MongoDB Configuration
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
|
||||
# Email Configuration (Required)
|
||||
SMTP_SERVER=smtp.gmail.com
|
||||
SMTP_PORT=587
|
||||
EMAIL_USER=your-email@gmail.com
|
||||
EMAIL_PASSWORD=your-app-password
|
||||
|
||||
# Newsletter Settings
|
||||
NEWSLETTER_MAX_ARTICLES=10
|
||||
NEWSLETTER_HOURS_LOOKBACK=24
|
||||
WEBSITE_URL=http://localhost:3000
|
||||
|
||||
# Tracking Configuration
|
||||
TRACKING_ENABLED=true
|
||||
TRACKING_API_URL=http://localhost:5001
|
||||
TRACKING_DATA_RETENTION_DAYS=90
|
||||
|
||||
# Ollama Configuration (AI Summarization)
|
||||
OLLAMA_ENABLED=true
|
||||
OLLAMA_BASE_URL=http://127.0.0.1:11434
|
||||
OLLAMA_MODEL=phi3:latest
|
||||
OLLAMA_TIMEOUT=120
|
||||
SUMMARY_MAX_WORDS=150
|
||||
|
||||
# Flask Server Configuration
|
||||
FLASK_PORT=5001
|
||||
@@ -1,143 +0,0 @@
|
||||
# MongoDB Database Schema
|
||||
|
||||
This document describes the MongoDB collections and their structure for Munich News Daily.
|
||||
|
||||
## Collections
|
||||
|
||||
### 1. Articles Collection (`articles`)
|
||||
|
||||
Stores all news articles aggregated from Munich news sources.
|
||||
|
||||
**Document Structure:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId, // Auto-generated MongoDB ID
|
||||
title: String, // Article title (required)
|
||||
author: String, // Article author (optional, extracted during crawl)
|
||||
link: String, // Article URL (required, unique)
|
||||
content: String, // Full article content (no length limit)
|
||||
summary: String, // AI-generated English summary (≤150 words)
|
||||
word_count: Number, // Word count of full content
|
||||
summary_word_count: Number, // Word count of AI summary
|
||||
source: String, // News source name (e.g., "Süddeutsche Zeitung München")
|
||||
published_at: String, // Original publication date from RSS feed or crawled
|
||||
crawled_at: DateTime, // When article content was crawled (UTC)
|
||||
summarized_at: DateTime, // When AI summary was generated (UTC)
|
||||
created_at: DateTime // When article was added to database (UTC)
|
||||
}
|
||||
```
|
||||
|
||||
**Indexes:**
|
||||
- `link` - Unique index to prevent duplicate articles
|
||||
- `created_at` - Index for efficient sorting by date
|
||||
|
||||
**Example Document:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId("507f1f77bcf86cd799439011"),
|
||||
title: "New U-Bahn Line Opens in Munich",
|
||||
author: "Max Mustermann",
|
||||
link: "https://www.sueddeutsche.de/muenchen/ubahn-1.123456",
|
||||
content: "The new U-Bahn line connecting the city center with the airport opened today. Mayor Dieter Reiter attended the opening ceremony... [full article text continues]",
|
||||
summary: "Munich's new U-Bahn line connecting the city center to the airport opened today with Mayor Dieter Reiter in attendance. The line features 10 stations and runs every 10 minutes during peak hours, significantly reducing travel time. Construction took five years and cost approximately 2 billion euros.",
|
||||
word_count: 1250,
|
||||
summary_word_count: 48,
|
||||
source: "Süddeutsche Zeitung München",
|
||||
published_at: "Mon, 15 Jan 2024 10:00:00 +0100",
|
||||
crawled_at: ISODate("2024-01-15T09:30:00.000Z"),
|
||||
summarized_at: ISODate("2024-01-15T09:30:15.000Z"),
|
||||
created_at: ISODate("2024-01-15T09:00:00.000Z")
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Subscribers Collection (`subscribers`)
|
||||
|
||||
Stores all newsletter subscribers.
|
||||
|
||||
**Document Structure:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId, // Auto-generated MongoDB ID
|
||||
email: String, // Subscriber email (required, unique, lowercase)
|
||||
subscribed_at: DateTime, // When user subscribed (UTC)
|
||||
status: String // Subscription status: 'active' or 'inactive'
|
||||
}
|
||||
```
|
||||
|
||||
**Indexes:**
|
||||
- `email` - Unique index for email lookups and preventing duplicates
|
||||
- `subscribed_at` - Index for analytics and sorting
|
||||
|
||||
**Example Document:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId("507f1f77bcf86cd799439012"),
|
||||
email: "user@example.com",
|
||||
subscribed_at: ISODate("2024-01-15T08:30:00.000Z"),
|
||||
status: "active"
|
||||
}
|
||||
```
|
||||
|
||||
## Design Decisions
|
||||
|
||||
### Why MongoDB?
|
||||
|
||||
1. **Flexibility**: Easy to add new fields without schema migrations
|
||||
2. **Scalability**: Handles large volumes of articles and subscribers efficiently
|
||||
3. **Performance**: Indexes on frequently queried fields (link, email, created_at)
|
||||
4. **Document Model**: Natural fit for news articles and subscriber data
|
||||
|
||||
### Schema Choices
|
||||
|
||||
1. **Unique Link Index**: Prevents duplicate articles from being stored, even if fetched multiple times
|
||||
2. **Status Field**: Soft delete for subscribers (set to 'inactive' instead of deleting) - allows for analytics and easy re-subscription
|
||||
3. **UTC Timestamps**: All dates stored in UTC for consistency across timezones
|
||||
4. **Lowercase Emails**: Emails stored in lowercase to prevent case-sensitivity issues
|
||||
|
||||
### Future Enhancements
|
||||
|
||||
Potential fields to add in the future:
|
||||
|
||||
**Articles:**
|
||||
- `category`: String (e.g., "politics", "sports", "culture")
|
||||
- `tags`: Array of Strings
|
||||
- `image_url`: String
|
||||
- `sent_in_newsletter`: Boolean (track if article was sent)
|
||||
- `sent_at`: DateTime (when article was included in newsletter)
|
||||
|
||||
**Subscribers:**
|
||||
- `preferences`: Object (newsletter frequency, categories, etc.)
|
||||
- `last_sent_at`: DateTime (last newsletter sent date)
|
||||
- `unsubscribed_at`: DateTime (when user unsubscribed)
|
||||
- `verification_token`: String (for email verification)
|
||||
|
||||
|
||||
|
||||
## AI Summarization Workflow
|
||||
|
||||
When the crawler processes an article:
|
||||
|
||||
1. **Extract Content**: Full article text is extracted from the webpage
|
||||
2. **Summarize with Ollama**: If `OLLAMA_ENABLED=true`, the content is sent to Ollama for summarization
|
||||
3. **Store Both**: Both the original `content` and AI-generated `summary` are stored
|
||||
4. **Fallback**: If Ollama is unavailable or fails, only the original content is stored
|
||||
|
||||
### Summary Field Details
|
||||
|
||||
- **Language**: Always in English, regardless of source article language
|
||||
- **Length**: Maximum 150 words
|
||||
- **Format**: Plain text, concise and clear
|
||||
- **Purpose**: Quick preview for newsletters and frontend display
|
||||
|
||||
### Querying Articles
|
||||
|
||||
```javascript
|
||||
// Get articles with AI summaries
|
||||
db.articles.find({ summary: { $exists: true, $ne: null } })
|
||||
|
||||
// Get articles without summaries
|
||||
db.articles.find({ summary: { $exists: false } })
|
||||
|
||||
// Count summarized articles
|
||||
db.articles.countDocuments({ summary: { $exists: true, $ne: null } })
|
||||
```
|
||||
20
backend/Dockerfile
Normal file
20
backend/Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application files
|
||||
COPY . .
|
||||
|
||||
# Set timezone to Berlin
|
||||
ENV TZ=Europe/Berlin
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
# Expose Flask port
|
||||
EXPOSE 5001
|
||||
|
||||
# Run the Flask application
|
||||
CMD ["python", "-u", "app.py"]
|
||||
@@ -1,98 +0,0 @@
|
||||
# Backend Structure
|
||||
|
||||
The backend has been modularized for better maintainability and scalability.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
backend/
|
||||
├── app.py # Main Flask application entry point
|
||||
├── config.py # Configuration management
|
||||
├── database.py # Database connection and initialization
|
||||
├── requirements.txt # Python dependencies
|
||||
├── .env # Environment variables
|
||||
│
|
||||
├── routes/ # API route handlers (blueprints)
|
||||
│ ├── __init__.py
|
||||
│ ├── subscription_routes.py # /api/subscribe, /api/unsubscribe
|
||||
│ ├── news_routes.py # /api/news, /api/stats
|
||||
│ ├── rss_routes.py # /api/rss-feeds (CRUD operations)
|
||||
│ └── ollama_routes.py # /api/ollama/* (AI features)
|
||||
│
|
||||
└── services/ # Business logic layer
|
||||
├── __init__.py
|
||||
├── news_service.py # News fetching and storage logic
|
||||
├── email_service.py # Newsletter email sending
|
||||
└── ollama_service.py # Ollama AI integration
|
||||
```
|
||||
|
||||
## Key Components
|
||||
|
||||
### app.py
|
||||
- Main Flask application
|
||||
- Registers all blueprints
|
||||
- Minimal code, just wiring things together
|
||||
|
||||
### config.py
|
||||
- Centralized configuration
|
||||
- Loads environment variables
|
||||
- Single source of truth for all settings
|
||||
|
||||
### database.py
|
||||
- MongoDB connection setup
|
||||
- Collection definitions
|
||||
- Database initialization with indexes
|
||||
|
||||
### routes/
|
||||
Each route file is a Flask Blueprint handling specific API endpoints:
|
||||
- **subscription_routes.py**: User subscription management
|
||||
- **news_routes.py**: News fetching and statistics
|
||||
- **rss_routes.py**: RSS feed management (add/remove/list/toggle)
|
||||
- **ollama_routes.py**: AI/Ollama integration endpoints
|
||||
|
||||
### services/
|
||||
Business logic separated from route handlers:
|
||||
- **news_service.py**: Fetches news from RSS feeds, saves to database
|
||||
- **email_service.py**: Sends newsletter emails to subscribers
|
||||
- **ollama_service.py**: Communicates with Ollama AI server
|
||||
|
||||
## Benefits of This Structure
|
||||
|
||||
1. **Separation of Concerns**: Routes handle HTTP, services handle business logic
|
||||
2. **Testability**: Each module can be tested independently
|
||||
3. **Maintainability**: Easy to find and modify specific functionality
|
||||
4. **Scalability**: Easy to add new routes or services
|
||||
5. **Reusability**: Services can be used by multiple routes
|
||||
|
||||
## Adding New Features
|
||||
|
||||
### To add a new API endpoint:
|
||||
1. Create a new route file in `routes/` or add to existing one
|
||||
2. Create a Blueprint and define routes
|
||||
3. Register the blueprint in `app.py`
|
||||
|
||||
### To add new business logic:
|
||||
1. Create a new service file in `services/`
|
||||
2. Import and use in your route handlers
|
||||
|
||||
### Example:
|
||||
```python
|
||||
# services/my_service.py
|
||||
def my_business_logic():
|
||||
return "Hello"
|
||||
|
||||
# routes/my_routes.py
|
||||
from flask import Blueprint
|
||||
from services.my_service import my_business_logic
|
||||
|
||||
my_bp = Blueprint('my', __name__)
|
||||
|
||||
@my_bp.route('/api/my-endpoint')
|
||||
def my_endpoint():
|
||||
result = my_business_logic()
|
||||
return {'message': result}
|
||||
|
||||
# app.py
|
||||
from routes.my_routes import my_bp
|
||||
app.register_blueprint(my_bp)
|
||||
```
|
||||
@@ -7,6 +7,8 @@ from routes.news_routes import news_bp
|
||||
from routes.rss_routes import rss_bp
|
||||
from routes.ollama_routes import ollama_bp
|
||||
from routes.newsletter_routes import newsletter_bp
|
||||
from routes.tracking_routes import tracking_bp
|
||||
from routes.analytics_routes import analytics_bp
|
||||
|
||||
# Initialize Flask app
|
||||
app = Flask(__name__)
|
||||
@@ -21,9 +23,17 @@ app.register_blueprint(news_bp)
|
||||
app.register_blueprint(rss_bp)
|
||||
app.register_blueprint(ollama_bp)
|
||||
app.register_blueprint(newsletter_bp)
|
||||
app.register_blueprint(tracking_bp)
|
||||
app.register_blueprint(analytics_bp)
|
||||
|
||||
# Health check endpoint
|
||||
@app.route('/health')
|
||||
def health():
|
||||
return {'status': 'healthy', 'service': 'munich-news-backend'}, 200
|
||||
|
||||
# Print configuration
|
||||
Config.print_config()
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True, port=Config.FLASK_PORT, host='127.0.0.1')
|
||||
# Use 0.0.0.0 to allow Docker container access
|
||||
app.run(debug=True, port=Config.FLASK_PORT, host='0.0.0.0')
|
||||
|
||||
@@ -40,6 +40,11 @@ class Config:
|
||||
# Flask
|
||||
FLASK_PORT = int(os.getenv('FLASK_PORT', '5000'))
|
||||
|
||||
# Tracking
|
||||
TRACKING_ENABLED = os.getenv('TRACKING_ENABLED', 'true').lower() == 'true'
|
||||
TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
|
||||
TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))
|
||||
|
||||
@classmethod
|
||||
def print_config(cls):
|
||||
"""Print configuration (without sensitive data)"""
|
||||
@@ -50,3 +55,5 @@ class Config:
|
||||
print(f" Ollama Base URL: {cls.OLLAMA_BASE_URL}")
|
||||
print(f" Ollama Model: {cls.OLLAMA_MODEL}")
|
||||
print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}")
|
||||
print(f" Tracking Enabled: {cls.TRACKING_ENABLED}")
|
||||
print(f" Tracking API URL: {cls.TRACKING_API_URL}")
|
||||
|
||||
@@ -11,6 +11,11 @@ articles_collection = db['articles']
|
||||
subscribers_collection = db['subscribers']
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
# Tracking Collections
|
||||
newsletter_sends_collection = db['newsletter_sends']
|
||||
link_clicks_collection = db['link_clicks']
|
||||
subscriber_activity_collection = db['subscriber_activity']
|
||||
|
||||
|
||||
def init_db():
|
||||
"""Initialize database with indexes"""
|
||||
@@ -25,6 +30,9 @@ def init_db():
|
||||
# Create unique index on RSS feed URLs
|
||||
rss_feeds_collection.create_index('url', unique=True)
|
||||
|
||||
# Initialize tracking collections indexes
|
||||
init_tracking_collections()
|
||||
|
||||
# Initialize default RSS feeds if collection is empty
|
||||
if rss_feeds_collection.count_documents({}) == 0:
|
||||
default_feeds = [
|
||||
@@ -51,3 +59,37 @@ def init_db():
|
||||
print(f"Initialized {len(default_feeds)} default RSS feeds")
|
||||
|
||||
print("Database initialized with indexes")
|
||||
|
||||
|
||||
def init_tracking_collections():
    """Create the indexes that back the email tracking system.

    Sets up three collections: newsletter_sends (per-recipient send
    records), link_clicks (per-link click records) and
    subscriber_activity (aggregated per-subscriber engagement).
    Unique indexes protect the lookup keys used by the pixel/click
    endpoints; the remaining indexes serve analytics queries.
    create_index is idempotent, so re-running this is safe.
    """
    # (collection, field, unique?) — one entry per required index.
    index_plan = (
        # Fast pixel lookups + analytics on the sends collection.
        (newsletter_sends_collection, 'tracking_id', True),
        (newsletter_sends_collection, 'newsletter_id', False),
        (newsletter_sends_collection, 'subscriber_email', False),
        (newsletter_sends_collection, 'sent_at', False),
        # Fast redirect lookups + article/subscriber analytics.
        (link_clicks_collection, 'tracking_id', True),
        (link_clicks_collection, 'newsletter_id', False),
        (link_clicks_collection, 'article_url', False),
        (link_clicks_collection, 'subscriber_email', False),
        # Per-subscriber aggregate lookups and status filtering.
        (subscriber_activity_collection, 'email', True),
        (subscriber_activity_collection, 'status', False),
        (subscriber_activity_collection, 'last_opened_at', False),
    )

    for collection, field, unique in index_plan:
        collection.create_index(field, unique=unique)

    print("Tracking collections initialized with indexes")
|
||||
|
||||
@@ -30,3 +30,12 @@ OLLAMA_TIMEOUT=30
|
||||
# Port for Flask server (default: 5001 to avoid AirPlay conflict on macOS)
|
||||
FLASK_PORT=5001
|
||||
|
||||
# Tracking Configuration
|
||||
# Enable/disable email tracking features (true/false)
|
||||
TRACKING_ENABLED=true
|
||||
# Base URL for tracking API (used in tracking pixel and link URLs)
|
||||
# In production, use your actual domain (e.g., https://yourdomain.com)
|
||||
TRACKING_API_URL=http://localhost:5001
|
||||
# Number of days to retain tracking data before anonymization
|
||||
TRACKING_DATA_RETENTION_DAYS=90
|
||||
|
||||
|
||||
107
backend/init_tracking_db.py
Normal file
107
backend/init_tracking_db.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Database initialization script for email tracking system.
|
||||
|
||||
This script creates the necessary MongoDB collections and indexes
|
||||
for tracking email opens and link clicks in the newsletter system.
|
||||
|
||||
Collections created:
|
||||
- newsletter_sends: Tracks each newsletter sent to each subscriber
|
||||
- link_clicks: Tracks individual link clicks
|
||||
- subscriber_activity: Aggregated activity status for each subscriber
|
||||
|
||||
Usage:
|
||||
python init_tracking_db.py
|
||||
"""
|
||||
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
from config import Config
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def init_tracking_database():
    """Connect to MongoDB and build all tracking-collection indexes.

    Creates/verifies indexes on newsletter_sends, link_clicks and
    subscriber_activity, then prints document counts and the resulting
    index list for manual verification. Progress is reported on stdout.
    """
    print("Connecting to MongoDB...")
    client = MongoClient(Config.MONGODB_URI)
    db = client[Config.DB_NAME]

    print(f"Connected to database: {Config.DB_NAME}")

    # Insertion order matters: it drives the order of the statistics
    # printout below, matching the setup order.
    collections = {
        'newsletter_sends': db['newsletter_sends'],
        'link_clicks': db['link_clicks'],
        'subscriber_activity': db['subscriber_activity'],
    }

    # (section label, collection, [(field, unique?), ...]) per collection.
    plan = [
        ("Newsletter Sends", collections['newsletter_sends'], [
            ('tracking_id', True),       # fast pixel/click lookups
            ('newsletter_id', False),    # analytics queries
            ('subscriber_email', False), # user activity queries
            ('sent_at', False),          # time-based queries
        ]),
        ("Link Clicks", collections['link_clicks'], [
            ('tracking_id', True),       # fast redirect lookups
            ('newsletter_id', False),    # analytics queries
            ('article_url', False),      # article performance queries
            ('subscriber_email', False), # user activity queries
        ]),
        ("Subscriber Activity", collections['subscriber_activity'], [
            ('email', True),             # fast per-subscriber lookups
            ('status', False),           # filter by activity level
            ('last_opened_at', False),   # time-based queries
        ]),
    ]

    for section, coll, specs in plan:
        print(f"\n=== Setting up {section} Collection ===")
        for field, unique in specs:
            coll.create_index(field, unique=unique)
            kind = "unique index" if unique else "index"
            print(f"✓ Created {kind} on '{field}'")

    # Document counts, in the same order the collections were set up.
    print("\n=== Collection Statistics ===")
    for name, coll in collections.items():
        print(f"{name}: {coll.count_documents({})} documents")

    # Dump every index so the operator can eyeball the result.
    print("\n=== Index Verification ===")
    for section, coll, _specs in plan:
        print(f"\n{section} Indexes:")
        for index in coll.list_indexes():
            print(f" - {index['name']}: {index.get('key', {})}")

    print("\n✅ Tracking database initialization complete!")

    client.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: initialize the tracking collections, reporting
    # any failure with a full traceback and a non-zero exit status.
    try:
        init_tracking_database()
    except Exception as e:
        print(f"\n❌ Error initializing tracking database: {e}")
        import traceback
        traceback.print_exc()
        # `exit()` is injected by the `site` module and is not guaranteed
        # to exist (e.g. under `python -S` or in frozen builds); raising
        # SystemExit is the portable equivalent and still sets status 1.
        raise SystemExit(1)
|
||||
127
backend/routes/analytics_routes.py
Normal file
127
backend/routes/analytics_routes.py
Normal file
@@ -0,0 +1,127 @@
|
||||
"""
|
||||
Analytics routes for email tracking metrics and subscriber engagement.
|
||||
"""
|
||||
|
||||
from flask import Blueprint, jsonify, request
|
||||
from services.analytics_service import (
|
||||
get_newsletter_metrics,
|
||||
get_article_performance,
|
||||
get_subscriber_activity_status,
|
||||
update_subscriber_activity_statuses
|
||||
)
|
||||
from database import subscriber_activity_collection
|
||||
|
||||
analytics_bp = Blueprint('analytics', __name__)
|
||||
|
||||
|
||||
@analytics_bp.route('/api/analytics/newsletter/<newsletter_id>', methods=['GET'])
def get_newsletter_analytics(newsletter_id):
    """Return aggregate engagement metrics for one newsletter batch.

    Args:
        newsletter_id: Unique identifier for the newsletter batch.

    Returns:
        200 with the metrics payload (total_sent, total_opened, open_rate,
        total_clicks, unique_clickers, click_through_rate), or 500 with an
        ``{'error': ...}`` body when the lookup fails.
    """
    try:
        return jsonify(get_newsletter_metrics(newsletter_id)), 200
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
|
||||
|
||||
|
||||
@analytics_bp.route('/api/analytics/article/<path:article_url>', methods=['GET'])
def get_article_analytics(article_url):
    """Return click-performance metrics for a single article.

    Args:
        article_url: The original article URL, captured as a path
            parameter so URLs containing slashes are accepted.

    Returns:
        200 with the performance payload (total_sent, total_clicks,
        click_rate, unique_clickers, newsletters), or 500 with an
        ``{'error': ...}`` body when the lookup fails.
    """
    try:
        return jsonify(get_article_performance(article_url)), 200
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
|
||||
|
||||
|
||||
@analytics_bp.route('/api/analytics/subscriber/<email>', methods=['GET'])
def get_subscriber_analytics(email):
    """Return activity status and engagement metrics for one subscriber.

    Args:
        email: Subscriber email address.

    Returns:
        200 with the subscriber's activity record (status, last_opened_at,
        last_clicked_at, total_opens, total_clicks, newsletters_received,
        newsletters_opened). When no detailed record exists yet, a zeroed
        default payload with the computed status is returned instead.
        500 with an ``{'error': ...}`` body on failure.
    """
    try:
        # Status is computed regardless of whether a stored record exists.
        status = get_subscriber_activity_status(email)

        # _id is dropped from the projection so the dict is JSON-safe.
        record = subscriber_activity_collection.find_one(
            {'email': email},
            {'_id': 0}
        )

        if not record:
            # No detailed activity stored yet — serve a zeroed skeleton.
            return jsonify({
                'email': email,
                'status': status,
                'last_opened_at': None,
                'last_clicked_at': None,
                'total_opens': 0,
                'total_clicks': 0,
                'newsletters_received': 0,
                'newsletters_opened': 0
            }), 200

        # Datetimes aren't JSON-serializable as-is; convert to ISO strings.
        for field in ('last_opened_at', 'last_clicked_at', 'updated_at'):
            value = record.get(field)
            if value:
                record[field] = value.isoformat()

        return jsonify(record), 200

    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
|
||||
|
||||
|
||||
@analytics_bp.route('/api/analytics/update-activity', methods=['POST'])
def update_activity_statuses():
    """Trigger a batch refresh of subscriber activity statuses.

    Recomputes engagement metrics for all subscribers in the
    subscriber_activity collection.

    Returns:
        200 with ``{'success', 'updated_count', 'message'}`` on success,
        500 with an ``{'error': ...}`` body on failure.
    """
    try:
        count = update_subscriber_activity_statuses()
        payload = {
            'success': True,
            'updated_count': count,
            'message': f'Updated activity status for {count} subscribers'
        }
        return jsonify(payload), 200
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
|
||||
285
backend/routes/tracking_routes.py
Normal file
285
backend/routes/tracking_routes.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""
|
||||
Tracking routes for email open and link click tracking.
|
||||
"""
|
||||
|
||||
from flask import Blueprint, request, redirect, make_response, jsonify
|
||||
from datetime import datetime
|
||||
import base64
|
||||
from database import newsletter_sends_collection, link_clicks_collection
|
||||
from services.tracking_service import delete_subscriber_tracking_data, anonymize_old_tracking_data
|
||||
from config import Config
|
||||
|
||||
tracking_bp = Blueprint('tracking', __name__)
|
||||
|
||||
# 1x1 transparent PNG image (43 bytes)
|
||||
TRANSPARENT_PNG = base64.b64decode(
|
||||
'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII='
|
||||
)
|
||||
|
||||
|
||||
@tracking_bp.route('/api/track/pixel/<tracking_id>', methods=['GET'])
def track_pixel(tracking_id):
    """Serve the 1x1 transparent PNG and record an email-open event.

    Repeat opens bump ``open_count`` and ``last_opened_at``;
    ``first_opened_at`` is written only on the first open. Any tracking
    failure (including an unknown tracking_id) is swallowed so the pixel
    always renders inside the email client.

    Args:
        tracking_id: Unique tracking ID for the newsletter send.

    Returns:
        Response: the 1x1 transparent PNG with no-cache headers.
    """
    try:
        record = newsletter_sends_collection.find_one({'tracking_id': tracking_id})

        if record:
            # NOTE(review): naive UTC timestamp, matching the rest of the
            # tracking code — confirm before switching to aware datetimes.
            now = datetime.utcnow()

            changes = {
                'opened': True,
                'last_opened_at': now,
                'user_agent': request.headers.get('User-Agent', ''),
            }
            # Only the very first open sets first_opened_at.
            if not record.get('opened'):
                changes['first_opened_at'] = now

            newsletter_sends_collection.update_one(
                {'tracking_id': tracking_id},
                {
                    '$set': changes,
                    '$inc': {'open_count': 1}
                }
            )
    except Exception as e:
        # Never fail the pixel response because tracking broke.
        print(f"Error tracking pixel for {tracking_id}: {str(e)}")

    response = make_response(TRANSPARENT_PNG)
    # No-cache headers so each open re-requests the pixel.
    for header, value in (
        ('Content-Type', 'image/png'),
        ('Cache-Control', 'no-cache, no-store, must-revalidate'),
        ('Pragma', 'no-cache'),
        ('Expires', '0'),
    ):
        response.headers[header] = value

    return response
|
||||
|
||||
|
||||
@tracking_bp.route('/api/track/click/<tracking_id>', methods=['GET'])
def track_click(tracking_id):
    """Record a link click and redirect to the original article URL.

    An unknown tracking_id or any tracking failure falls back to
    redirecting to the configured base URL so the reader always lands
    somewhere.

    Args:
        tracking_id: Unique tracking ID for the article link.

    Returns:
        Response: 302 redirect to the article URL, or to the fallback.
    """
    # Fallback destination when the tracking record can't be resolved.
    destination = Config.TRACKING_API_URL or 'http://localhost:5001'

    try:
        record = link_clicks_collection.find_one({'tracking_id': tracking_id})

        if record:
            destination = record.get('article_url', destination)

            link_clicks_collection.update_one(
                {'tracking_id': tracking_id},
                {
                    '$set': {
                        'clicked': True,
                        # NOTE(review): naive UTC, consistent with track_pixel.
                        'clicked_at': datetime.utcnow(),
                        'user_agent': request.headers.get('User-Agent', ''),
                    }
                }
            )
    except Exception as e:
        # Tracking failures must not block the redirect.
        print(f"Error tracking click for {tracking_id}: {str(e)}")

    return redirect(destination, code=302)
|
||||
|
||||
|
||||
|
||||
@tracking_bp.route('/api/tracking/subscriber/<email>', methods=['DELETE'])
def delete_subscriber_data(email):
    """Delete every tracking record tied to one subscriber (GDPR erasure).

    Delegates to the tracking service, which removes the subscriber's
    records across the tracking collections.

    Args:
        email: Email address of the subscriber.

    Returns:
        200 with per-collection deletion counts, or 500 with the error.
    """
    try:
        deleted = delete_subscriber_tracking_data(email)

        return jsonify({
            'success': True,
            'message': f'All tracking data deleted for {email}',
            'deleted_counts': deleted
        }), 200

    except Exception as exc:
        return jsonify({
            'success': False,
            'error': str(exc)
        }), 500
|
||||
|
||||
|
||||
@tracking_bp.route('/api/tracking/anonymize', methods=['POST'])
def anonymize_tracking_data():
    """Anonymize tracking data older than the retention period.

    Removes email addresses from old tracking records while preserving
    aggregated metrics. Default retention period is 90 days.

    Request body (optional JSON):
        {"retention_days": 90}

    Returns:
        200 with anonymization counts, 400 on invalid retention_days,
        500 on internal failure.
    """
    try:
        # silent=True: a missing, malformed, or non-JSON body yields None
        # (instead of raising, which modern Flask does for a wrong
        # Content-Type) so we can fall back to the defaults gracefully.
        data = request.get_json(silent=True) or {}
        if not isinstance(data, dict):
            # A JSON array/scalar body has no fields to read — treat it
            # like an empty body rather than crashing on .get().
            data = {}
        retention_days = data.get('retention_days', 90)

        # bool is a subclass of int, so reject True/False explicitly —
        # otherwise a JSON `true` would silently pass as 1 day.
        if (isinstance(retention_days, bool)
                or not isinstance(retention_days, int)
                or retention_days < 1):
            return jsonify({
                'success': False,
                'error': 'retention_days must be a positive integer'
            }), 400

        result = anonymize_old_tracking_data(retention_days)

        return jsonify({
            'success': True,
            'message': f'Anonymized tracking data older than {retention_days} days',
            'anonymized_counts': result
        }), 200

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
|
||||
|
||||
|
||||
|
||||
@tracking_bp.route('/api/tracking/subscriber/<email>/opt-out', methods=['POST'])
def opt_out_tracking(email):
    """
    Opt a subscriber out of tracking.

    Flips the subscriber's tracking_enabled flag to False so that no
    future email-open or link-click tracking is recorded for them.

    Args:
        email: Email address of the subscriber

    Returns:
        JSON confirmation (200), not-found error (404), or failure (500)
    """
    try:
        from database import subscribers_collection

        # Flip the flag on the existing subscriber document only —
        # upsert=False so unknown addresses are not silently created.
        update_result = subscribers_collection.update_one(
            {'email': email},
            {'$set': {'tracking_enabled': False}},
            upsert=False
        )

        if update_result.matched_count == 0:
            return jsonify({
                'success': False,
                'error': f'Subscriber {email} not found'
            }), 404

        return jsonify({
            'success': True,
            'message': f'Subscriber {email} has opted out of tracking'
        }), 200

    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
|
||||
|
||||
|
||||
@tracking_bp.route('/api/tracking/subscriber/<email>/opt-in', methods=['POST'])
def opt_in_tracking(email):
    """
    Opt a subscriber back into tracking.

    Flips the subscriber's tracking_enabled flag to True so that email
    opens and link clicks are tracked again.

    Args:
        email: Email address of the subscriber

    Returns:
        JSON confirmation (200), not-found error (404), or failure (500)
    """
    try:
        from database import subscribers_collection

        # Only update an existing subscriber document (upsert=False);
        # opting in an unknown address should be a 404, not an insert.
        update_result = subscribers_collection.update_one(
            {'email': email},
            {'$set': {'tracking_enabled': True}},
            upsert=False
        )

        if update_result.matched_count == 0:
            return jsonify({
                'success': False,
                'error': f'Subscriber {email} not found'
            }), 404

        return jsonify({
            'success': True,
            'message': f'Subscriber {email} has opted in to tracking'
        }), 200

    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
|
||||
306
backend/services/analytics_service.py
Normal file
306
backend/services/analytics_service.py
Normal file
@@ -0,0 +1,306 @@
|
||||
"""
|
||||
Analytics service for email tracking metrics and subscriber engagement.
|
||||
Calculates open rates, click rates, and subscriber activity status.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Optional
|
||||
from database import (
|
||||
newsletter_sends_collection,
|
||||
link_clicks_collection,
|
||||
subscriber_activity_collection
|
||||
)
|
||||
|
||||
|
||||
def get_open_rate(newsletter_id: str) -> float:
    """
    Calculate the percentage of subscribers who opened a specific newsletter.

    Args:
        newsletter_id: Unique identifier for the newsletter batch

    Returns:
        float: Open rate as a percentage (0-100), rounded to two
        decimals; 0.0 when no sends exist for this newsletter.
    """
    sends = newsletter_sends_collection.count_documents(
        {'newsletter_id': newsletter_id}
    )

    # Avoid dividing by zero for newsletters with no send records.
    if not sends:
        return 0.0

    opens = newsletter_sends_collection.count_documents(
        {'newsletter_id': newsletter_id, 'opened': True}
    )

    return round((opens / sends) * 100, 2)
|
||||
|
||||
|
||||
def get_click_rate(article_url: str) -> float:
    """
    Calculate the percentage of subscribers who clicked a specific article link.

    Args:
        article_url: The original article URL

    Returns:
        float: Click rate as a percentage (0-100), rounded to two
        decimals; 0.0 when no link-tracking records exist for the URL.
    """
    tracked = link_clicks_collection.count_documents(
        {'article_url': article_url}
    )

    # No tracking records at all — rate is defined as zero.
    if not tracked:
        return 0.0

    clicked = link_clicks_collection.count_documents(
        {'article_url': article_url, 'clicked': True}
    )

    return round((clicked / tracked) * 100, 2)
|
||||
|
||||
|
||||
def get_newsletter_metrics(newsletter_id: str) -> Dict:
    """
    Get comprehensive metrics for a specific newsletter.

    Args:
        newsletter_id: Unique identifier for the newsletter batch

    Returns:
        dict: Dictionary containing:
            - newsletter_id: The newsletter ID
            - total_sent: Total number of emails sent
            - total_opened: Number of emails opened
            - open_rate: Percentage of emails opened
            - total_clicks: Total number of link clicks
            - unique_clickers: Number of unique subscribers who clicked
            - click_through_rate: Percentage of recipients who clicked any link
        Rates are rounded to two decimals and are 0.0 when nothing was sent.
    """
    by_newsletter = {'newsletter_id': newsletter_id}

    sends = newsletter_sends_collection.count_documents(by_newsletter)
    opens = newsletter_sends_collection.count_documents(
        {'newsletter_id': newsletter_id, 'opened': True}
    )

    clicks = link_clicks_collection.count_documents(
        {'newsletter_id': newsletter_id, 'clicked': True}
    )

    # Unique clickers = distinct subscriber emails with at least one click
    # in this newsletter; CTR is measured against everyone it was sent to.
    clickers = len(link_clicks_collection.distinct(
        'subscriber_email',
        {'newsletter_id': newsletter_id, 'clicked': True}
    ))

    open_pct = (opens / sends * 100) if sends > 0 else 0.0
    ctr_pct = (clickers / sends * 100) if sends > 0 else 0.0

    return {
        'newsletter_id': newsletter_id,
        'total_sent': sends,
        'total_opened': opens,
        'open_rate': round(open_pct, 2),
        'total_clicks': clicks,
        'unique_clickers': clickers,
        'click_through_rate': round(ctr_pct, 2)
    }
|
||||
|
||||
|
||||
def get_article_performance(article_url: str) -> Dict:
    """
    Get performance metrics for a specific article across all newsletters.

    Args:
        article_url: The original article URL

    Returns:
        dict: Dictionary containing:
            - article_url: The article URL
            - total_sent: Total times this article was sent
            - total_clicks: Total number of clicks
            - click_rate: Percentage of recipients who clicked
            - unique_clickers: Number of unique subscribers who clicked
            - newsletters: List of newsletter IDs that included this article
    """
    by_url = {'article_url': article_url}
    by_url_clicked = {'article_url': article_url, 'clicked': True}

    # One link-tracking record is created per recipient per article, so
    # counting records for the URL gives "times sent".
    times_sent = link_clicks_collection.count_documents(by_url)
    times_clicked = link_clicks_collection.count_documents(by_url_clicked)

    distinct_clickers = len(link_clicks_collection.distinct(
        'subscriber_email', by_url_clicked
    ))

    newsletter_ids = link_clicks_collection.distinct('newsletter_id', by_url)

    rate = (times_clicked / times_sent * 100) if times_sent > 0 else 0.0

    return {
        'article_url': article_url,
        'total_sent': times_sent,
        'total_clicks': times_clicked,
        'click_rate': round(rate, 2),
        'unique_clickers': distinct_clickers,
        'newsletters': newsletter_ids
    }
|
||||
|
||||
|
||||
def get_subscriber_activity_status(email: str) -> str:
    """
    Get the activity status for a specific subscriber.

    Classifies subscribers based on their last email open:
    - 'active': Opened an email in the last 30 days
    - 'inactive': No opens in 30-60 days
    - 'dormant': No opens in 60+ days
    - 'new': No opens yet

    Args:
        email: Subscriber email address

    Returns:
        str: Activity status ('active', 'inactive', 'dormant', or 'new')
    """
    # Find the most recent open for this subscriber
    most_recent_open = newsletter_sends_collection.find_one(
        {'subscriber_email': email, 'opened': True},
        sort=[('last_opened_at', -1)]
    )

    # No opens recorded → 'new', whether or not any newsletters were
    # received. (The previous version ran an extra count_documents query
    # here whose result was ignored: `'new' if has_received else 'new'`.)
    if not most_recent_open:
        return 'new'

    # Defensive: a record flagged opened=True should carry a timestamp,
    # but treat a missing one as "no opens yet" rather than crash.
    last_opened_at = most_recent_open.get('last_opened_at')
    if not last_opened_at:
        return 'new'

    days_since_open = (datetime.utcnow() - last_opened_at).days

    # Classify based on days since last open
    if days_since_open <= 30:
        return 'active'
    elif days_since_open <= 60:
        return 'inactive'
    else:
        return 'dormant'
|
||||
|
||||
|
||||
def update_subscriber_activity_statuses() -> int:
    """
    Batch update activity statuses for all subscribers.

    Refreshes the subscriber_activity collection with the current activity
    status, engagement counters, and last-interaction timestamps for every
    subscriber that appears in the newsletter_sends collection.

    Returns:
        int: Number of subscriber records updated
    """
    def _latest(collection, query, timestamp_field):
        # Timestamp of the most recent matching record, or None.
        doc = collection.find_one(query, sort=[(timestamp_field, -1)])
        return doc.get(timestamp_field) if doc else None

    processed = 0

    # Every subscriber that has ever received a newsletter.
    for email in newsletter_sends_collection.distinct('subscriber_email'):
        opened_query = {'subscriber_email': email, 'opened': True}
        clicked_query = {'subscriber_email': email, 'clicked': True}

        activity_status = get_subscriber_activity_status(email)

        last_opened_at = _latest(
            newsletter_sends_collection, opened_query, 'last_opened_at')
        last_clicked_at = _latest(
            link_clicks_collection, clicked_query, 'clicked_at')

        total_opens = newsletter_sends_collection.count_documents(opened_query)
        total_clicks = link_clicks_collection.count_documents(clicked_query)
        newsletters_received = newsletter_sends_collection.count_documents(
            {'subscriber_email': email})

        # Distinct newsletter_ids, so multiple opens of one issue count once.
        newsletters_opened = len(newsletter_sends_collection.distinct(
            'newsletter_id', opened_query))

        # Upsert keeps the collection current even for first-seen emails.
        subscriber_activity_collection.update_one(
            {'email': email},
            {
                '$set': {
                    'email': email,
                    'status': activity_status,
                    'last_opened_at': last_opened_at,
                    'last_clicked_at': last_clicked_at,
                    'total_opens': total_opens,
                    'total_clicks': total_clicks,
                    'newsletters_received': newsletters_received,
                    'newsletters_opened': newsletters_opened,
                    'updated_at': datetime.utcnow()
                }
            },
            upsert=True
        )

        processed += 1

    return processed
|
||||
215
backend/services/tracking_service.py
Normal file
215
backend/services/tracking_service.py
Normal file
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Email tracking service for Munich News Daily newsletter system.
|
||||
Handles tracking ID generation and tracking record creation.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional
|
||||
from database import newsletter_sends_collection, link_clicks_collection, subscriber_activity_collection, subscribers_collection
|
||||
|
||||
|
||||
def generate_tracking_id() -> str:
    """
    Generate a unique tracking ID using UUID4.

    Returns:
        str: A unique UUID4 string (canonical 36-character hyphenated form)
    """
    fresh_id = uuid.uuid4()
    return str(fresh_id)
|
||||
|
||||
|
||||
def create_newsletter_tracking(
    newsletter_id: str,
    subscriber_email: str,
    article_links: Optional[List[Dict[str, str]]] = None
) -> Dict[str, any]:
    """
    Create tracking records for a newsletter send.

    Inserts one record into newsletter_sends (email-open pixel tracking)
    and one record into link_clicks per article link. Respects the
    subscriber's tracking_enabled opt-out preference: when tracking is
    disabled, nothing is inserted and empty tracking data is returned.

    Args:
        newsletter_id: Unique identifier for the newsletter batch (e.g. date-based)
        subscriber_email: Email address of the recipient
        article_links: Optional list of article dicts with 'url' and 'title' keys

    Returns:
        dict: Tracking information containing:
            - pixel_tracking_id: ID for the tracking pixel (None if opted out)
            - link_tracking_map: Dict mapping original URLs to tracking IDs (empty if opted out)
            - newsletter_id: The newsletter batch ID
            - subscriber_email: The recipient email
            - tracking_enabled: Whether tracking is enabled for this subscriber
    """
    # Unknown subscribers default to tracked; an explicit opt-out wins.
    subscriber_doc = subscribers_collection.find_one({'email': subscriber_email})
    is_tracked = subscriber_doc.get('tracking_enabled', True) if subscriber_doc else True

    if not is_tracked:
        # Opted out: return empty tracking data, insert nothing.
        return {
            'pixel_tracking_id': None,
            'link_tracking_map': {},
            'newsletter_id': newsletter_id,
            'subscriber_email': subscriber_email,
            'tracking_enabled': False
        }

    # Record for the email-open tracking pixel.
    pixel_tracking_id = generate_tracking_id()
    newsletter_sends_collection.insert_one({
        'newsletter_id': newsletter_id,
        'subscriber_email': subscriber_email,
        'tracking_id': pixel_tracking_id,
        'sent_at': datetime.utcnow(),
        'opened': False,
        'first_opened_at': None,
        'last_opened_at': None,
        'open_count': 0,
        'created_at': datetime.utcnow()
    })

    # One link_clicks record per article URL; map original URL -> tracking ID
    # so the sender can rewrite links in the outgoing email.
    link_tracking_map = {}
    for article in (article_links or []):
        target_url = article.get('url')
        if not target_url:
            continue

        link_id = generate_tracking_id()
        link_clicks_collection.insert_one({
            'tracking_id': link_id,
            'newsletter_id': newsletter_id,
            'subscriber_email': subscriber_email,
            'article_url': target_url,
            'article_title': article.get('title', ''),
            'clicked': False,
            'clicked_at': None,
            'user_agent': None,
            'created_at': datetime.utcnow()
        })
        link_tracking_map[target_url] = link_id

    return {
        'pixel_tracking_id': pixel_tracking_id,
        'link_tracking_map': link_tracking_map,
        'newsletter_id': newsletter_id,
        'subscriber_email': subscriber_email,
        'tracking_enabled': True
    }
|
||||
|
||||
|
||||
|
||||
def anonymize_old_tracking_data(retention_days: int = 90) -> Dict[str, int]:
    """
    Anonymize tracking data older than the specified retention period.

    Replaces subscriber email addresses with the literal 'anonymized' in
    old tracking records while keeping the aggregated metrics intact, to
    avoid retaining personal data indefinitely.

    Args:
        retention_days: Number of days to retain personal data (default: 90)

    Returns:
        dict: Count of anonymized records for each collection:
            - newsletter_sends_anonymized: Newsletter send records anonymized
            - link_clicks_anonymized: Link click records anonymized
            - total_anonymized: Total records anonymized
    """
    cutoff = datetime.utcnow() - timedelta(days=retention_days)

    # newsletter_sends are aged by sent_at; skip records already anonymized
    # so anonymized_at is not clobbered on repeat runs.
    sends_result = newsletter_sends_collection.update_many(
        {'sent_at': {'$lt': cutoff}, 'subscriber_email': {'$ne': 'anonymized'}},
        {'$set': {'subscriber_email': 'anonymized', 'anonymized_at': datetime.utcnow()}}
    )

    # link_clicks are aged by created_at, with the same re-anonymize guard.
    clicks_result = link_clicks_collection.update_many(
        {'created_at': {'$lt': cutoff}, 'subscriber_email': {'$ne': 'anonymized'}},
        {'$set': {'subscriber_email': 'anonymized', 'anonymized_at': datetime.utcnow()}}
    )

    sends_count = sends_result.modified_count
    clicks_count = clicks_result.modified_count

    return {
        'newsletter_sends_anonymized': sends_count,
        'link_clicks_anonymized': clicks_count,
        'total_anonymized': sends_count + clicks_count
    }
|
||||
|
||||
|
||||
def delete_subscriber_tracking_data(subscriber_email: str) -> Dict[str, int]:
    """
    Delete all tracking data for a specific subscriber.

    Removes every tracking record associated with the subscriber's email
    address from all tracking collections (GDPR right to be forgotten).

    Args:
        subscriber_email: Email address of the subscriber

    Returns:
        dict: Count of deleted records for each collection:
            - newsletter_sends_deleted: Newsletter send records deleted
            - link_clicks_deleted: Link click records deleted
            - subscriber_activity_deleted: Activity records deleted
            - total_deleted: Total records deleted
    """
    by_subscriber = {'subscriber_email': subscriber_email}

    sends_deleted = newsletter_sends_collection.delete_many(by_subscriber).deleted_count
    clicks_deleted = link_clicks_collection.delete_many(by_subscriber).deleted_count

    # subscriber_activity keys records by 'email' rather than 'subscriber_email'.
    activity_deleted = subscriber_activity_collection.delete_many(
        {'email': subscriber_email}
    ).deleted_count

    return {
        'newsletter_sends_deleted': sends_deleted,
        'link_clicks_deleted': clicks_deleted,
        'subscriber_activity_deleted': activity_deleted,
        'total_deleted': sends_deleted + clicks_deleted + activity_deleted
    }
|
||||
@@ -1,128 +0,0 @@
|
||||
#!/usr/bin/env python
"""
Test RSS feed URL extraction.

Connects to MongoDB, loads the configured RSS feeds, fetches each active
feed, and checks that URL / summary / published-date extraction works on
the first few entries. Exits with status 1 when no feeds are configured.

Run from backend directory with venv activated:
    cd backend
    source venv/bin/activate  # or venv\\Scripts\\activate on Windows
    python test_rss_extraction.py
"""
from pymongo import MongoClient
from config import Config
import feedparser
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date

print("\n" + "="*80)
print("RSS Feed URL Extraction Test")
print("="*80)

# Connect to database
print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]

# Get RSS feeds
print("Fetching RSS feeds from database...")
feeds = list(db['rss_feeds'].find())

# Nothing to test without at least one configured feed — tell the user
# how to add one and bail out with a non-zero exit code.
if not feeds:
    print("\n❌ No RSS feeds in database!")
    print("\nAdd a feed first:")
    print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
    print(" -H 'Content-Type: application/json' \\")
    print(" -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
    exit(1)

print(f"✓ Found {len(feeds)} feed(s)\n")

# Test each feed, tallying URL-extraction successes and failures.
total_success = 0
total_fail = 0

for feed_doc in feeds:
    name = feed_doc.get('name', 'Unknown')
    url = feed_doc.get('url', '')
    active = feed_doc.get('active', True)

    print("\n" + "="*80)
    print(f"Feed: {name}")
    print(f"URL: {url}")
    print(f"Active: {'Yes' if active else 'No'}")
    print("="*80)

    # Inactive feeds are listed but not fetched.
    if not active:
        print("⏭ Skipping (inactive)")
        continue

    try:
        # Parse RSS
        print("\nFetching RSS feed...")
        feed = feedparser.parse(url)

        if not feed.entries:
            print("❌ No entries found in feed")
            continue

        print(f"✓ Found {len(feed.entries)} entries")

        # Only the first 3 entries per feed are exercised — enough to
        # verify the extraction helpers without hammering the source.
        print(f"\nTesting first 3 entries:")
        print("-" * 80)

        for i, entry in enumerate(feed.entries[:3], 1):
            print(f"\n📰 Entry {i}:")

            # Title
            title = entry.get('title', 'No title')
            print(f" Title: {title[:65]}")

            # Test URL extraction; on failure dump the candidate fields
            # to help diagnose which RSS variant this feed uses.
            article_url = extract_article_url(entry)
            if article_url:
                print(f" ✓ URL: {article_url}")
                total_success += 1
            else:
                print(f" ❌ Could not extract URL")
                print(f" Available fields: {list(entry.keys())[:10]}")
                print(f" link: {entry.get('link', 'N/A')}")
                print(f" guid: {entry.get('guid', 'N/A')}")
                print(f" id: {entry.get('id', 'N/A')}")
                total_fail += 1

            # Test summary extraction (missing summary is a warning, not a failure)
            summary = extract_article_summary(entry)
            if summary:
                print(f" ✓ Summary: {summary[:70]}...")
            else:
                print(f" ⚠ No summary")

            # Test published-date extraction (also non-fatal)
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f" ✓ Date: {pub_date}")
            else:
                print(f" ⚠ No date")

    except Exception as e:
        # Network/parse errors for one feed shouldn't abort the others.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total URLs tested: {total_success + total_fail}")
print(f"✓ Successfully extracted: {total_success}")
print(f"❌ Failed to extract: {total_fail}")

if total_fail == 0:
    print("\n🎉 All URLs extracted successfully!")
    print("\nYou can now run the crawler:")
    print(" cd ../news_crawler")
    print(" pip install -r requirements.txt")
    print(" python crawler_service.py 5")
else:
    print(f"\n⚠ {total_fail} URL(s) could not be extracted")
    print("Check the output above for details")

print("="*80 + "\n")
|
||||
Reference in New Issue
Block a user