diff --git a/.gitignore b/.gitignore index e04f045..6fdb506 100644 --- a/.gitignore +++ b/.gitignore @@ -179,8 +179,8 @@ mongodb_data/ ollama_data/ # Spec artifacts (optional - uncomment if you don't want to track specs) -# .kiro/specs/ - +.kiro/specs/ +.vscode # Test outputs test-results/ coverage/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..71415bc --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,79 @@ +# Contributing to Munich News Daily + +Thank you for your interest in contributing! + +## Getting Started + +1. Fork the repository +2. Clone your fork +3. Create a feature branch +4. Make your changes +5. Run tests +6. Submit a pull request + +## Development Setup + +```bash +# Clone repository +git clone +cd munich-news + +# Copy environment file +cp backend/.env.example backend/.env + +# Start development environment +docker-compose up -d + +# View logs +docker-compose logs -f +``` + +## Running Tests + +```bash +# Run all tests +docker-compose exec crawler python -m pytest tests/crawler +docker-compose exec sender python -m pytest tests/sender +docker-compose exec backend python -m pytest tests/backend + +# Run specific test +docker-compose exec crawler python tests/crawler/test_crawler.py +``` + +## Code Style + +- Follow PEP 8 for Python code +- Use meaningful variable names +- Add docstrings to functions +- Keep functions small and focused +- Write tests for new features + +## Commit Messages + +- Use clear, descriptive commit messages +- Start with a verb (Add, Fix, Update, etc.) +- Keep first line under 50 characters +- Add details in the body if needed + +Example: +``` +Add RSS feed validation + +- Validate URL format +- Check feed accessibility +- Add error handling +``` + +## Pull Request Process + +1. Update documentation if needed +2. Add tests for new features +3. Ensure all tests pass +4. Update CHANGELOG.md +5. Request review from maintainers + +## Questions? + +Open an issue or reach out to the maintainers. + +Thank you for contributing! πŸŽ‰ diff --git a/FINAL_STRUCTURE.md b/FINAL_STRUCTURE.md new file mode 100644 index 0000000..c35f4a1 --- /dev/null +++ b/FINAL_STRUCTURE.md @@ -0,0 +1,243 @@ +# βœ… Final Clean Project Structure + +## πŸŽ‰ Cleanup Complete! + +Your Munich News Daily project is now clean, organized, and professional. + +## πŸ“ Current Structure + +``` +munich-news/ +β”œβ”€β”€ πŸ“„ Root Files (5 essential files) +β”‚ β”œβ”€β”€ README.md # Main documentation +β”‚ β”œβ”€β”€ QUICKSTART.md # 5-minute setup guide +β”‚ β”œβ”€β”€ CONTRIBUTING.md # Contribution guidelines +β”‚ β”œβ”€β”€ PROJECT_STRUCTURE.md # Project layout +β”‚ └── docker-compose.yml # Single unified compose file +β”‚ +β”œβ”€β”€ πŸ“š docs/ (12 documentation files) +β”‚ β”œβ”€β”€ API.md # API reference +β”‚ β”œβ”€β”€ ARCHITECTURE.md # System architecture +β”‚ β”œβ”€β”€ BACKEND_STRUCTURE.md # Backend organization +β”‚ β”œβ”€β”€ CRAWLER_HOW_IT_WORKS.md # Crawler internals +β”‚ β”œβ”€β”€ DATABASE_SCHEMA.md # Database structure +β”‚ β”œβ”€β”€ DEPLOYMENT.md # Deployment guide +β”‚ β”œβ”€β”€ EXTRACTION_STRATEGIES.md # Content extraction +β”‚ └── RSS_URL_EXTRACTION.md # RSS parsing +β”‚ +β”œβ”€β”€ πŸ§ͺ tests/ (10 test files) +β”‚ β”œβ”€β”€ backend/ # Backend tests +β”‚ β”œβ”€β”€ crawler/ # Crawler tests +β”‚ └── sender/ # Sender tests +β”‚ +β”œβ”€β”€ πŸ”§ backend/ # Backend API +β”‚ β”œβ”€β”€ routes/ +β”‚ β”œβ”€β”€ services/ +β”‚ β”œβ”€β”€ .env.example +β”‚ └── app.py +β”‚ +β”œβ”€β”€ πŸ“° news_crawler/ # Crawler service +β”‚ β”œβ”€β”€ Dockerfile +β”‚ β”œβ”€β”€ crawler_service.py +β”‚ β”œβ”€β”€ scheduled_crawler.py +β”‚ └── requirements.txt +β”‚ +β”œβ”€β”€ πŸ“§ news_sender/ # Sender service +β”‚ β”œβ”€β”€ Dockerfile +β”‚ β”œβ”€β”€ sender_service.py +β”‚ β”œβ”€β”€ scheduled_sender.py +β”‚ └── requirements.txt +β”‚ +└── 🎨 frontend/ # React dashboard (optional) +``` + +## ✨ What Was Cleaned + +### Removed Files (20+) +- ❌ All redundant markdown files from root +- ❌ All redundant markdown files from subdirectories +- ❌ Multiple docker-compose files (kept only 1) +- ❌ Multiple startup scripts (use docker-compose now) +- ❌ Test scripts and helpers + +### Organized Files +- βœ… All tests β†’ `tests/` directory +- βœ… All documentation β†’ `docs/` directory +- βœ… All docker configs β†’ single `docker-compose.yml` + +## πŸš€ How to Use + +### Start Everything +```bash +docker-compose up -d +``` + +That's it! One command starts: +- MongoDB database +- News crawler (6 AM schedule) +- Newsletter sender (7 AM schedule) + +### View Logs +```bash +docker-compose logs -f +``` + +### Stop Everything +```bash +docker-compose down +``` + +## πŸ“Š Before vs After + +### Before +``` +Root: 20+ files (messy) +β”œβ”€β”€ AUTOMATION_README.md +β”œβ”€β”€ AUTOMATION_SETUP_COMPLETE.md +β”œβ”€β”€ CRAWLER_QUICKSTART.md +β”œβ”€β”€ CRAWLER_SETUP_SUMMARY.md +β”œβ”€β”€ docker-compose.yml +β”œβ”€β”€ docker-compose.prod.yml +β”œβ”€β”€ README_CRAWLER.md +β”œβ”€β”€ start-automation.sh +β”œβ”€β”€ start-crawler.sh +β”œβ”€β”€ start-sender.sh +β”œβ”€β”€ test-crawler-setup.sh +└── ... many more + +Subdirectories: Scattered docs +β”œβ”€β”€ backend/TRACKING_README.md +β”œβ”€β”€ backend/TRACKING_CONFIGURATION.md +β”œβ”€β”€ news_crawler/README.md +β”œβ”€β”€ news_crawler/QUICKSTART.md +β”œβ”€β”€ news_crawler/docker-compose.yml +β”œβ”€β”€ news_sender/README.md +└── ... more scattered files + +Tests: Scattered everywhere +``` + +### After +``` +Root: 5 essential files (clean) +β”œβ”€β”€ README.md +β”œβ”€β”€ QUICKSTART.md +β”œβ”€β”€ CONTRIBUTING.md +β”œβ”€β”€ PROJECT_STRUCTURE.md +└── docker-compose.yml + +docs/: All documentation (12 files) +β”œβ”€β”€ API.md +β”œβ”€β”€ ARCHITECTURE.md +β”œβ”€β”€ DEPLOYMENT.md +└── ... organized docs + +tests/: All tests (10 files) +β”œβ”€β”€ backend/ +β”œβ”€β”€ crawler/ +└── sender/ + +Subdirectories: Clean, no scattered docs +``` + +## 🎯 Benefits + +### 1. Easy to Navigate +- Clear directory structure +- Everything in its place +- No clutter + +### 2. Simple to Use +- One command: `docker-compose up -d` +- One place for docs: `docs/` +- One place for tests: `tests/` + +### 3. Professional +- Industry-standard layout +- Clean and organized +- Ready for collaboration + +### 4. Maintainable +- Easy to find files +- Clear separation of concerns +- Scalable structure + +## πŸ“ Quick Reference + +### Documentation +```bash +# Main docs +cat README.md +cat QUICKSTART.md + +# Technical docs +ls docs/ +``` + +### Running +```bash +# Start +docker-compose up -d + +# Logs +docker-compose logs -f + +# Stop +docker-compose down +``` + +### Testing +```bash +# Run tests +docker-compose exec crawler python tests/crawler/test_crawler.py +docker-compose exec sender python tests/sender/test_tracking_integration.py +``` + +### Development +```bash +# Edit code in respective directories +# Rebuild +docker-compose up -d --build +``` + +## βœ… Verification + +Run these commands to verify the cleanup: + +```bash +# Check root directory (should be clean) +ls -1 *.md + +# Check docs directory +ls -1 docs/ + +# Check tests directory +ls -1 tests/ + +# Check for stray docker-compose files (should be only 1) +find . -name "docker-compose*.yml" ! -path "*/node_modules/*" ! -path "*/env/*" + +# Check for stray markdown in subdirectories (should be none) +find backend news_crawler news_sender -name "*.md" ! -path "*/env/*" +``` + +## 🎊 Result + +A clean, professional, production-ready project structure! + +**One command to start everything:** +```bash +docker-compose up -d +``` + +**One place for all documentation:** +```bash +ls docs/ +``` + +**One place for all tests:** +```bash +ls tests/ +``` + +Simple. Clean. Professional. ✨ diff --git a/PROJECT_STRUCTURE.md b/PROJECT_STRUCTURE.md new file mode 100644 index 0000000..1d68902 --- /dev/null +++ b/PROJECT_STRUCTURE.md @@ -0,0 +1,126 @@ +# Project Structure + +``` +munich-news/ +β”œβ”€β”€ backend/ # Backend API and services +β”‚ β”œβ”€β”€ routes/ # API routes +β”‚ β”œβ”€β”€ services/ # Business logic +β”‚ β”œβ”€β”€ .env.example # Environment template +β”‚ β”œβ”€β”€ app.py # Flask application +β”‚ β”œβ”€β”€ config.py # Configuration +β”‚ └── database.py # MongoDB connection +β”‚ +β”œβ”€β”€ news_crawler/ # News crawler service +β”‚ β”œβ”€β”€ Dockerfile # Crawler container +β”‚ β”œβ”€β”€ crawler_service.py # Main crawler logic +β”‚ β”œβ”€β”€ scheduled_crawler.py # Scheduler (6 AM) +β”‚ β”œβ”€β”€ rss_utils.py # RSS parsing utilities +β”‚ └── requirements.txt # Python dependencies +β”‚ +β”œβ”€β”€ news_sender/ # Newsletter sender service +β”‚ β”œβ”€β”€ Dockerfile # Sender container +β”‚ β”œβ”€β”€ sender_service.py # Main sender logic +β”‚ β”œβ”€β”€ scheduled_sender.py # Scheduler (7 AM) +β”‚ β”œβ”€β”€ tracking_integration.py # Email tracking +β”‚ β”œβ”€β”€ newsletter_template.html # Email template +β”‚ └── requirements.txt # Python dependencies +β”‚ +β”œβ”€β”€ frontend/ # React dashboard (optional) +β”‚ β”œβ”€β”€ src/ # React components +β”‚ β”œβ”€β”€ public/ # Static files +β”‚ └── package.json # Node dependencies +β”‚ +β”œβ”€β”€ tests/ # All test files +β”‚ β”œβ”€β”€ crawler/ # Crawler tests +β”‚ β”œβ”€β”€ sender/ # Sender tests +β”‚ └── backend/ # Backend tests +β”‚ +β”œβ”€β”€ docs/ # Documentation +β”‚ β”œβ”€β”€ ARCHITECTURE.md # System architecture +β”‚ β”œβ”€β”€ DEPLOYMENT.md # Deployment guide +β”‚ β”œβ”€β”€ API.md # API reference +β”‚ β”œβ”€β”€ DATABASE_SCHEMA.md # Database structure +β”‚ β”œβ”€β”€ BACKEND_STRUCTURE.md # Backend organization +β”‚ β”œβ”€β”€ CRAWLER_HOW_IT_WORKS.md # Crawler internals +β”‚ β”œβ”€β”€ EXTRACTION_STRATEGIES.md # Content extraction +β”‚ └── RSS_URL_EXTRACTION.md # RSS parsing +β”‚ +β”œβ”€β”€ .kiro/ # Kiro IDE configuration +β”‚ └── specs/ # Feature specifications +β”‚ +β”œβ”€β”€ docker-compose.yml # Docker orchestration +β”œβ”€β”€ README.md # Main documentation +β”œβ”€β”€ QUICKSTART.md # 5-minute setup guide +β”œβ”€β”€ CONTRIBUTING.md # Contribution guidelines +β”œβ”€β”€ .gitignore # Git ignore rules +└── .dockerignore # Docker ignore rules +``` + +## Key Files + +### Configuration +- `backend/.env` - Environment variables (create from .env.example) +- `docker-compose.yml` - Docker services configuration + +### Entry Points +- `news_crawler/scheduled_crawler.py` - Crawler scheduler (6 AM) +- `news_sender/scheduled_sender.py` - Sender scheduler (7 AM) +- `backend/app.py` - Backend API server + +### Documentation +- `README.md` - Main project documentation +- `QUICKSTART.md` - Quick setup guide +- `docs/` - Detailed documentation + +### Tests +- `tests/crawler/` - Crawler test files +- `tests/sender/` - Sender test files +- `tests/backend/` - Backend test files + +## Docker Services + +When you run `docker-compose up -d`, these services start: + +1. **mongodb** - Database (port 27017) +2. **crawler** - News crawler (scheduled for 6 AM) +3. **sender** - Newsletter sender (scheduled for 7 AM) +4. **backend** - API server (port 5001, optional) + +## Data Flow + +``` +RSS Feeds β†’ Crawler β†’ MongoDB β†’ Sender β†’ Subscribers + ↓ + Backend API + ↓ + Analytics +``` + +## Development Workflow + +1. Edit code in respective directories +2. Rebuild containers: `docker-compose up -d --build` +3. View logs: `docker-compose logs -f` +4. Run tests: `docker-compose exec python tests/...` + +## Adding New Features + +1. Create spec in `.kiro/specs/` +2. Implement in appropriate directory +3. Add tests in `tests/` +4. Update documentation in `docs/` +5. Submit pull request + +## Clean Architecture + +- **Separation of Concerns**: Each service has its own directory +- **Centralized Configuration**: All config in `backend/.env` +- **Organized Tests**: All tests in `tests/` directory +- **Clear Documentation**: All docs in `docs/` directory +- **Single Entry Point**: One `docker-compose.yml` file + +This structure makes the project: +- βœ… Easy to navigate +- βœ… Simple to deploy +- βœ… Clear to understand +- βœ… Maintainable long-term diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..b6d7fb4 --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,131 @@ +# Quick Start Guide + +Get Munich News Daily running in 5 minutes! + +## Prerequisites + +- Docker & Docker Compose installed +- (Optional) Ollama for AI summarization + +## Setup + +### 1. Configure Environment + +```bash +# Copy example environment file +cp backend/.env.example backend/.env + +# Edit with your settings (required: email configuration) +nano backend/.env +``` + +**Minimum required settings:** +```env +SMTP_SERVER=smtp.gmail.com +SMTP_PORT=587 +EMAIL_USER=your-email@gmail.com +EMAIL_PASSWORD=your-app-password +``` + +### 2. Start System + +```bash +# Start all services +docker-compose up -d + +# View logs +docker-compose logs -f +``` + +### 3. Add RSS Feeds + +```bash +mongosh munich_news + +db.rss_feeds.insertMany([ + { + name: "SΓΌddeutsche Zeitung MΓΌnchen", + url: "https://www.sueddeutsche.de/muenchen/rss", + active: true + }, + { + name: "Merkur MΓΌnchen", + url: "https://www.merkur.de/lokales/muenchen/rss/feed.rss", + active: true + } +]) +``` + +### 4. Add Subscribers + +```bash +mongosh munich_news + +db.subscribers.insertOne({ + email: "your-email@example.com", + active: true, + tracking_enabled: true, + subscribed_at: new Date() +}) +``` + +### 5. Test It + +```bash +# Test crawler +docker-compose exec crawler python crawler_service.py 5 + +# Test newsletter +docker-compose exec sender python sender_service.py test your-email@example.com +``` + +## What Happens Next? + +The system will automatically: +- **Backend API**: Runs continuously at http://localhost:5001 for tracking and analytics +- **6:00 AM Berlin time**: Crawl news articles +- **7:00 AM Berlin time**: Send newsletter to subscribers + +## View Results + +```bash +# Check articles +mongosh munich_news +db.articles.find().sort({ crawled_at: -1 }).limit(5) + +# Check logs +docker-compose logs -f crawler +docker-compose logs -f sender +``` + +## Common Commands + +```bash +# Stop system +docker-compose down + +# Restart system +docker-compose restart + +# View logs +docker-compose logs -f + +# Rebuild after changes +docker-compose up -d --build +``` + +## Need Help? + +- Check [README.md](README.md) for full documentation +- See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for detailed setup +- View [docs/API.md](docs/API.md) for API reference + +## Next Steps + +1. Configure Ollama for AI summaries (optional) +2. Set up tracking API (optional) +3. Customize newsletter template +4. Add more RSS feeds +5. Monitor engagement metrics + +That's it! Your automated news system is running. πŸŽ‰ diff --git a/README.md b/README.md index a7bf5bd..5d7e3d2 100644 --- a/README.md +++ b/README.md @@ -1,327 +1,390 @@ -# Munich News Daily πŸ“° +# Munich News Daily - Automated Newsletter System -A TLDR/Morning Brew-style news email platform specifically for Munich. Get the latest Munich news delivered to your inbox every morning. +A fully automated news aggregation and newsletter system that crawls Munich news sources, generates AI summaries, and sends daily newsletters with engagement tracking. -## Features +## πŸš€ Quick Start -- πŸ“§ Email newsletter subscription system -- πŸ“° Aggregated news from multiple Munich news sources -- 🎨 Beautiful, modern web interface -- πŸ“Š Subscription statistics -- πŸ”„ Real-time news updates +```bash +# 1. Configure environment +cp backend/.env.example backend/.env +# Edit backend/.env with your email settings -## Tech Stack +# 2. Start everything +docker-compose up -d -- **Backend**: Python (Flask) - Modular architecture with blueprints -- **Frontend**: Node.js (Express + Vanilla JavaScript) -- **Database**: MongoDB -- **News Crawler**: Standalone Python microservice -- **News Sources**: RSS feeds from major Munich news outlets +# 3. View logs +docker-compose logs -f +``` -## Setup Instructions +That's it! The system will automatically: +- **Backend API**: Runs continuously for tracking and analytics (http://localhost:5001) +- **6:00 AM Berlin time**: Crawl news articles and generate summaries +- **7:00 AM Berlin time**: Send newsletter to all subscribers + +πŸ“– **New to the project?** See [QUICKSTART.md](QUICKSTART.md) for a detailed 5-minute setup guide. + +## πŸ“‹ System Overview + +``` +6:00 AM β†’ News Crawler + ↓ + Fetches articles from RSS feeds + Extracts full content + Generates AI summaries + Saves to MongoDB + ↓ +7:00 AM β†’ Newsletter Sender + ↓ + Waits for crawler to finish + Fetches today's articles + Generates newsletter with tracking + Sends to all subscribers + ↓ + βœ… Done! Repeat tomorrow +``` + +## πŸ—οΈ Architecture + +### Components + +- **MongoDB**: Data storage (articles, subscribers, tracking) +- **Backend API**: Flask API for tracking and analytics (port 5001) +- **News Crawler**: Automated RSS feed crawler with AI summarization +- **Newsletter Sender**: Automated email sender with tracking +- **Frontend**: React dashboard (optional) + +### Technology Stack + +- Python 3.11 +- MongoDB 7.0 +- Docker & Docker Compose +- Flask (API) +- Ollama (AI summarization) +- Schedule (automation) +- Jinja2 (email templates) + +## πŸ“¦ Installation ### Prerequisites -- Python 3.8+ -- Node.js 14+ -- npm or yarn -- Docker and Docker Compose (recommended for MongoDB) OR MongoDB (local installation or MongoDB Atlas account) +- Docker & Docker Compose +- (Optional) Ollama for AI summarization -### Backend Setup +### Setup -1. Navigate to the backend directory: -```bash -cd backend -``` - -2. Create a virtual environment (recommended): -```bash -python3 -m venv venv -source venv/bin/activate # On Windows: venv\Scripts\activate -``` - -3. Install dependencies: -```bash -pip install -r requirements.txt -``` - -4. Set up MongoDB using Docker Compose (recommended): +1. **Clone the repository** + ```bash + git clone + cd munich-news + ``` + +2. **Configure environment** + ```bash + cp backend/.env.example backend/.env + # Edit backend/.env with your settings + ``` + +3. **Start the system** ```bash - # From the project root directory docker-compose up -d ``` - - This will start MongoDB in a Docker container. The database will be available at `mongodb://localhost:27017/` - - **Useful Docker commands:** - ```bash - # Start MongoDB - docker-compose up -d - - # Stop MongoDB - docker-compose down - - # View MongoDB logs - docker-compose logs -f mongodb - - # Restart MongoDB - docker-compose restart mongodb - - # Remove MongoDB and all data (WARNING: deletes all data) - docker-compose down -v - ``` - - **Alternative options:** - - **Local MongoDB**: Install MongoDB locally and make sure it's running - - **MongoDB Atlas** (Cloud): Create a free account at [mongodb.com/cloud/atlas](https://www.mongodb.com/cloud/atlas) and get your connection string -5. Create a `.env` file in the backend directory: - ```bash - # Copy the template file - cp env.template .env - ``` - - Then edit `.env` with your configuration: - ```env - # MongoDB connection (default: mongodb://localhost:27017/) - # For Docker Compose (no authentication): - MONGODB_URI=mongodb://localhost:27017/ - # For Docker Compose with authentication (if you modify docker-compose.yml): - # MONGODB_URI=mongodb://admin:password@localhost:27017/ - # Or for MongoDB Atlas: - # MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/ - - # Email configuration (optional for testing) - SMTP_SERVER=smtp.gmail.com - SMTP_PORT=587 - EMAIL_USER=your-email@gmail.com - EMAIL_PASSWORD=your-app-password - - # Ollama Configuration (for AI-powered features) - # Remote Ollama server URL - OLLAMA_BASE_URL=http://your-remote-server-ip:11434 - # Optional: API key if your Ollama server requires authentication - # OLLAMA_API_KEY=your-api-key-here - # Model name to use (e.g., llama2, mistral, codellama, llama3) - OLLAMA_MODEL=llama2 - # Enable/disable Ollama features (true/false) - OLLAMA_ENABLED=false - ``` +## βš™οΈ Configuration -**Notes:** -- For Gmail, you'll need to use an [App Password](https://support.google.com/accounts/answer/185833) instead of your regular password. -- For Ollama, replace `your-remote-server-ip` with your actual server IP or domain. Set `OLLAMA_ENABLED=true` to enable AI features. +Edit `backend/.env`: -6. Run the backend server: -```bash -python app.py +```env +# MongoDB +MONGODB_URI=mongodb://localhost:27017/ + +# Email (SMTP) +SMTP_SERVER=smtp.gmail.com +SMTP_PORT=587 +EMAIL_USER=your-email@gmail.com +EMAIL_PASSWORD=your-app-password + +# Newsletter +NEWSLETTER_MAX_ARTICLES=10 +NEWSLETTER_HOURS_LOOKBACK=24 + +# Tracking +TRACKING_ENABLED=true +TRACKING_API_URL=http://localhost:5001 +TRACKING_DATA_RETENTION_DAYS=90 + +# Ollama (AI Summarization) +OLLAMA_ENABLED=true +OLLAMA_BASE_URL=http://127.0.0.1:11434 +OLLAMA_MODEL=phi3:latest ``` -The backend will run on `http://localhost:5001` (port 5001 to avoid conflict with AirPlay on macOS) +## πŸ“Š Usage -### Frontend Setup - -1. Navigate to the frontend directory: -```bash -cd frontend -``` - -2. Install dependencies: -```bash -npm install -``` - -3. Run the frontend server: -```bash -npm start -``` - -The frontend will run on `http://localhost:3000` - -## Usage - -1. Open your browser and go to `http://localhost:3000` -2. Enter your email address to subscribe to the newsletter -3. View the latest Munich news on the homepage -4. The backend will aggregate news from multiple Munich news sources - -## Sending Newsletters - -To send newsletters to all subscribers, you can add a scheduled task or manually trigger the `send_newsletter()` function in `app.py`. For production, consider using: - -- **Cron jobs** (Linux/Mac) -- **Task Scheduler** (Windows) -- **Celery** with Redis/RabbitMQ for more advanced scheduling -- **Cloud functions** (AWS Lambda, Google Cloud Functions) - -Example cron job to send daily at 8 AM: -``` -0 8 * * * cd /path/to/munich-news/backend && python -c "from app import send_newsletter; send_newsletter()" -``` - -## Project Structure - -``` -munich-news/ -β”œβ”€β”€ backend/ # Main API server -β”‚ β”œβ”€β”€ app.py # Flask application entry point -β”‚ β”œβ”€β”€ config.py # Configuration management -β”‚ β”œβ”€β”€ database.py # Database connection -β”‚ β”œβ”€β”€ routes/ # API endpoints (blueprints) -β”‚ β”œβ”€β”€ services/ # Business logic -β”‚ β”œβ”€β”€ templates/ # Email templates -β”‚ └── requirements.txt # Python dependencies -β”œβ”€β”€ news_crawler/ # Crawler microservice -β”‚ β”œβ”€β”€ crawler_service.py # Standalone crawler -β”‚ β”œβ”€β”€ ollama_client.py # AI summarization client -β”‚ β”œβ”€β”€ requirements.txt # Crawler dependencies -β”‚ └── README.md # Crawler documentation -β”œβ”€β”€ news_sender/ # Newsletter sender microservice -β”‚ β”œβ”€β”€ sender_service.py # Standalone email sender -β”‚ β”œβ”€β”€ newsletter_template.html # Email template -β”‚ β”œβ”€β”€ requirements.txt # Sender dependencies -β”‚ └── README.md # Sender documentation -β”œβ”€β”€ frontend/ # Web interface -β”‚ β”œβ”€β”€ server.js # Express server -β”‚ β”œβ”€β”€ package.json # Node.js dependencies -β”‚ └── public/ -β”‚ β”œβ”€β”€ index.html # Main page -β”‚ β”œβ”€β”€ styles.css # Styling -β”‚ └── app.js # Frontend JavaScript -β”œβ”€β”€ docker-compose.yml # Docker Compose for MongoDB (development) -β”œβ”€β”€ docker-compose.prod.yml # Docker Compose with authentication (production) -└── README.md -``` - -## API Endpoints - -### `POST /api/subscribe` -Subscribe to the newsletter -- Body: `{ "email": "user@example.com" }` - -### `POST /api/unsubscribe` -Unsubscribe from the newsletter -- Body: `{ "email": "user@example.com" }` - -### `GET /api/news` -Get latest Munich news articles - -### `GET /api/stats` -Get subscription statistics -- Returns: `{ "subscribers": number, "articles": number, "crawled_articles": number }` - -### `GET /api/news/` -Get full article content by URL -- Returns: Full article with content, author, word count, etc. - -### `GET /api/ollama/ping` -Test connection to Ollama server -- Returns: Connection status and Ollama configuration -- Response examples: - - Success: `{ "status": "success", "message": "...", "response": "...", "ollama_config": {...} }` - - Disabled: `{ "status": "disabled", "message": "...", "ollama_config": {...} }` - - Error: `{ "status": "error", "message": "...", "error_details": "...", "troubleshooting": {...}, "ollama_config": {...} }` - -### `GET /api/ollama/models` -List available models on Ollama server -- Returns: List of available models and current configuration -- Response: `{ "status": "success", "models": [...], "current_model": "...", "ollama_config": {...} }` - -### `GET /api/rss-feeds` -Get all RSS feeds -- Returns: `{ "feeds": [...] }` - -### `POST /api/rss-feeds` -Add a new RSS feed -- Body: `{ "name": "Feed Name", "url": "https://example.com/rss" }` -- Returns: `{ "message": "...", "id": "..." }` - -### `DELETE /api/rss-feeds/` -Remove an RSS feed -- Returns: `{ "message": "..." }` - -### `PATCH /api/rss-feeds//toggle` -Toggle RSS feed active status -- Returns: `{ "message": "...", "active": boolean }` - -## Database Schema - -### Articles Collection -```javascript -{ - _id: ObjectId, - title: String, - link: String (unique), - summary: String, - source: String, - published_at: String, - created_at: DateTime -} -``` - -### Subscribers Collection -```javascript -{ - _id: ObjectId, - email: String (unique, lowercase), - subscribed_at: DateTime, - status: String ('active' | 'inactive') -} -``` - -**Indexes:** -- `articles.link` - Unique index to prevent duplicate articles -- `articles.created_at` - For efficient sorting -- `subscribers.email` - Unique index for email lookups -- `subscribers.subscribed_at` - For analytics - -## News Crawler Microservice - -The project includes a standalone crawler microservice that fetches full article content from RSS feeds. - -### Running the Crawler +### View Logs ```bash -cd news_crawler +# All services +docker-compose logs -f -# Install dependencies -pip install -r requirements.txt - -# Run crawler -python crawler_service.py 10 +# Specific service +docker-compose logs -f crawler +docker-compose logs -f sender +docker-compose logs -f mongodb ``` -See `news_crawler/README.md` for detailed documentation. - -### What It Does - -- Crawls full article content from RSS feed links -- Extracts text, word count, and metadata -- Stores in MongoDB for AI processing -- Skips already-crawled articles -- Rate-limited (1 second between requests) - -## Customization - -### Adding News Sources - -Use the API to add RSS feeds dynamically: +### Manual Operations ```bash -curl -X POST http://localhost:5001/api/rss-feeds \ - -H "Content-Type: application/json" \ - -d '{"name": "Your Source Name", "url": "https://example.com/rss"}' +# Run crawler manually +docker-compose exec crawler python crawler_service.py 10 + +# Send test newsletter +docker-compose exec sender python sender_service.py test your-email@example.com + +# Preview newsletter +docker-compose exec sender python sender_service.py preview ``` -### Styling +### Database Access -Modify `frontend/public/styles.css` to customize the appearance. +```bash +# Connect to MongoDB +docker-compose exec mongodb mongosh munich_news -## License +# View articles +db.articles.find().sort({ crawled_at: -1 }).limit(5).pretty() -MIT +# View subscribers +db.subscribers.find({ active: true }).pretty() -## Contributing +# View tracking data +db.newsletter_sends.find().sort({ created_at: -1 }).limit(10).pretty() +``` -Feel free to submit issues and enhancement requests! +## πŸ”§ Management +### Add RSS Feeds + +```bash +mongosh munich_news + +db.rss_feeds.insertOne({ + name: "Source Name", + url: "https://example.com/rss", + active: true +}) +``` + +### Add Subscribers + +```bash +mongosh munich_news + +db.subscribers.insertOne({ + email: "user@example.com", + active: true, + tracking_enabled: true, + subscribed_at: new Date() +}) +``` + +### View Analytics + +```bash +# Newsletter metrics +curl http://localhost:5001/api/analytics/newsletter/2024-01-15 + +# Article performance +curl http://localhost:5001/api/analytics/article/https://example.com/article + +# Subscriber activity +curl http://localhost:5001/api/analytics/subscriber/user@example.com +``` + +## ⏰ Schedule Configuration + +### Change Crawler Time (default: 6:00 AM) + +Edit `news_crawler/scheduled_crawler.py`: +```python +schedule.every().day.at("06:00").do(run_crawler) # Change time +``` + +### Change Sender Time (default: 7:00 AM) + +Edit `news_sender/scheduled_sender.py`: +```python +schedule.every().day.at("07:00").do(run_sender) # Change time +``` + +After changes: +```bash +docker-compose up -d --build +``` + +## πŸ“ˆ Monitoring + +### Container Status + +```bash +docker-compose ps +``` + +### Check Next Scheduled Runs + +```bash +# Crawler +docker-compose logs crawler | grep "Next scheduled run" + +# Sender +docker-compose logs sender | grep "Next scheduled run" +``` + +### Engagement Metrics + +```bash +mongosh munich_news + +// Open rate +var sent = db.newsletter_sends.countDocuments({ newsletter_id: "2024-01-15" }) +var opened = db.newsletter_sends.countDocuments({ newsletter_id: "2024-01-15", opened: true }) +print("Open Rate: " + ((opened / sent) * 100).toFixed(2) + "%") + +// Click rate +var clicks = db.link_clicks.countDocuments({ newsletter_id: "2024-01-15" }) +print("Click Rate: " + ((clicks / sent) * 100).toFixed(2) + "%") +``` + +## πŸ› Troubleshooting + +### Crawler Not Finding Articles + +```bash +# Check RSS feeds +mongosh munich_news --eval "db.rss_feeds.find({ active: true })" + +# Test manually +docker-compose exec crawler python crawler_service.py 5 +``` + +### Newsletter Not Sending + +```bash +# Check email config +docker-compose exec sender python -c "from sender_service import Config; print(Config.SMTP_SERVER)" + +# Test email +docker-compose exec sender python sender_service.py test your-email@example.com +``` + +### Containers Not Starting + +```bash +# Check logs +docker-compose logs + +# Rebuild +docker-compose up -d --build + +# Reset everything +docker-compose down -v +docker-compose up -d +``` + +## πŸ” Privacy & Compliance + +### GDPR Features + +- **Data Retention**: Automatic anonymization after 90 days +- **Opt-Out**: Subscribers can disable tracking +- **Data Deletion**: Full data removal on request +- **Transparency**: Privacy notice in all emails + +### Privacy Endpoints + +```bash +# Delete subscriber data +curl -X DELETE http://localhost:5001/api/tracking/subscriber/user@example.com + +# Anonymize old data +curl -X POST http://localhost:5001/api/tracking/anonymize + +# Opt out of tracking +curl -X POST http://localhost:5001/api/tracking/subscriber/user@example.com/opt-out +``` + +## πŸ“š Documentation + +### Getting Started +- **[QUICKSTART.md](QUICKSTART.md)** - 5-minute setup guide +- **[PROJECT_STRUCTURE.md](PROJECT_STRUCTURE.md)** - Project layout +- **[CONTRIBUTING.md](CONTRIBUTING.md)** - Contribution guidelines + +### Technical Documentation +- **[docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)** - System architecture +- **[docs/DEPLOYMENT.md](docs/DEPLOYMENT.md)** - Deployment guide +- **[docs/API.md](docs/API.md)** - API reference +- **[docs/DATABASE_SCHEMA.md](docs/DATABASE_SCHEMA.md)** - Database structure +- **[docs/BACKEND_STRUCTURE.md](docs/BACKEND_STRUCTURE.md)** - Backend organization + +### Component Documentation +- **[docs/CRAWLER_HOW_IT_WORKS.md](docs/CRAWLER_HOW_IT_WORKS.md)** - Crawler internals +- **[docs/EXTRACTION_STRATEGIES.md](docs/EXTRACTION_STRATEGIES.md)** - Content extraction +- **[docs/RSS_URL_EXTRACTION.md](docs/RSS_URL_EXTRACTION.md)** - RSS parsing + +## πŸ§ͺ Testing + +All test files are organized in the `tests/` directory: + +```bash +# Run crawler tests +docker-compose exec crawler python tests/crawler/test_crawler.py + +# Run sender tests +docker-compose exec sender python tests/sender/test_tracking_integration.py + +# Run backend tests +docker-compose exec backend python tests/backend/test_tracking.py +``` + +## πŸš€ Production Deployment + +### Environment Setup + +1. Update `backend/.env` with production values +2. Set strong MongoDB password +3. Use HTTPS for tracking URLs +4. Configure proper SMTP server + +### Security + +```bash +# Use production compose file +docker-compose -f docker-compose.prod.yml up -d + +# Set MongoDB password +export MONGO_PASSWORD=your-secure-password +``` + +### Monitoring + +- Set up log rotation +- Configure health checks +- Set up alerts for failures +- Monitor database size + +## πŸ“ License + +[Your License Here] + +## 🀝 Contributing + +Contributions welcome! Please read CONTRIBUTING.md first. + +## πŸ“§ Support + +For issues or questions, please open a GitHub issue. + +--- + +**Built with ❀️ for Munich News Daily** diff --git a/TEST_INSTRUCTIONS.md b/TEST_INSTRUCTIONS.md deleted file mode 100644 index 5e19df5..0000000 --- a/TEST_INSTRUCTIONS.md +++ /dev/null @@ -1,132 +0,0 @@ -# Testing RSS Feed URL Extraction - -## Quick Test (Recommended) - -Run this from the project root with backend virtual environment activated: - -```bash -# 1. Activate backend virtual environment -cd backend -source venv/bin/activate # On Windows: venv\Scripts\activate - -# 2. Go back to project root -cd .. - -# 3. Run the test -python test_feeds_quick.py -``` - -This will: -- βœ“ Check what RSS feeds are in your database -- βœ“ Fetch each feed -- βœ“ Test URL extraction on first 3 articles -- βœ“ Show what fields are available -- βœ“ Verify summary and date extraction - -## Expected Output - -``` -================================================================================ -RSS Feed Test - Checking Database Feeds -================================================================================ - -βœ“ Found 3 feed(s) in database - -================================================================================ -Feed: SΓΌddeutsche Zeitung MΓΌnchen -URL: https://www.sueddeutsche.de/muenchen/rss -Active: True -================================================================================ -Fetching RSS feed... -βœ“ Found 20 entries - ---- Entry 1 --- -Title: New U-Bahn Line Opens in Munich -βœ“ URL extracted: https://www.sueddeutsche.de/muenchen/article-123 -βœ“ Summary: The new U-Bahn line connecting the city center... -βœ“ Date: Mon, 10 Nov 2024 10:00:00 +0100 - ---- Entry 2 --- -Title: Munich Weather Update -βœ“ URL extracted: https://www.sueddeutsche.de/muenchen/article-124 -βœ“ Summary: Weather forecast for the week... -βœ“ Date: Mon, 10 Nov 2024 09:30:00 +0100 - -... -``` - -## If No Feeds Found - -Add a feed first: - -```bash -curl -X POST http://localhost:5001/api/rss-feeds \ - -H "Content-Type: application/json" \ - -d '{"name": "SΓΌddeutsche Politik", "url": "https://rss.sueddeutsche.de/rss/Politik"}' -``` - -## Testing News Crawler - -Once feeds are verified, test the crawler: - -```bash -# 1. Install crawler dependencies -cd news_crawler -pip install -r requirements.txt - -# 2. Run the test -python test_rss_feeds.py - -# 3. Or run the actual crawler -python crawler_service.py 5 -``` - -## Troubleshooting - -### "No module named 'pymongo'" -- Activate the backend virtual environment first -- Or install dependencies: `pip install -r backend/requirements.txt` - -### "No RSS feeds in database" -- Make sure backend is running -- Add feeds via API (see above) -- Or check if MongoDB is running: `docker-compose ps` - -### "Could not extract URL" -- The test will show available fields -- Check if the feed uses `guid`, `id`, or `links` instead of `link` -- Our utility should handle most cases automatically - -### "No entries found" -- The RSS feed URL might be invalid -- Try opening the URL in a browser -- Check if it returns valid XML - -## Manual Database Check - -Using mongosh: - -```bash -mongosh -use munich_news -db.rss_feeds.find() -db.articles.find().limit(3) -``` - -## What to Look For - -βœ… **Good signs:** -- URLs are extracted successfully -- URLs start with `http://` or `https://` -- Summaries are present -- Dates are extracted - -⚠️ **Warning signs:** -- "Could not extract URL" messages -- Empty summaries (not critical) -- Missing dates (not critical) - -❌ **Problems:** -- No entries found in feed -- All URL extractions fail -- Feed parsing errors diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000..3717b9d --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,28 @@ +# MongoDB Configuration +MONGODB_URI=mongodb://localhost:27017/ + +# Email Configuration (Required) +SMTP_SERVER=smtp.gmail.com +SMTP_PORT=587 +EMAIL_USER=your-email@gmail.com +EMAIL_PASSWORD=your-app-password + +# Newsletter Settings +NEWSLETTER_MAX_ARTICLES=10 +NEWSLETTER_HOURS_LOOKBACK=24 +WEBSITE_URL=http://localhost:3000 + +# Tracking Configuration +TRACKING_ENABLED=true +TRACKING_API_URL=http://localhost:5001 +TRACKING_DATA_RETENTION_DAYS=90 + +# Ollama Configuration (AI Summarization) +OLLAMA_ENABLED=true +OLLAMA_BASE_URL=http://127.0.0.1:11434 +OLLAMA_MODEL=phi3:latest +OLLAMA_TIMEOUT=120 +SUMMARY_MAX_WORDS=150 + +# Flask Server Configuration +FLASK_PORT=5001 diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..a017b75 --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application files +COPY . . + +# Set timezone to Berlin +ENV TZ=Europe/Berlin +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +# Expose Flask port +EXPOSE 5001 + +# Run the Flask application +CMD ["python", "-u", "app.py"] diff --git a/backend/app.py b/backend/app.py index 10f8e88..727ebdf 100644 --- a/backend/app.py +++ b/backend/app.py @@ -7,6 +7,8 @@ from routes.news_routes import news_bp from routes.rss_routes import rss_bp from routes.ollama_routes import ollama_bp from routes.newsletter_routes import newsletter_bp +from routes.tracking_routes import tracking_bp +from routes.analytics_routes import analytics_bp # Initialize Flask app app = Flask(__name__) @@ -21,9 +23,17 @@ app.register_blueprint(news_bp) app.register_blueprint(rss_bp) app.register_blueprint(ollama_bp) app.register_blueprint(newsletter_bp) +app.register_blueprint(tracking_bp) +app.register_blueprint(analytics_bp) + +# Health check endpoint +@app.route('/health') +def health(): + return {'status': 'healthy', 'service': 'munich-news-backend'}, 200 # Print configuration Config.print_config() if __name__ == '__main__': - app.run(debug=True, port=Config.FLASK_PORT, host='127.0.0.1') + # Use 0.0.0.0 to allow Docker container access + app.run(debug=True, port=Config.FLASK_PORT, host='0.0.0.0') diff --git a/backend/config.py b/backend/config.py index e206865..4d429bb 100644 --- a/backend/config.py +++ b/backend/config.py @@ -40,6 +40,11 @@ class Config: # Flask FLASK_PORT = int(os.getenv('FLASK_PORT', '5000')) + # Tracking + TRACKING_ENABLED = os.getenv('TRACKING_ENABLED', 'true').lower() == 'true' + TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}') + TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90')) + @classmethod def print_config(cls): """Print configuration (without sensitive data)""" @@ -50,3 +55,5 @@ class Config: print(f" Ollama Base URL: {cls.OLLAMA_BASE_URL}") print(f" Ollama Model: {cls.OLLAMA_MODEL}") print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}") + print(f" Tracking Enabled: {cls.TRACKING_ENABLED}") + print(f" Tracking API URL: {cls.TRACKING_API_URL}") diff --git a/backend/database.py b/backend/database.py index 0adbe9f..ba19981 100644 --- a/backend/database.py +++ b/backend/database.py @@ -11,6 +11,11 @@ articles_collection = db['articles'] subscribers_collection = db['subscribers'] rss_feeds_collection = db['rss_feeds'] +# Tracking Collections +newsletter_sends_collection = db['newsletter_sends'] +link_clicks_collection = db['link_clicks'] +subscriber_activity_collection = db['subscriber_activity'] + def init_db(): """Initialize database with indexes""" @@ -25,6 +30,9 @@ def init_db(): # Create unique index on RSS feed URLs rss_feeds_collection.create_index('url', unique=True) + # Initialize tracking collections indexes + init_tracking_collections() + # Initialize default RSS feeds if collection is empty if rss_feeds_collection.count_documents({}) == 0: default_feeds = [ @@ -51,3 +59,37 @@ def init_db(): print(f"Initialized {len(default_feeds)} default RSS feeds") print("Database initialized with indexes") + + +def init_tracking_collections(): + """Initialize tracking collections with indexes for email tracking system""" + + # Newsletter Sends Collection Indexes + # Unique index on tracking_id for fast pixel/click lookups + newsletter_sends_collection.create_index('tracking_id', unique=True) + # Index on newsletter_id for analytics queries + newsletter_sends_collection.create_index('newsletter_id') + # Index on subscriber_email for user activity queries + newsletter_sends_collection.create_index('subscriber_email') + # Index on sent_at for time-based queries + newsletter_sends_collection.create_index('sent_at') + + # Link Clicks Collection Indexes + # Unique index on tracking_id for fast redirect lookups + link_clicks_collection.create_index('tracking_id', unique=True) + # Index on newsletter_id for analytics queries + link_clicks_collection.create_index('newsletter_id') + # Index on article_url for article performance queries + link_clicks_collection.create_index('article_url') + # Index on subscriber_email for user activity queries + link_clicks_collection.create_index('subscriber_email') + + # Subscriber Activity Collection Indexes + # Unique index on email for fast lookups + subscriber_activity_collection.create_index('email', unique=True) + # Index on status for filtering by activity level + subscriber_activity_collection.create_index('status') + # Index on last_opened_at for time-based queries + subscriber_activity_collection.create_index('last_opened_at') + + print("Tracking collections initialized with indexes") diff --git a/backend/env.template b/backend/env.template index ddb40be..1dd1497 100644 --- a/backend/env.template +++ b/backend/env.template @@ -30,3 +30,12 @@ OLLAMA_TIMEOUT=30 # Port for Flask server (default: 5001 to avoid AirPlay conflict on macOS) FLASK_PORT=5001 +# Tracking Configuration +# Enable/disable email tracking features (true/false) +TRACKING_ENABLED=true +# Base URL for tracking API (used in tracking pixel and link URLs) +# In production, use your actual domain (e.g., https://yourdomain.com) +TRACKING_API_URL=http://localhost:5001 +# Number of days to retain tracking data before anonymization +TRACKING_DATA_RETENTION_DAYS=90 + diff --git a/backend/init_tracking_db.py b/backend/init_tracking_db.py new file mode 100644 index 0000000..c66cd7d --- /dev/null +++ b/backend/init_tracking_db.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +""" +Database initialization script for email tracking system. + +This script creates the necessary MongoDB collections and indexes +for tracking email opens and link clicks in the newsletter system. + +Collections created: +- newsletter_sends: Tracks each newsletter sent to each subscriber +- link_clicks: Tracks individual link clicks +- subscriber_activity: Aggregated activity status for each subscriber + +Usage: + python init_tracking_db.py +""" + +from pymongo import MongoClient, ASCENDING +from config import Config +from datetime import datetime + + +def init_tracking_database(): + """Initialize tracking collections with proper indexes""" + + print("Connecting to MongoDB...") + client = MongoClient(Config.MONGODB_URI) + db = client[Config.DB_NAME] + + print(f"Connected to database: {Config.DB_NAME}") + + # Get collection references + newsletter_sends = db['newsletter_sends'] + link_clicks = db['link_clicks'] + subscriber_activity = db['subscriber_activity'] + + print("\n=== Setting up Newsletter Sends Collection ===") + # Newsletter Sends Collection Indexes + newsletter_sends.create_index('tracking_id', unique=True) + print("βœ“ Created unique index on 'tracking_id'") + + newsletter_sends.create_index('newsletter_id') + print("βœ“ Created index on 'newsletter_id'") + + newsletter_sends.create_index('subscriber_email') + print("βœ“ Created index on 'subscriber_email'") + + newsletter_sends.create_index('sent_at') + print("βœ“ Created index on 'sent_at'") + + print("\n=== Setting up Link Clicks Collection ===") + # Link Clicks Collection Indexes + link_clicks.create_index('tracking_id', unique=True) + print("βœ“ Created unique index on 'tracking_id'") + + link_clicks.create_index('newsletter_id') + print("βœ“ Created index on 'newsletter_id'") + + link_clicks.create_index('article_url') + print("βœ“ Created index on 'article_url'") + + link_clicks.create_index('subscriber_email') + print("βœ“ Created index on 'subscriber_email'") + + print("\n=== Setting up Subscriber Activity Collection ===") + # Subscriber Activity Collection Indexes + subscriber_activity.create_index('email', unique=True) + print("βœ“ Created unique index on 'email'") + + subscriber_activity.create_index('status') + print("βœ“ Created index on 'status'") + + subscriber_activity.create_index('last_opened_at') + print("βœ“ Created index on 'last_opened_at'") + + # Display collection statistics + print("\n=== Collection Statistics ===") + print(f"newsletter_sends: {newsletter_sends.count_documents({})} documents") + print(f"link_clicks: {link_clicks.count_documents({})} documents") + print(f"subscriber_activity: {subscriber_activity.count_documents({})} documents") + + # List all indexes for verification + print("\n=== Index Verification ===") + print("\nNewsletter Sends Indexes:") + for index in newsletter_sends.list_indexes(): + print(f" - {index['name']}: {index.get('key', {})}") + + print("\nLink Clicks Indexes:") + for index in link_clicks.list_indexes(): + print(f" - {index['name']}: {index.get('key', {})}") + + print("\nSubscriber Activity Indexes:") + for index in subscriber_activity.list_indexes(): + print(f" - {index['name']}: {index.get('key', {})}") + + print("\nβœ… Tracking database initialization complete!") + + client.close() + + +if __name__ == '__main__': + try: + init_tracking_database() + except Exception as e: + print(f"\n❌ Error initializing tracking database: {e}") + import traceback + traceback.print_exc() + exit(1) diff --git a/backend/routes/analytics_routes.py b/backend/routes/analytics_routes.py new file mode 100644 index 0000000..3e9a94f --- /dev/null +++ b/backend/routes/analytics_routes.py @@ -0,0 +1,127 @@ +""" +Analytics routes for email tracking metrics and subscriber engagement. +""" + +from flask import Blueprint, jsonify, request +from services.analytics_service import ( + get_newsletter_metrics, + get_article_performance, + get_subscriber_activity_status, + update_subscriber_activity_statuses +) +from database import subscriber_activity_collection + +analytics_bp = Blueprint('analytics', __name__) + + +@analytics_bp.route('/api/analytics/newsletter/', methods=['GET']) +def get_newsletter_analytics(newsletter_id): + """ + Get comprehensive metrics for a specific newsletter. + + Args: + newsletter_id: Unique identifier for the newsletter batch + + Returns: + JSON response with newsletter metrics including: + - total_sent, total_opened, open_rate + - total_clicks, unique_clickers, click_through_rate + """ + try: + metrics = get_newsletter_metrics(newsletter_id) + return jsonify(metrics), 200 + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@analytics_bp.route('/api/analytics/article/', methods=['GET']) +def get_article_analytics(article_url): + """ + Get performance metrics for a specific article. + + Args: + article_url: The original article URL (passed as path parameter) + + Returns: + JSON response with article performance metrics including: + - total_sent, total_clicks, click_rate + - unique_clickers, newsletters + """ + try: + performance = get_article_performance(article_url) + return jsonify(performance), 200 + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@analytics_bp.route('/api/analytics/subscriber/', methods=['GET']) +def get_subscriber_analytics(email): + """ + Get activity status and engagement metrics for a specific subscriber. + + Args: + email: Subscriber email address + + Returns: + JSON response with subscriber activity data including: + - status, last_opened_at, last_clicked_at + - total_opens, total_clicks + - newsletters_received, newsletters_opened + """ + try: + # Get current activity status + status = get_subscriber_activity_status(email) + + # Get detailed activity record from database + activity_record = subscriber_activity_collection.find_one( + {'email': email}, + {'_id': 0} # Exclude MongoDB _id field + ) + + if activity_record: + # Convert datetime objects to ISO format strings + if activity_record.get('last_opened_at'): + activity_record['last_opened_at'] = activity_record['last_opened_at'].isoformat() + if activity_record.get('last_clicked_at'): + activity_record['last_clicked_at'] = activity_record['last_clicked_at'].isoformat() + if activity_record.get('updated_at'): + activity_record['updated_at'] = activity_record['updated_at'].isoformat() + + return jsonify(activity_record), 200 + else: + # Return basic status if no detailed record exists yet + return jsonify({ + 'email': email, + 'status': status, + 'last_opened_at': None, + 'last_clicked_at': None, + 'total_opens': 0, + 'total_clicks': 0, + 'newsletters_received': 0, + 'newsletters_opened': 0 + }), 200 + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@analytics_bp.route('/api/analytics/update-activity', methods=['POST']) +def update_activity_statuses(): + """ + Trigger batch update of subscriber activity statuses. + + Updates the subscriber_activity collection with current engagement + metrics for all subscribers. + + Returns: + JSON response with count of updated records + """ + try: + updated_count = update_subscriber_activity_statuses() + return jsonify({ + 'success': True, + 'updated_count': updated_count, + 'message': f'Updated activity status for {updated_count} subscribers' + }), 200 + except Exception as e: + return jsonify({'error': str(e)}), 500 diff --git a/backend/routes/tracking_routes.py b/backend/routes/tracking_routes.py new file mode 100644 index 0000000..2982fa8 --- /dev/null +++ b/backend/routes/tracking_routes.py @@ -0,0 +1,285 @@ +""" +Tracking routes for email open and link click tracking. +""" + +from flask import Blueprint, request, redirect, make_response, jsonify +from datetime import datetime +import base64 +from database import newsletter_sends_collection, link_clicks_collection +from services.tracking_service import delete_subscriber_tracking_data, anonymize_old_tracking_data +from config import Config + +tracking_bp = Blueprint('tracking', __name__) + +# 1x1 transparent PNG image (43 bytes) +TRANSPARENT_PNG = base64.b64decode( + 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=' +) + + +@tracking_bp.route('/api/track/pixel/', methods=['GET']) +def track_pixel(tracking_id): + """ + Track email opens via tracking pixel. + + Serves a 1x1 transparent PNG image and logs the email open event. + Handles multiple opens by updating last_opened_at and open_count. + Fails silently if tracking_id is invalid to avoid breaking email rendering. + + Args: + tracking_id: Unique tracking ID for the newsletter send + + Returns: + Response: 1x1 transparent PNG image with proper headers + """ + try: + # Look up tracking record + tracking_record = newsletter_sends_collection.find_one({'tracking_id': tracking_id}) + + if tracking_record: + # Get user agent for logging + user_agent = request.headers.get('User-Agent', '') + current_time = datetime.utcnow() + + # Update tracking record + update_data = { + 'opened': True, + 'last_opened_at': current_time, + 'user_agent': user_agent + } + + # Set first_opened_at only if this is the first open + if not tracking_record.get('opened'): + update_data['first_opened_at'] = current_time + + # Increment open count + newsletter_sends_collection.update_one( + {'tracking_id': tracking_id}, + { + '$set': update_data, + '$inc': {'open_count': 1} + } + ) + except Exception as e: + # Log error but don't fail - we still want to return the pixel + print(f"Error tracking pixel for {tracking_id}: {str(e)}") + + # Always return the transparent PNG, even if tracking fails + response = make_response(TRANSPARENT_PNG) + response.headers['Content-Type'] = 'image/png' + response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate' + response.headers['Pragma'] = 'no-cache' + response.headers['Expires'] = '0' + + return response + + +@tracking_bp.route('/api/track/click/', methods=['GET']) +def track_click(tracking_id): + """ + Track link clicks and redirect to original article URL. + + Logs the click event and redirects the user to the original article URL. + Handles invalid tracking_id by redirecting to homepage. + Ensures redirect completes within 200ms. + + Args: + tracking_id: Unique tracking ID for the article link + + Returns: + Response: 302 redirect to original article URL or homepage + """ + # Default redirect URL (homepage) + redirect_url = Config.TRACKING_API_URL or 'http://localhost:5001' + + try: + # Look up tracking record + tracking_record = link_clicks_collection.find_one({'tracking_id': tracking_id}) + + if tracking_record: + # Get the original article URL + redirect_url = tracking_record.get('article_url', redirect_url) + + # Get user agent for logging + user_agent = request.headers.get('User-Agent', '') + current_time = datetime.utcnow() + + # Update tracking record with click event + link_clicks_collection.update_one( + {'tracking_id': tracking_id}, + { + '$set': { + 'clicked': True, + 'clicked_at': current_time, + 'user_agent': user_agent + } + } + ) + except Exception as e: + # Log error but still redirect + print(f"Error tracking click for {tracking_id}: {str(e)}") + + # Redirect to the article URL (or homepage if tracking failed) + return redirect(redirect_url, code=302) + + + +@tracking_bp.route('/api/tracking/subscriber/', methods=['DELETE']) +def delete_subscriber_data(email): + """ + Delete all tracking data for a specific subscriber. + + Removes all tracking records associated with the subscriber's email address + from all tracking collections (newsletter_sends, link_clicks, subscriber_activity). + Supports GDPR right to be forgotten. + + Args: + email: Email address of the subscriber + + Returns: + JSON response with deletion counts and confirmation message + """ + try: + # Delete all tracking data for the subscriber + result = delete_subscriber_tracking_data(email) + + return jsonify({ + 'success': True, + 'message': f'All tracking data deleted for {email}', + 'deleted_counts': result + }), 200 + + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + + +@tracking_bp.route('/api/tracking/anonymize', methods=['POST']) +def anonymize_tracking_data(): + """ + Anonymize tracking data older than the retention period. + + Removes email addresses from old tracking records while preserving + aggregated metrics. Default retention period is 90 days. + + Request body (optional): + { + "retention_days": 90 // Number of days to retain personal data + } + + Returns: + JSON response with anonymization counts + """ + try: + # Get retention days from request body (default: 90) + data = request.get_json() or {} + retention_days = data.get('retention_days', 90) + + # Validate retention_days + if not isinstance(retention_days, int) or retention_days < 1: + return jsonify({ + 'success': False, + 'error': 'retention_days must be a positive integer' + }), 400 + + # Anonymize old tracking data + result = anonymize_old_tracking_data(retention_days) + + return jsonify({ + 'success': True, + 'message': f'Anonymized tracking data older than {retention_days} days', + 'anonymized_counts': result + }), 200 + + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + + + +@tracking_bp.route('/api/tracking/subscriber//opt-out', methods=['POST']) +def opt_out_tracking(email): + """ + Opt a subscriber out of tracking. + + Sets the tracking_enabled field to False for the subscriber, + preventing future tracking of their email opens and link clicks. + + Args: + email: Email address of the subscriber + + Returns: + JSON response with confirmation message + """ + try: + from database import subscribers_collection + + # Update subscriber to opt out of tracking + result = subscribers_collection.update_one( + {'email': email}, + {'$set': {'tracking_enabled': False}}, + upsert=False + ) + + if result.matched_count == 0: + return jsonify({ + 'success': False, + 'error': f'Subscriber {email} not found' + }), 404 + + return jsonify({ + 'success': True, + 'message': f'Subscriber {email} has opted out of tracking' + }), 200 + + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 + + +@tracking_bp.route('/api/tracking/subscriber//opt-in', methods=['POST']) +def opt_in_tracking(email): + """ + Opt a subscriber back into tracking. + + Sets the tracking_enabled field to True for the subscriber, + enabling tracking of their email opens and link clicks. + + Args: + email: Email address of the subscriber + + Returns: + JSON response with confirmation message + """ + try: + from database import subscribers_collection + + # Update subscriber to opt in to tracking + result = subscribers_collection.update_one( + {'email': email}, + {'$set': {'tracking_enabled': True}}, + upsert=False + ) + + if result.matched_count == 0: + return jsonify({ + 'success': False, + 'error': f'Subscriber {email} not found' + }), 404 + + return jsonify({ + 'success': True, + 'message': f'Subscriber {email} has opted in to tracking' + }), 200 + + except Exception as e: + return jsonify({ + 'success': False, + 'error': str(e) + }), 500 diff --git a/backend/services/analytics_service.py b/backend/services/analytics_service.py new file mode 100644 index 0000000..56c79e4 --- /dev/null +++ b/backend/services/analytics_service.py @@ -0,0 +1,306 @@ +""" +Analytics service for email tracking metrics and subscriber engagement. +Calculates open rates, click rates, and subscriber activity status. +""" + +from datetime import datetime, timedelta +from typing import Dict, Optional +from database import ( + newsletter_sends_collection, + link_clicks_collection, + subscriber_activity_collection +) + + +def get_open_rate(newsletter_id: str) -> float: + """ + Calculate the percentage of subscribers who opened a specific newsletter. + + Args: + newsletter_id: Unique identifier for the newsletter batch + + Returns: + float: Open rate as a percentage (0-100) + """ + # Count total sends for this newsletter + total_sends = newsletter_sends_collection.count_documents({ + 'newsletter_id': newsletter_id + }) + + if total_sends == 0: + return 0.0 + + # Count how many were opened + opened_count = newsletter_sends_collection.count_documents({ + 'newsletter_id': newsletter_id, + 'opened': True + }) + + # Calculate percentage + open_rate = (opened_count / total_sends) * 100 + return round(open_rate, 2) + + +def get_click_rate(article_url: str) -> float: + """ + Calculate the percentage of subscribers who clicked a specific article link. + + Args: + article_url: The original article URL + + Returns: + float: Click rate as a percentage (0-100) + """ + # Count total link tracking records for this article + total_links = link_clicks_collection.count_documents({ + 'article_url': article_url + }) + + if total_links == 0: + return 0.0 + + # Count how many were clicked + clicked_count = link_clicks_collection.count_documents({ + 'article_url': article_url, + 'clicked': True + }) + + # Calculate percentage + click_rate = (clicked_count / total_links) * 100 + return round(click_rate, 2) + + +def get_newsletter_metrics(newsletter_id: str) -> Dict: + """ + Get comprehensive metrics for a specific newsletter. + + Args: + newsletter_id: Unique identifier for the newsletter batch + + Returns: + dict: Dictionary containing: + - newsletter_id: The newsletter ID + - total_sent: Total number of emails sent + - total_opened: Number of emails opened + - open_rate: Percentage of emails opened + - total_clicks: Total number of link clicks + - unique_clickers: Number of unique subscribers who clicked + - click_through_rate: Percentage of recipients who clicked any link + """ + # Get total sends + total_sent = newsletter_sends_collection.count_documents({ + 'newsletter_id': newsletter_id + }) + + # Get total opened + total_opened = newsletter_sends_collection.count_documents({ + 'newsletter_id': newsletter_id, + 'opened': True + }) + + # Calculate open rate + open_rate = (total_opened / total_sent * 100) if total_sent > 0 else 0.0 + + # Get total clicks for this newsletter + total_clicks = link_clicks_collection.count_documents({ + 'newsletter_id': newsletter_id, + 'clicked': True + }) + + # Get unique clickers (distinct subscriber emails who clicked) + unique_clickers = len(link_clicks_collection.distinct( + 'subscriber_email', + {'newsletter_id': newsletter_id, 'clicked': True} + )) + + # Calculate click-through rate (unique clickers / total sent) + click_through_rate = (unique_clickers / total_sent * 100) if total_sent > 0 else 0.0 + + return { + 'newsletter_id': newsletter_id, + 'total_sent': total_sent, + 'total_opened': total_opened, + 'open_rate': round(open_rate, 2), + 'total_clicks': total_clicks, + 'unique_clickers': unique_clickers, + 'click_through_rate': round(click_through_rate, 2) + } + + +def get_article_performance(article_url: str) -> Dict: + """ + Get performance metrics for a specific article across all newsletters. + + Args: + article_url: The original article URL + + Returns: + dict: Dictionary containing: + - article_url: The article URL + - total_sent: Total times this article was sent + - total_clicks: Total number of clicks + - click_rate: Percentage of recipients who clicked + - unique_clickers: Number of unique subscribers who clicked + - newsletters: List of newsletter IDs that included this article + """ + # Get all link tracking records for this article + total_sent = link_clicks_collection.count_documents({ + 'article_url': article_url + }) + + # Get total clicks + total_clicks = link_clicks_collection.count_documents({ + 'article_url': article_url, + 'clicked': True + }) + + # Calculate click rate + click_rate = (total_clicks / total_sent * 100) if total_sent > 0 else 0.0 + + # Get unique clickers + unique_clickers = len(link_clicks_collection.distinct( + 'subscriber_email', + {'article_url': article_url, 'clicked': True} + )) + + # Get list of newsletters that included this article + newsletters = link_clicks_collection.distinct( + 'newsletter_id', + {'article_url': article_url} + ) + + return { + 'article_url': article_url, + 'total_sent': total_sent, + 'total_clicks': total_clicks, + 'click_rate': round(click_rate, 2), + 'unique_clickers': unique_clickers, + 'newsletters': newsletters + } + + +def get_subscriber_activity_status(email: str) -> str: + """ + Get the activity status for a specific subscriber. + + Classifies subscribers based on their last email open: + - 'active': Opened an email in the last 30 days + - 'inactive': No opens in 30-60 days + - 'dormant': No opens in 60+ days + - 'new': No opens yet + + Args: + email: Subscriber email address + + Returns: + str: Activity status ('active', 'inactive', 'dormant', or 'new') + """ + # Find the most recent open for this subscriber + most_recent_open = newsletter_sends_collection.find_one( + {'subscriber_email': email, 'opened': True}, + sort=[('last_opened_at', -1)] + ) + + if not most_recent_open: + # Check if subscriber has received any newsletters + has_received = newsletter_sends_collection.count_documents({ + 'subscriber_email': email + }) > 0 + + return 'new' if has_received else 'new' + + # Calculate days since last open + last_opened_at = most_recent_open.get('last_opened_at') + if not last_opened_at: + return 'new' + + days_since_open = (datetime.utcnow() - last_opened_at).days + + # Classify based on days since last open + if days_since_open <= 30: + return 'active' + elif days_since_open <= 60: + return 'inactive' + else: + return 'dormant' + + +def update_subscriber_activity_statuses() -> int: + """ + Batch update activity statuses for all subscribers. + + Updates the subscriber_activity collection with current activity status, + engagement metrics, and last interaction timestamps for all subscribers + who have received newsletters. + + Returns: + int: Number of subscriber records updated + """ + # Get all unique subscriber emails from newsletter sends + all_subscribers = newsletter_sends_collection.distinct('subscriber_email') + + updated_count = 0 + + for email in all_subscribers: + # Get activity status + status = get_subscriber_activity_status(email) + + # Get last opened timestamp + last_open_record = newsletter_sends_collection.find_one( + {'subscriber_email': email, 'opened': True}, + sort=[('last_opened_at', -1)] + ) + last_opened_at = last_open_record.get('last_opened_at') if last_open_record else None + + # Get last clicked timestamp + last_click_record = link_clicks_collection.find_one( + {'subscriber_email': email, 'clicked': True}, + sort=[('clicked_at', -1)] + ) + last_clicked_at = last_click_record.get('clicked_at') if last_click_record else None + + # Count total opens + total_opens = newsletter_sends_collection.count_documents({ + 'subscriber_email': email, + 'opened': True + }) + + # Count total clicks + total_clicks = link_clicks_collection.count_documents({ + 'subscriber_email': email, + 'clicked': True + }) + + # Count newsletters received + newsletters_received = newsletter_sends_collection.count_documents({ + 'subscriber_email': email + }) + + # Count newsletters opened (distinct newsletter_ids) + newsletters_opened = len(newsletter_sends_collection.distinct( + 'newsletter_id', + {'subscriber_email': email, 'opened': True} + )) + + # Update or insert subscriber activity record + subscriber_activity_collection.update_one( + {'email': email}, + { + '$set': { + 'email': email, + 'status': status, + 'last_opened_at': last_opened_at, + 'last_clicked_at': last_clicked_at, + 'total_opens': total_opens, + 'total_clicks': total_clicks, + 'newsletters_received': newsletters_received, + 'newsletters_opened': newsletters_opened, + 'updated_at': datetime.utcnow() + } + }, + upsert=True + ) + + updated_count += 1 + + return updated_count diff --git a/backend/services/tracking_service.py b/backend/services/tracking_service.py new file mode 100644 index 0000000..ff065db --- /dev/null +++ b/backend/services/tracking_service.py @@ -0,0 +1,215 @@ +""" +Email tracking service for Munich News Daily newsletter system. +Handles tracking ID generation and tracking record creation. +""" + +import uuid +from datetime import datetime, timedelta +from typing import Dict, List, Optional +from database import newsletter_sends_collection, link_clicks_collection, subscriber_activity_collection, subscribers_collection + + +def generate_tracking_id() -> str: + """ + Generate a unique tracking ID using UUID4. + + Returns: + str: A unique UUID4 string for tracking purposes + """ + return str(uuid.uuid4()) + + +def create_newsletter_tracking( + newsletter_id: str, + subscriber_email: str, + article_links: Optional[List[Dict[str, str]]] = None +) -> Dict[str, any]: + """ + Create tracking records for a newsletter send. + + Creates a tracking record in newsletter_sends collection for email open tracking, + and creates tracking records in link_clicks collection for each article link. + Respects subscriber opt-out preferences. + + Args: + newsletter_id: Unique identifier for the newsletter batch (e.g., date-based) + subscriber_email: Email address of the recipient + article_links: Optional list of article dictionaries with 'url' and 'title' keys + + Returns: + dict: Tracking information containing: + - pixel_tracking_id: ID for the tracking pixel (None if opted out) + - link_tracking_map: Dict mapping original URLs to tracking IDs (empty if opted out) + - newsletter_id: The newsletter batch ID + - subscriber_email: The recipient email + - tracking_enabled: Boolean indicating if tracking is enabled for this subscriber + """ + # Check if subscriber has opted out of tracking + subscriber = subscribers_collection.find_one({'email': subscriber_email}) + tracking_enabled = subscriber.get('tracking_enabled', True) if subscriber else True + + # If tracking is disabled, return empty tracking data + if not tracking_enabled: + return { + 'pixel_tracking_id': None, + 'link_tracking_map': {}, + 'newsletter_id': newsletter_id, + 'subscriber_email': subscriber_email, + 'tracking_enabled': False + } + + # Generate tracking ID for the email open pixel + pixel_tracking_id = generate_tracking_id() + + # Create newsletter send tracking record + newsletter_send_doc = { + 'newsletter_id': newsletter_id, + 'subscriber_email': subscriber_email, + 'tracking_id': pixel_tracking_id, + 'sent_at': datetime.utcnow(), + 'opened': False, + 'first_opened_at': None, + 'last_opened_at': None, + 'open_count': 0, + 'created_at': datetime.utcnow() + } + + newsletter_sends_collection.insert_one(newsletter_send_doc) + + # Create tracking records for article links + link_tracking_map = {} + + if article_links: + for article in article_links: + article_url = article.get('url') + article_title = article.get('title', '') + + if article_url: + link_tracking_id = generate_tracking_id() + + # Create link click tracking record + link_click_doc = { + 'tracking_id': link_tracking_id, + 'newsletter_id': newsletter_id, + 'subscriber_email': subscriber_email, + 'article_url': article_url, + 'article_title': article_title, + 'clicked': False, + 'clicked_at': None, + 'user_agent': None, + 'created_at': datetime.utcnow() + } + + link_clicks_collection.insert_one(link_click_doc) + + # Map original URL to tracking ID + link_tracking_map[article_url] = link_tracking_id + + return { + 'pixel_tracking_id': pixel_tracking_id, + 'link_tracking_map': link_tracking_map, + 'newsletter_id': newsletter_id, + 'subscriber_email': subscriber_email, + 'tracking_enabled': True + } + + + +def anonymize_old_tracking_data(retention_days: int = 90) -> Dict[str, int]: + """ + Anonymize tracking data older than the specified retention period. + + Removes email addresses from tracking records while preserving aggregated metrics. + This helps comply with privacy regulations by not retaining personal data indefinitely. + + Args: + retention_days: Number of days to retain personal data (default: 90) + + Returns: + dict: Count of anonymized records for each collection: + - newsletter_sends_anonymized: Number of newsletter send records anonymized + - link_clicks_anonymized: Number of link click records anonymized + - total_anonymized: Total number of records anonymized + """ + cutoff_date = datetime.utcnow() - timedelta(days=retention_days) + + # Anonymize newsletter_sends records + newsletter_result = newsletter_sends_collection.update_many( + { + 'sent_at': {'$lt': cutoff_date}, + 'subscriber_email': {'$ne': 'anonymized'} # Don't re-anonymize + }, + { + '$set': { + 'subscriber_email': 'anonymized', + 'anonymized_at': datetime.utcnow() + } + } + ) + + # Anonymize link_clicks records + link_clicks_result = link_clicks_collection.update_many( + { + 'created_at': {'$lt': cutoff_date}, + 'subscriber_email': {'$ne': 'anonymized'} # Don't re-anonymize + }, + { + '$set': { + 'subscriber_email': 'anonymized', + 'anonymized_at': datetime.utcnow() + } + } + ) + + newsletter_count = newsletter_result.modified_count + link_clicks_count = link_clicks_result.modified_count + + return { + 'newsletter_sends_anonymized': newsletter_count, + 'link_clicks_anonymized': link_clicks_count, + 'total_anonymized': newsletter_count + link_clicks_count + } + + +def delete_subscriber_tracking_data(subscriber_email: str) -> Dict[str, int]: + """ + Delete all tracking data for a specific subscriber. + + Removes all tracking records associated with a subscriber's email address + from all tracking collections. This supports GDPR right to be forgotten. + + Args: + subscriber_email: Email address of the subscriber + + Returns: + dict: Count of deleted records for each collection: + - newsletter_sends_deleted: Number of newsletter send records deleted + - link_clicks_deleted: Number of link click records deleted + - subscriber_activity_deleted: Number of activity records deleted + - total_deleted: Total number of records deleted + """ + # Delete from newsletter_sends + newsletter_result = newsletter_sends_collection.delete_many({ + 'subscriber_email': subscriber_email + }) + + # Delete from link_clicks + link_clicks_result = link_clicks_collection.delete_many({ + 'subscriber_email': subscriber_email + }) + + # Delete from subscriber_activity + activity_result = subscriber_activity_collection.delete_many({ + 'email': subscriber_email + }) + + newsletter_count = newsletter_result.deleted_count + link_clicks_count = link_clicks_result.deleted_count + activity_count = activity_result.deleted_count + + return { + 'newsletter_sends_deleted': newsletter_count, + 'link_clicks_deleted': link_clicks_count, + 'subscriber_activity_deleted': activity_count, + 'total_deleted': newsletter_count + link_clicks_count + activity_count + } diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml deleted file mode 100644 index 431dc6f..0000000 --- a/docker-compose.prod.yml +++ /dev/null @@ -1,33 +0,0 @@ -version: '3.8' - -# Production version with authentication enabled -# Usage: docker-compose -f docker-compose.prod.yml up -d - -services: - mongodb: - image: mongo:7.0 - container_name: munich-news-mongodb - restart: unless-stopped - ports: - - "27017:27017" - environment: - MONGO_INITDB_ROOT_USERNAME: admin - MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme} - MONGO_INITDB_DATABASE: munich_news - volumes: - - mongodb_data:/data/db - - mongodb_config:/data/configdb - networks: - - munich-news-network - command: mongod --bind_ip_all --auth - -volumes: - mongodb_data: - driver: local - mongodb_config: - driver: local - -networks: - munich-news-network: - driver: bridge - diff --git a/docker-compose.yml b/docker-compose.yml index eafab4b..45ded0b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,24 +1,106 @@ version: '3.8' services: + # MongoDB Database mongodb: - image: mongo:7.0 + image: mongo:latest container_name: munich-news-mongodb restart: unless-stopped ports: - "27017:27017" - # For development: MongoDB runs without authentication - # For production: Uncomment the environment variables below and update MONGODB_URI - # environment: - # MONGO_INITDB_ROOT_USERNAME: admin - # MONGO_INITDB_ROOT_PASSWORD: password - # MONGO_INITDB_DATABASE: munich_news + environment: + # For production, set MONGO_PASSWORD environment variable + MONGO_INITDB_ROOT_USERNAME: ${MONGO_USERNAME:-admin} + MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme} + MONGO_INITDB_DATABASE: munich_news volumes: - mongodb_data:/data/db - mongodb_config:/data/configdb networks: - munich-news-network - command: mongod --bind_ip_all + command: mongod --bind_ip_all ${MONGO_AUTH:---auth} + healthcheck: + test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet + interval: 30s + timeout: 10s + retries: 3 + + # News Crawler - Runs at 6 AM Berlin time + crawler: + build: + context: . + dockerfile: news_crawler/Dockerfile + container_name: munich-news-crawler + restart: unless-stopped + depends_on: + - mongodb + environment: + - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/ + - TZ=Europe/Berlin + volumes: + - ./backend/.env:/app/.env:ro + - ./backend/config.py:/app/config.py:ro + - ./backend/ollama_client.py:/app/ollama_client.py:ro + - ./news_crawler:/app:ro + networks: + - munich-news-network + healthcheck: + test: ["CMD", "python", "-c", "import sys; sys.exit(0)"] + interval: 1m + timeout: 10s + retries: 3 + + # Backend API - Tracking and analytics + backend: + build: + context: ./backend + dockerfile: Dockerfile + container_name: munich-news-backend + restart: unless-stopped + depends_on: + - mongodb + ports: + - "5001:5001" + environment: + - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/ + - FLASK_PORT=5001 + - TZ=Europe/Berlin + volumes: + - ./backend/.env:/app/.env:ro + networks: + - munich-news-network + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + # Newsletter Sender - Runs at 7 AM Berlin time + sender: + build: + context: . + dockerfile: news_sender/Dockerfile + container_name: munich-news-sender + restart: unless-stopped + depends_on: + - mongodb + - backend + - crawler + environment: + - MONGODB_URI=mongodb://${MONGO_USERNAME:-admin}:${MONGO_PASSWORD:-changeme}@mongodb:27017/ + - TZ=Europe/Berlin + volumes: + - ./backend/.env:/app/.env:ro + - ./backend/services:/app/backend/services:ro + - ./news_sender:/app:ro + networks: + - munich-news-network + healthcheck: + test: ["CMD", "python", "-c", "import sys; sys.exit(0)"] + interval: 1m + timeout: 10s + retries: 3 volumes: mongodb_data: @@ -29,4 +111,3 @@ volumes: networks: munich-news-network: driver: bridge - diff --git a/docs/API.md b/docs/API.md new file mode 100644 index 0000000..0919bb3 --- /dev/null +++ b/docs/API.md @@ -0,0 +1,223 @@ +# API Reference + +## Tracking Endpoints + +### Track Email Open + +```http +GET /api/track/pixel/ +``` + +Returns a 1x1 transparent PNG and logs the email open event. + +**Response**: Image (image/png) + +### Track Link Click + +```http +GET /api/track/click/ +``` + +Logs the click event and redirects to the original article URL. + +**Response**: 302 Redirect + +## Analytics Endpoints + +### Get Newsletter Metrics + +```http +GET /api/analytics/newsletter/ +``` + +Returns comprehensive metrics for a specific newsletter. + +**Response**: +```json +{ + "newsletter_id": "2024-01-15", + "total_sent": 100, + "total_opened": 75, + "open_rate": 75.0, + "unique_openers": 70, + "total_clicks": 45, + "unique_clickers": 30, + "click_through_rate": 30.0 +} +``` + +### Get Article Performance + +```http +GET /api/analytics/article/ +``` + +Returns performance metrics for a specific article. + +**Response**: +```json +{ + "article_url": "https://example.com/article", + "total_sent": 100, + "total_clicks": 25, + "click_rate": 25.0, + "unique_clickers": 20, + "newsletters": ["2024-01-15", "2024-01-16"] +} +``` + +### Get Subscriber Activity + +```http +GET /api/analytics/subscriber/ +``` + +Returns activity status and engagement metrics for a subscriber. + +**Response**: +```json +{ + "email": "user@example.com", + "status": "active", + "last_opened_at": "2024-01-15T10:30:00", + "last_clicked_at": "2024-01-15T10:35:00", + "total_opens": 45, + "total_clicks": 20, + "newsletters_received": 50, + "newsletters_opened": 45 +} +``` + +## Privacy Endpoints + +### Delete Subscriber Data + +```http +DELETE /api/tracking/subscriber/ +``` + +Deletes all tracking data for a subscriber (GDPR compliance). + +**Response**: +```json +{ + "success": true, + "message": "All tracking data deleted for user@example.com", + "deleted_counts": { + "newsletter_sends": 50, + "link_clicks": 25, + "subscriber_activity": 1 + } +} +``` + +### Anonymize Old Data + +```http +POST /api/tracking/anonymize +``` + +Anonymizes tracking data older than the retention period. + +**Request Body** (optional): +```json +{ + "retention_days": 90 +} +``` + +**Response**: +```json +{ + "success": true, + "message": "Anonymized tracking data older than 90 days", + "anonymized_counts": { + "newsletter_sends": 1250, + "link_clicks": 650 + } +} +``` + +### Opt Out of Tracking + +```http +POST /api/tracking/subscriber//opt-out +``` + +Disables tracking for a subscriber. + +**Response**: +```json +{ + "success": true, + "message": "Subscriber user@example.com has opted out of tracking" +} +``` + +### Opt In to Tracking + +```http +POST /api/tracking/subscriber//opt-in +``` + +Re-enables tracking for a subscriber. + +**Response**: +```json +{ + "success": true, + "message": "Subscriber user@example.com has opted in to tracking" +} +``` + +## Examples + +### Using curl + +```bash +# Get newsletter metrics +curl http://localhost:5001/api/analytics/newsletter/2024-01-15 + +# Delete subscriber data +curl -X DELETE http://localhost:5001/api/tracking/subscriber/user@example.com + +# Anonymize old data +curl -X POST http://localhost:5001/api/tracking/anonymize \ + -H "Content-Type: application/json" \ + -d '{"retention_days": 90}' + +# Opt out of tracking +curl -X POST http://localhost:5001/api/tracking/subscriber/user@example.com/opt-out +``` + +### Using Python + +```python +import requests + +# Get newsletter metrics +response = requests.get('http://localhost:5001/api/analytics/newsletter/2024-01-15') +metrics = response.json() +print(f"Open rate: {metrics['open_rate']}%") + +# Delete subscriber data +response = requests.delete('http://localhost:5001/api/tracking/subscriber/user@example.com') +result = response.json() +print(result['message']) +``` + +## Error Responses + +All endpoints return standard error responses: + +```json +{ + "success": false, + "error": "Error message here" +} +``` + +HTTP Status Codes: +- `200` - Success +- `404` - Not found +- `500` - Server error diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..c715015 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,131 @@ +# System Architecture + +## Overview + +Munich News Daily is a fully automated news aggregation and newsletter system with the following components: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Munich News Daily System β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + +6:00 AM Berlin β†’ News Crawler + ↓ + Fetches RSS feeds + Extracts full content + Generates AI summaries + Saves to MongoDB + ↓ +7:00 AM Berlin β†’ Newsletter Sender + ↓ + Waits for crawler + Fetches articles + Generates newsletter + Sends to subscribers + ↓ + βœ… Done! +``` + +## Components + +### 1. MongoDB Database +- **Purpose**: Central data storage +- **Collections**: + - `articles`: News articles with summaries + - `subscribers`: Email subscribers + - `rss_feeds`: RSS feed sources + - `newsletter_sends`: Email tracking data + - `link_clicks`: Link click tracking + - `subscriber_activity`: Engagement metrics + +### 2. News Crawler +- **Schedule**: Daily at 6:00 AM Berlin time +- **Functions**: + - Fetches articles from RSS feeds + - Extracts full article content + - Generates AI summaries using Ollama + - Saves to MongoDB +- **Technology**: Python, BeautifulSoup, Ollama + +### 3. Newsletter Sender +- **Schedule**: Daily at 7:00 AM Berlin time +- **Functions**: + - Waits for crawler to finish (max 30 min) + - Fetches today's articles + - Generates HTML newsletter + - Injects tracking pixels + - Sends to all subscribers +- **Technology**: Python, Jinja2, SMTP + +### 4. Backend API (Optional) +- **Purpose**: Tracking and analytics +- **Endpoints**: + - `/api/track/pixel/` - Email open tracking + - `/api/track/click/` - Link click tracking + - `/api/analytics/*` - Engagement metrics + - `/api/tracking/*` - Privacy controls +- **Technology**: Flask, Python + +## Data Flow + +``` +RSS Feeds β†’ Crawler β†’ MongoDB β†’ Sender β†’ Subscribers + ↓ + Backend API + ↓ + Analytics +``` + +## Coordination + +The sender waits for the crawler to ensure fresh content: + +1. Sender starts at 7:00 AM +2. Checks for recent articles every 30 seconds +3. Maximum wait time: 30 minutes +4. Proceeds once crawler finishes or timeout + +## Technology Stack + +- **Backend**: Python 3.11 +- **Database**: MongoDB 7.0 +- **AI**: Ollama (Phi3 model) +- **Scheduling**: Python schedule library +- **Email**: SMTP with HTML templates +- **Tracking**: Pixel tracking + redirect URLs +- **Infrastructure**: Docker & Docker Compose + +## Deployment + +All components run in Docker containers: + +``` +docker-compose up -d +``` + +Containers: +- `munich-news-mongodb` - Database +- `munich-news-crawler` - Crawler service +- `munich-news-sender` - Sender service + +## Security + +- MongoDB authentication enabled +- Environment variables for secrets +- HTTPS for tracking URLs (production) +- GDPR-compliant data retention +- Privacy controls (opt-out, deletion) + +## Monitoring + +- Docker logs for all services +- MongoDB for data verification +- Health checks on containers +- Engagement metrics via API + +## Scalability + +- Horizontal: Add more crawler instances +- Vertical: Increase container resources +- Database: MongoDB sharding if needed +- Caching: Redis for API responses (future) diff --git a/backend/STRUCTURE.md b/docs/BACKEND_STRUCTURE.md similarity index 79% rename from backend/STRUCTURE.md rename to docs/BACKEND_STRUCTURE.md index 17dbd11..4cab09b 100644 --- a/backend/STRUCTURE.md +++ b/docs/BACKEND_STRUCTURE.md @@ -17,13 +17,17 @@ backend/ β”‚ β”œβ”€β”€ subscription_routes.py # /api/subscribe, /api/unsubscribe β”‚ β”œβ”€β”€ news_routes.py # /api/news, /api/stats β”‚ β”œβ”€β”€ rss_routes.py # /api/rss-feeds (CRUD operations) -β”‚ └── ollama_routes.py # /api/ollama/* (AI features) +β”‚ β”œβ”€β”€ ollama_routes.py # /api/ollama/* (AI features) +β”‚ β”œβ”€β”€ tracking_routes.py # /api/track/* (email tracking) +β”‚ └── analytics_routes.py # /api/analytics/* (engagement metrics) β”‚ └── services/ # Business logic layer β”œβ”€β”€ __init__.py β”œβ”€β”€ news_service.py # News fetching and storage logic β”œβ”€β”€ email_service.py # Newsletter email sending - └── ollama_service.py # Ollama AI integration + β”œβ”€β”€ ollama_service.py # Ollama AI integration + β”œβ”€β”€ tracking_service.py # Email tracking (opens/clicks) + └── analytics_service.py # Engagement analytics ``` ## Key Components @@ -49,12 +53,16 @@ Each route file is a Flask Blueprint handling specific API endpoints: - **news_routes.py**: News fetching and statistics - **rss_routes.py**: RSS feed management (add/remove/list/toggle) - **ollama_routes.py**: AI/Ollama integration endpoints +- **tracking_routes.py**: Email tracking (pixel, click redirects, data deletion) +- **analytics_routes.py**: Engagement analytics (open rates, click rates, subscriber activity) ### services/ Business logic separated from route handlers: - **news_service.py**: Fetches news from RSS feeds, saves to database - **email_service.py**: Sends newsletter emails to subscribers - **ollama_service.py**: Communicates with Ollama AI server +- **tracking_service.py**: Email tracking logic (tracking IDs, pixel generation, click logging) +- **analytics_service.py**: Analytics calculations (open rates, click rates, activity classification) ## Benefits of This Structure diff --git a/CHANGELOG.md b/docs/CHANGELOG.md similarity index 100% rename from CHANGELOG.md rename to docs/CHANGELOG.md diff --git a/news_crawler/HOW_IT_WORKS.md b/docs/CRAWLER_HOW_IT_WORKS.md similarity index 100% rename from news_crawler/HOW_IT_WORKS.md rename to docs/CRAWLER_HOW_IT_WORKS.md diff --git a/backend/DATABASE_SCHEMA.md b/docs/DATABASE_SCHEMA.md similarity index 54% rename from backend/DATABASE_SCHEMA.md rename to docs/DATABASE_SCHEMA.md index 6af803c..9490425 100644 --- a/backend/DATABASE_SCHEMA.md +++ b/docs/DATABASE_SCHEMA.md @@ -78,6 +78,134 @@ Stores all newsletter subscribers. } ``` +### 3. Newsletter Sends Collection (`newsletter_sends`) + +Tracks each newsletter sent to each subscriber for email open tracking. + +**Document Structure:** +```javascript +{ + _id: ObjectId, // Auto-generated MongoDB ID + newsletter_id: String, // Unique ID for this newsletter batch (date-based) + subscriber_email: String, // Recipient email + tracking_id: String, // Unique tracking ID for this send (UUID) + sent_at: DateTime, // When email was sent (UTC) + opened: Boolean, // Whether email was opened + first_opened_at: DateTime, // First open timestamp (null if not opened) + last_opened_at: DateTime, // Most recent open timestamp + open_count: Number, // Number of times opened + created_at: DateTime // Record creation time (UTC) +} +``` + +**Indexes:** +- `tracking_id` - Unique index for fast pixel request lookups +- `newsletter_id` - Index for analytics queries +- `subscriber_email` - Index for user activity queries +- `sent_at` - Index for time-based queries + +**Example Document:** +```javascript +{ + _id: ObjectId("507f1f77bcf86cd799439013"), + newsletter_id: "2024-01-15", + subscriber_email: "user@example.com", + tracking_id: "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + sent_at: ISODate("2024-01-15T08:00:00.000Z"), + opened: true, + first_opened_at: ISODate("2024-01-15T09:30:00.000Z"), + last_opened_at: ISODate("2024-01-15T14:20:00.000Z"), + open_count: 3, + created_at: ISODate("2024-01-15T08:00:00.000Z") +} +``` + +### 4. Link Clicks Collection (`link_clicks`) + +Tracks individual link clicks from newsletters. + +**Document Structure:** +```javascript +{ + _id: ObjectId, // Auto-generated MongoDB ID + tracking_id: String, // Unique tracking ID for this link (UUID) + newsletter_id: String, // Which newsletter this link was in + subscriber_email: String, // Who clicked + article_url: String, // Original article URL + article_title: String, // Article title for reporting + clicked_at: DateTime, // When link was clicked (UTC) + user_agent: String, // Browser/client info + created_at: DateTime // Record creation time (UTC) +} +``` + +**Indexes:** +- `tracking_id` - Unique index for fast redirect request lookups +- `newsletter_id` - Index for analytics queries +- `article_url` - Index for article performance queries +- `subscriber_email` - Index for user activity queries + +**Example Document:** +```javascript +{ + _id: ObjectId("507f1f77bcf86cd799439014"), + tracking_id: "b2c3d4e5-f6a7-8901-bcde-f12345678901", + newsletter_id: "2024-01-15", + subscriber_email: "user@example.com", + article_url: "https://www.sueddeutsche.de/muenchen/ubahn-1.123456", + article_title: "New U-Bahn Line Opens in Munich", + clicked_at: ISODate("2024-01-15T09:35:00.000Z"), + user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + created_at: ISODate("2024-01-15T09:35:00.000Z") +} +``` + +### 5. Subscriber Activity Collection (`subscriber_activity`) + +Aggregated activity status for each subscriber. + +**Document Structure:** +```javascript +{ + _id: ObjectId, // Auto-generated MongoDB ID + email: String, // Subscriber email (unique) + status: String, // 'active', 'inactive', or 'dormant' + last_opened_at: DateTime, // Most recent email open (UTC) + last_clicked_at: DateTime, // Most recent link click (UTC) + total_opens: Number, // Lifetime open count + total_clicks: Number, // Lifetime click count + newsletters_received: Number, // Total newsletters sent + newsletters_opened: Number, // Total newsletters opened + updated_at: DateTime // Last status update (UTC) +} +``` + +**Indexes:** +- `email` - Unique index for fast lookups +- `status` - Index for filtering by activity level +- `last_opened_at` - Index for time-based queries + +**Activity Status Classification:** +- **active**: Opened an email in the last 30 days +- **inactive**: No opens in 30-60 days +- **dormant**: No opens in 60+ days + +**Example Document:** +```javascript +{ + _id: ObjectId("507f1f77bcf86cd799439015"), + email: "user@example.com", + status: "active", + last_opened_at: ISODate("2024-01-15T09:30:00.000Z"), + last_clicked_at: ISODate("2024-01-15T09:35:00.000Z"), + total_opens: 45, + total_clicks: 23, + newsletters_received: 60, + newsletters_opened: 45, + updated_at: ISODate("2024-01-15T10:00:00.000Z") +} +``` + ## Design Decisions ### Why MongoDB? diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md new file mode 100644 index 0000000..5e6160d --- /dev/null +++ b/docs/DEPLOYMENT.md @@ -0,0 +1,274 @@ +# Deployment Guide + +## Quick Start + +```bash +# 1. Clone repository +git clone +cd munich-news + +# 2. Configure environment +cp backend/.env.example backend/.env +# Edit backend/.env with your settings + +# 3. Start system +docker-compose up -d + +# 4. View logs +docker-compose logs -f +``` + +## Environment Configuration + +### Required Settings + +Edit `backend/.env`: + +```env +# Email (Required) +SMTP_SERVER=smtp.gmail.com +SMTP_PORT=587 +EMAIL_USER=your-email@gmail.com +EMAIL_PASSWORD=your-app-password + +# MongoDB (Optional - defaults provided) +MONGODB_URI=mongodb://localhost:27017/ + +# Tracking (Optional) +TRACKING_ENABLED=true +TRACKING_API_URL=http://localhost:5001 +``` + +### Optional Settings + +```env +# Newsletter +NEWSLETTER_MAX_ARTICLES=10 +NEWSLETTER_HOURS_LOOKBACK=24 + +# Ollama AI +OLLAMA_ENABLED=true +OLLAMA_BASE_URL=http://127.0.0.1:11434 +OLLAMA_MODEL=phi3:latest + +# Tracking +TRACKING_DATA_RETENTION_DAYS=90 +``` + +## Production Deployment + +### 1. Set MongoDB Password + +```bash +export MONGO_PASSWORD=your-secure-password +docker-compose up -d +``` + +### 2. Use HTTPS for Tracking + +Update `backend/.env`: +```env +TRACKING_API_URL=https://yourdomain.com +``` + +### 3. Configure Log Rotation + +Add to `docker-compose.yml`: +```yaml +services: + crawler: + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" +``` + +### 4. Set Up Backups + +```bash +# Daily MongoDB backup +0 3 * * * docker exec munich-news-mongodb mongodump --out=/data/backup/$(date +\%Y\%m\%d) +``` + +### 5. Enable Backend API + +Uncomment backend service in `docker-compose.yml`: +```yaml +backend: + build: + context: ./backend + ports: + - "5001:5001" + # ... rest of config +``` + +## Schedule Configuration + +### Change Crawler Time + +Edit `news_crawler/scheduled_crawler.py`: +```python +schedule.every().day.at("06:00").do(run_crawler) # Change time +``` + +### Change Sender Time + +Edit `news_sender/scheduled_sender.py`: +```python +schedule.every().day.at("07:00").do(run_sender) # Change time +``` + +Rebuild after changes: +```bash +docker-compose up -d --build +``` + +## Database Setup + +### Add RSS Feeds + +```bash +mongosh munich_news + +db.rss_feeds.insertMany([ + { + name: "SΓΌddeutsche Zeitung MΓΌnchen", + url: "https://www.sueddeutsche.de/muenchen/rss", + active: true + }, + { + name: "Merkur MΓΌnchen", + url: "https://www.merkur.de/lokales/muenchen/rss/feed.rss", + active: true + } +]) +``` + +### Add Subscribers + +```bash +mongosh munich_news + +db.subscribers.insertMany([ + { + email: "user1@example.com", + active: true, + tracking_enabled: true, + subscribed_at: new Date() + }, + { + email: "user2@example.com", + active: true, + tracking_enabled: true, + subscribed_at: new Date() + } +]) +``` + +## Monitoring + +### Check Container Status + +```bash +docker-compose ps +``` + +### View Logs + +```bash +# All services +docker-compose logs -f + +# Specific service +docker-compose logs -f crawler +docker-compose logs -f sender +``` + +### Check Database + +```bash +mongosh munich_news + +// Count articles +db.articles.countDocuments() + +// Count subscribers +db.subscribers.countDocuments({ active: true }) + +// View recent articles +db.articles.find().sort({ crawled_at: -1 }).limit(5) +``` + +## Troubleshooting + +### Containers Won't Start + +```bash +# Check logs +docker-compose logs + +# Rebuild +docker-compose up -d --build + +# Reset everything +docker-compose down -v +docker-compose up -d +``` + +### Crawler Not Finding Articles + +```bash +# Check RSS feeds +mongosh munich_news --eval "db.rss_feeds.find({ active: true })" + +# Test manually +docker-compose exec crawler python crawler_service.py 5 +``` + +### Newsletter Not Sending + +```bash +# Test email +docker-compose exec sender python sender_service.py test your-email@example.com + +# Check SMTP config +docker-compose exec sender python -c "from sender_service import Config; print(Config.SMTP_SERVER)" +``` + +## Maintenance + +### Update System + +```bash +git pull +docker-compose up -d --build +``` + +### Backup Database + +```bash +docker exec munich-news-mongodb mongodump --out=/data/backup +``` + +### Clean Old Data + +```bash +mongosh munich_news + +// Delete articles older than 90 days +db.articles.deleteMany({ + crawled_at: { $lt: new Date(Date.now() - 90*24*60*60*1000) } +}) +``` + +## Security Checklist + +- [ ] Set strong MongoDB password +- [ ] Use HTTPS for tracking URLs +- [ ] Secure SMTP credentials +- [ ] Enable firewall rules +- [ ] Set up log rotation +- [ ] Configure backups +- [ ] Monitor for failures +- [ ] Keep dependencies updated diff --git a/news_crawler/EXTRACTION_STRATEGIES.md b/docs/EXTRACTION_STRATEGIES.md similarity index 100% rename from news_crawler/EXTRACTION_STRATEGIES.md rename to docs/EXTRACTION_STRATEGIES.md diff --git a/ARCHITECTURE.md b/docs/OLD_ARCHITECTURE.md similarity index 100% rename from ARCHITECTURE.md rename to docs/OLD_ARCHITECTURE.md diff --git a/QUICK_REFERENCE.md b/docs/QUICK_REFERENCE.md similarity index 83% rename from QUICK_REFERENCE.md rename to docs/QUICK_REFERENCE.md index 5ad15b4..427bd78 100644 --- a/QUICK_REFERENCE.md +++ b/docs/QUICK_REFERENCE.md @@ -84,6 +84,33 @@ curl http://localhost:5001/api/ollama/ping curl http://localhost:5001/api/ollama/models ``` +### Email Tracking & Analytics + +**Get newsletter metrics:** +```bash +curl http://localhost:5001/api/analytics/newsletter/ +``` + +**Get article performance:** +```bash +curl http://localhost:5001/api/analytics/article/ +``` + +**Get subscriber activity:** +```bash +curl http://localhost:5001/api/analytics/subscriber/ +``` + +**Delete subscriber tracking data:** +```bash +curl -X DELETE http://localhost:5001/api/tracking/subscriber/ +``` + +**Anonymize old tracking data:** +```bash +curl -X POST http://localhost:5001/api/tracking/anonymize +``` + ### Database **Connect to MongoDB:** @@ -110,6 +137,13 @@ db.subscribers.countDocuments({status: "active"}) db.rss_feeds.find() ``` +**Check tracking data:** +```javascript +db.newsletter_sends.find().limit(5) +db.link_clicks.find().limit(5) +db.subscriber_activity.find() +``` + ## File Locations ### Configuration @@ -186,6 +220,9 @@ EMAIL_PASSWORD=your-app-password OLLAMA_BASE_URL=http://127.0.0.1:11434 OLLAMA_MODEL=phi3:latest OLLAMA_ENABLED=true +TRACKING_ENABLED=true +TRACKING_API_URL=http://localhost:5001 +TRACKING_DATA_RETENTION_DAYS=90 ``` ## Development Workflow diff --git a/news_crawler/RSS_URL_EXTRACTION.md b/docs/RSS_URL_EXTRACTION.md similarity index 100% rename from news_crawler/RSS_URL_EXTRACTION.md rename to docs/RSS_URL_EXTRACTION.md diff --git a/docs/SYSTEM_ARCHITECTURE.md b/docs/SYSTEM_ARCHITECTURE.md new file mode 100644 index 0000000..cf7b4da --- /dev/null +++ b/docs/SYSTEM_ARCHITECTURE.md @@ -0,0 +1,412 @@ +# Munich News Daily - System Architecture + +## πŸ“Š Complete System Overview + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Munich News Daily System β”‚ +β”‚ Fully Automated Pipeline β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + + Daily Schedule + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ 6:00 AM Berlin β”‚ + β”‚ News Crawler β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ News Crawler β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”β”‚ +β”‚ β”‚ Fetch RSS β”‚β†’ β”‚ Extract β”‚β†’ β”‚ Summarize β”‚β†’ β”‚ Save to β”‚β”‚ +β”‚ β”‚ Feeds β”‚ β”‚ Content β”‚ β”‚ with AI β”‚ β”‚ MongoDB β”‚β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜β”‚ +β”‚ β”‚ +β”‚ Sources: SΓΌddeutsche, Merkur, BR24, etc. β”‚ +β”‚ Output: Full articles + AI summaries β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”‚ Articles saved + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ MongoDB β”‚ + β”‚ (Data Storage) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”‚ Wait for crawler + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ 7:00 AM Berlin β”‚ + β”‚ Newsletter Sender β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Newsletter Sender β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”β”‚ +β”‚ β”‚ Wait for β”‚β†’ β”‚ Fetch β”‚β†’ β”‚ Generate β”‚β†’ β”‚ Send to β”‚β”‚ +β”‚ β”‚ Crawler β”‚ β”‚ Articles β”‚ β”‚ Newsletter β”‚ β”‚ Subscribersβ”‚β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜β”‚ +β”‚ β”‚ +β”‚ Features: Tracking pixels, link tracking, HTML templates β”‚ +β”‚ Output: Personalized newsletters with engagement tracking β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”‚ Emails sent + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Subscribers β”‚ + β”‚ (Email Inboxes) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”‚ Opens & clicks + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Tracking System β”‚ + β”‚ (Analytics API) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## πŸ”„ Data Flow + +### 1. Content Acquisition (6:00 AM) + +``` +RSS Feeds β†’ Crawler β†’ Full Content β†’ AI Summary β†’ MongoDB +``` + +**Details**: +- Fetches from multiple RSS sources +- Extracts full article text +- Generates concise summaries using Ollama +- Stores with metadata (author, date, source) + +### 2. Newsletter Generation (7:00 AM) + +``` +MongoDB β†’ Articles β†’ Template β†’ HTML β†’ Email +``` + +**Details**: +- Waits for crawler to finish (max 30 min) +- Fetches today's articles with summaries +- Applies Jinja2 template +- Injects tracking pixels +- Replaces links with tracking URLs + +### 3. Engagement Tracking (Ongoing) + +``` +Email Open β†’ Pixel Load β†’ Log Event β†’ Analytics +Link Click β†’ Redirect β†’ Log Event β†’ Analytics +``` + +**Details**: +- Tracks email opens via 1x1 pixel +- Tracks link clicks via redirect URLs +- Stores engagement data in MongoDB +- Provides analytics API + +## πŸ—οΈ Component Architecture + +### Docker Containers + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Docker Network β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ MongoDB β”‚ β”‚ Crawler β”‚ β”‚ Sender β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Port: 27017 │←─│ Schedule: │←─│ Schedule: β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ 6:00 AM β”‚ β”‚ 7:00 AM β”‚ β”‚ +β”‚ β”‚ Storage: β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ - articles β”‚ β”‚ Depends on: β”‚ β”‚ Depends on: β”‚ β”‚ +β”‚ β”‚ - subscribersβ”‚ β”‚ - MongoDB β”‚ β”‚ - MongoDB β”‚ β”‚ +β”‚ β”‚ - tracking β”‚ β”‚ β”‚ β”‚ - Crawler β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ All containers auto-restart on failure β”‚ +β”‚ All use Europe/Berlin timezone β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### Backend Services + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Backend Services β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Flask API (Port 5001) β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ Tracking β”‚ β”‚ Analytics β”‚ β”‚ Privacy β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Endpoints β”‚ β”‚ Endpoints β”‚ β”‚ Endpoints β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Services Layer β”‚ β”‚ +β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚ +β”‚ β”‚ β”‚ Tracking β”‚ β”‚ Analytics β”‚ β”‚ Ollama β”‚ β”‚ β”‚ +β”‚ β”‚ β”‚ Service β”‚ β”‚ Service β”‚ β”‚ Client β”‚ β”‚ β”‚ +β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## πŸ“… Daily Timeline + +``` +Time (Berlin) β”‚ Event β”‚ Duration +───────────────┼──────────────────────────┼────────── +05:59:59 β”‚ System idle β”‚ - +06:00:00 β”‚ Crawler starts β”‚ ~10-20 min +06:00:01 β”‚ - Fetch RSS feeds β”‚ +06:02:00 β”‚ - Extract content β”‚ +06:05:00 β”‚ - Generate summaries β”‚ +06:15:00 β”‚ - Save to MongoDB β”‚ +06:20:00 β”‚ Crawler finishes β”‚ +06:20:01 β”‚ System idle β”‚ ~40 min +07:00:00 β”‚ Sender starts β”‚ ~5-10 min +07:00:01 β”‚ - Wait for crawler β”‚ (checks every 30s) +07:00:30 β”‚ - Crawler confirmed done β”‚ +07:00:31 β”‚ - Fetch articles β”‚ +07:01:00 β”‚ - Generate newsletters β”‚ +07:02:00 β”‚ - Send to subscribers β”‚ +07:10:00 β”‚ Sender finishes β”‚ +07:10:01 β”‚ System idle β”‚ Until tomorrow +``` + +## πŸ” Security & Privacy + +### Data Protection + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Privacy Features β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Data Retention β”‚ β”‚ +β”‚ β”‚ - Personal data: 90 days β”‚ β”‚ +β”‚ β”‚ - Anonymization: Automatic β”‚ β”‚ +β”‚ β”‚ - Deletion: On request β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ User Rights β”‚ β”‚ +β”‚ β”‚ - Opt-out: Anytime β”‚ β”‚ +β”‚ β”‚ - Data access: API available β”‚ β”‚ +β”‚ β”‚ - Data deletion: Full removal β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Compliance β”‚ β”‚ +β”‚ β”‚ - GDPR compliant β”‚ β”‚ +β”‚ β”‚ - Privacy notice in emails β”‚ β”‚ +β”‚ β”‚ - Transparent tracking β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## πŸ“Š Database Schema + +### Collections + +``` +MongoDB (munich_news) +β”‚ +β”œβ”€β”€ articles +β”‚ β”œβ”€β”€ title +β”‚ β”œβ”€β”€ author +β”‚ β”œβ”€β”€ content (full text) +β”‚ β”œβ”€β”€ summary (AI generated) +β”‚ β”œβ”€β”€ link +β”‚ β”œβ”€β”€ source +β”‚ β”œβ”€β”€ published_at +β”‚ └── crawled_at +β”‚ +β”œβ”€β”€ subscribers +β”‚ β”œβ”€β”€ email +β”‚ β”œβ”€β”€ active +β”‚ β”œβ”€β”€ tracking_enabled +β”‚ └── subscribed_at +β”‚ +β”œβ”€β”€ rss_feeds +β”‚ β”œβ”€β”€ name +β”‚ β”œβ”€β”€ url +β”‚ └── active +β”‚ +β”œβ”€β”€ newsletter_sends +β”‚ β”œβ”€β”€ tracking_id +β”‚ β”œβ”€β”€ newsletter_id +β”‚ β”œβ”€β”€ subscriber_email +β”‚ β”œβ”€β”€ opened +β”‚ β”œβ”€β”€ first_opened_at +β”‚ └── open_count +β”‚ +β”œβ”€β”€ link_clicks +β”‚ β”œβ”€β”€ tracking_id +β”‚ β”œβ”€β”€ newsletter_id +β”‚ β”œβ”€β”€ subscriber_email +β”‚ β”œβ”€β”€ article_url +β”‚ β”œβ”€β”€ clicked +β”‚ └── clicked_at +β”‚ +└── subscriber_activity + β”œβ”€β”€ email + β”œβ”€β”€ status (active/inactive/dormant) + β”œβ”€β”€ last_opened_at + β”œβ”€β”€ last_clicked_at + β”œβ”€β”€ total_opens + └── total_clicks +``` + +## πŸš€ Deployment Architecture + +### Development + +``` +Local Machine +β”œβ”€β”€ Docker Compose +β”‚ β”œβ”€β”€ MongoDB (no auth) +β”‚ β”œβ”€β”€ Crawler +β”‚ └── Sender +β”œβ”€β”€ Backend (manual start) +β”‚ └── Flask API +└── Ollama (optional) + └── AI Summarization +``` + +### Production + +``` +Server +β”œβ”€β”€ Docker Compose (prod) +β”‚ β”œβ”€β”€ MongoDB (with auth) +β”‚ β”œβ”€β”€ Crawler +β”‚ └── Sender +β”œβ”€β”€ Backend (systemd/pm2) +β”‚ └── Flask API (HTTPS) +β”œβ”€β”€ Ollama (optional) +β”‚ └── AI Summarization +└── Nginx (reverse proxy) + └── SSL/TLS +``` + +## πŸ”„ Coordination Mechanism + +### Crawler-Sender Synchronization + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Coordination Flow β”‚ +β”‚ β”‚ +β”‚ 6:00 AM β†’ Crawler starts β”‚ +β”‚ ↓ β”‚ +β”‚ Crawling articles... β”‚ +β”‚ ↓ β”‚ +β”‚ Saves to MongoDB β”‚ +β”‚ ↓ β”‚ +β”‚ 6:20 AM β†’ Crawler finishes β”‚ +β”‚ ↓ β”‚ +β”‚ 7:00 AM β†’ Sender starts β”‚ +β”‚ ↓ β”‚ +β”‚ Check: Recent articles? ──→ No ──┐ β”‚ +β”‚ ↓ Yes β”‚ β”‚ +β”‚ Proceed with send β”‚ β”‚ +β”‚ β”‚ β”‚ +β”‚ ← Wait 30s ← Wait 30s ← Wait 30sβ”˜ β”‚ +β”‚ (max 30 minutes) β”‚ +β”‚ β”‚ +β”‚ 7:10 AM β†’ Newsletter sent β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## πŸ“ˆ Monitoring & Observability + +### Key Metrics + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Metrics to Monitor β”‚ +β”‚ β”‚ +β”‚ Crawler: β”‚ +β”‚ - Articles crawled per day β”‚ +β”‚ - Crawl duration β”‚ +β”‚ - Success/failure rate β”‚ +β”‚ - Summary generation rate β”‚ +β”‚ β”‚ +β”‚ Sender: β”‚ +β”‚ - Newsletters sent per day β”‚ +β”‚ - Send duration β”‚ +β”‚ - Success/failure rate β”‚ +β”‚ - Wait time for crawler β”‚ +β”‚ β”‚ +β”‚ Engagement: β”‚ +β”‚ - Open rate β”‚ +β”‚ - Click-through rate β”‚ +β”‚ - Active subscribers β”‚ +β”‚ - Dormant subscribers β”‚ +β”‚ β”‚ +β”‚ System: β”‚ +β”‚ - Container uptime β”‚ +β”‚ - Database size β”‚ +β”‚ - Error rate β”‚ +β”‚ - Response times β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## πŸ› οΈ Maintenance Tasks + +### Daily +- Check logs for errors +- Verify newsletters sent +- Monitor engagement metrics + +### Weekly +- Review article quality +- Check subscriber growth +- Analyze engagement trends + +### Monthly +- Archive old articles +- Clean up dormant subscribers +- Update dependencies +- Review system performance + +## πŸ“š Technology Stack + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Technology Stack β”‚ +β”‚ β”‚ +β”‚ Backend: β”‚ +β”‚ - Python 3.11 β”‚ +β”‚ - Flask (API) β”‚ +β”‚ - PyMongo (Database) β”‚ +β”‚ - Schedule (Automation) β”‚ +β”‚ - Jinja2 (Templates) β”‚ +β”‚ - BeautifulSoup (Parsing) β”‚ +β”‚ β”‚ +β”‚ Database: β”‚ +β”‚ - MongoDB 7.0 β”‚ +β”‚ β”‚ +β”‚ AI/ML: β”‚ +β”‚ - Ollama (Summarization) β”‚ +β”‚ - Phi3 Model (default) β”‚ +β”‚ β”‚ +β”‚ Infrastructure: β”‚ +β”‚ - Docker & Docker Compose β”‚ +β”‚ - Linux (Ubuntu/Debian) β”‚ +β”‚ β”‚ +β”‚ Email: β”‚ +β”‚ - SMTP (configurable) β”‚ +β”‚ - HTML emails with tracking β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +**Last Updated**: 2024-01-16 +**Version**: 1.0 +**Status**: Production Ready βœ… diff --git a/news_crawler/CHANGES.md b/news_crawler/CHANGES.md deleted file mode 100644 index 0e48270..0000000 --- a/news_crawler/CHANGES.md +++ /dev/null @@ -1,191 +0,0 @@ -# Recent Changes - Full Content Storage - -## βœ… What Changed - -### 1. Removed Content Length Limit -**Before:** -```python -'content': content_text[:10000] # Limited to 10k chars -``` - -**After:** -```python -'content': content_text # Full content, no limit -``` - -### 2. Simplified Database Schema -**Before:** -```javascript -{ - summary: String, // Short summary - full_content: String // Limited content -} -``` - -**After:** -```javascript -{ - content: String // Full article content, no limit -} -``` - -### 3. Enhanced API Response -**Before:** -```javascript -{ - title: "...", - link: "...", - summary: "..." -} -``` - -**After:** -```javascript -{ - title: "...", - author: "...", // NEW! - link: "...", - preview: "...", // First 200 chars - word_count: 1250, // NEW! - has_full_content: true // NEW! -} -``` - -## πŸ“Š Database Structure - -### Articles Collection -```javascript -{ - _id: ObjectId, - title: String, // Article title - author: String, // Article author (extracted) - link: String, // Article URL (unique) - content: String, // FULL article content (no limit) - word_count: Number, // Word count - source: String, // RSS feed name - published_at: String, // Publication date - crawled_at: DateTime, // When crawled - created_at: DateTime // When added -} -``` - -## πŸ†• New API Endpoint - -### GET /api/news/ -Get full article content by URL. - -**Example:** -```bash -# URL encode the article URL -curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle" -``` - -**Response:** -```json -{ - "title": "New U-Bahn Line Opens in Munich", - "author": "Max Mustermann", - "link": "https://example.com/article", - "content": "The full article text here... (complete, no truncation)", - "word_count": 1250, - "source": "SΓΌddeutsche Zeitung MΓΌnchen", - "published_at": "2024-11-10T10:00:00Z", - "crawled_at": "2024-11-10T16:30:00Z", - "created_at": "2024-11-10T16:00:00Z" -} -``` - -## πŸ“ˆ Enhanced Stats - -### GET /api/stats -Now includes crawled article count: - -```json -{ - "subscribers": 150, - "articles": 500, - "crawled_articles": 350 // NEW! -} -``` - -## 🎯 Benefits - -1. **Complete Content** - No truncation, full articles stored -2. **Better for AI** - Full context for summarization/analysis -3. **Cleaner Schema** - Single `content` field instead of `summary` + `full_content` -4. **More Metadata** - Author, word count, crawl timestamp -5. **Better API** - Preview in list, full content on demand - -## πŸ”„ Migration - -If you have existing articles with `full_content` field, they will continue to work. New articles will use the `content` field. - -To migrate old articles: -```javascript -// MongoDB shell -db.articles.updateMany( - { full_content: { $exists: true } }, - [ - { - $set: { - content: "$full_content" - } - }, - { - $unset: ["full_content", "summary"] - } - ] -) -``` - -## πŸš€ Usage - -### Crawl Articles -```bash -cd news_crawler -python crawler_service.py 10 -``` - -### Get Article List (with previews) -```bash -curl http://localhost:5001/api/news -``` - -### Get Full Article Content -```bash -# Get the article URL from the list, then: -curl "http://localhost:5001/api/news/" -``` - -### Check Stats -```bash -curl http://localhost:5001/api/stats -``` - -## πŸ“ Example Workflow - -1. **Add RSS Feed** -```bash -curl -X POST http://localhost:5001/api/rss-feeds \ - -H "Content-Type: application/json" \ - -d '{"name": "News Source", "url": "https://example.com/rss"}' -``` - -2. **Crawl Articles** -```bash -cd news_crawler -python crawler_service.py 10 -``` - -3. **View Articles** -```bash -curl http://localhost:5001/api/news -``` - -4. **Get Full Content** -```bash -# Copy article link from above, URL encode it -curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle" -``` - -Now you have complete article content ready for AI processing! πŸŽ‰ diff --git a/news_crawler/Dockerfile b/news_crawler/Dockerfile index 4d14a0a..46932d5 100644 --- a/news_crawler/Dockerfile +++ b/news_crawler/Dockerfile @@ -6,8 +6,20 @@ WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -# Copy crawler service -COPY crawler_service.py . +# Copy crawler files +COPY . . -# Run crawler -CMD ["python", "crawler_service.py"] +# Copy backend config files (needed for Config class) +COPY ../backend/config.py /app/config.py +COPY ../backend/ollama_client.py /app/ollama_client.py +COPY ../backend/.env /app/.env + +# Make the scheduler executable +RUN chmod +x scheduled_crawler.py + +# Set timezone to Berlin +ENV TZ=Europe/Berlin +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +# Run the scheduled crawler +CMD ["python", "-u", "scheduled_crawler.py"] diff --git a/news_crawler/QUICKSTART.md b/news_crawler/QUICKSTART.md deleted file mode 100644 index b59694f..0000000 --- a/news_crawler/QUICKSTART.md +++ /dev/null @@ -1,127 +0,0 @@ -# News Crawler - Quick Start - -## 1. Install Dependencies - -```bash -cd news_crawler -pip install -r requirements.txt -``` - -## 2. Configure Environment - -Make sure MongoDB is running and accessible. The crawler will use the same database as the backend. - -Default connection: `mongodb://localhost:27017/` - -To use a different MongoDB URI, create a `.env` file: -```env -MONGODB_URI=mongodb://localhost:27017/ -``` - -## 3. Run the Crawler - -```bash -# Crawl up to 10 articles per feed -python crawler_service.py - -# Crawl up to 20 articles per feed -python crawler_service.py 20 -``` - -## 4. Verify Results - -Check your MongoDB database: - -```bash -# Using mongosh -mongosh -use munich_news -db.articles.find({full_content: {$exists: true}}).count() -db.articles.findOne({full_content: {$exists: true}}) -``` - -## 5. Schedule Regular Crawling - -### Option A: Cron (Linux/Mac) - -```bash -# Edit crontab -crontab -e - -# Add this line to run every 6 hours -0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py -``` - -### Option B: Docker - -```bash -# Build and run -docker-compose up - -# Or run as a one-off -docker-compose run --rm crawler -``` - -### Option C: Manual - -Just run the script whenever you want to fetch new articles: - -```bash -python crawler_service.py -``` - -## What Gets Crawled? - -The crawler: -1. Fetches all active RSS feeds from the database -2. For each feed, gets the latest articles -3. Crawls the full content from each article URL -4. Saves: title, full_content, word_count, crawled_at -5. Skips articles that already have content - -## Output Example - -``` -============================================================ -πŸš€ Starting RSS Feed Crawler -============================================================ -Found 3 active feed(s) - -πŸ“° Crawling feed: SΓΌddeutsche Zeitung MΓΌnchen - URL: https://www.sueddeutsche.de/muenchen/rss - πŸ” Crawling: New U-Bahn Line Opens in Munich... - βœ“ Saved (1250 words) - πŸ” Crawling: Munich Weather Update... - βœ“ Saved (450 words) - βœ“ Crawled 2 articles from SΓΌddeutsche Zeitung MΓΌnchen - -============================================================ -βœ“ Crawling Complete! - Total feeds processed: 3 - Total articles crawled: 15 - Duration: 45.23 seconds -============================================================ -``` - -## Troubleshooting - -**No feeds found:** -- Make sure you've added RSS feeds via the backend API -- Check MongoDB connection - -**Can't extract content:** -- Some sites block scrapers -- Some sites require JavaScript (not supported yet) -- Check if the URL is accessible - -**Timeout errors:** -- Increase timeout in the code -- Check your internet connection - -## Next Steps - -Once articles are crawled, you can: -- View them in the frontend -- Use Ollama to summarize them -- Generate newsletters with full content -- Perform text analysis diff --git a/news_crawler/README.md b/news_crawler/README.md deleted file mode 100644 index 0feb37c..0000000 --- a/news_crawler/README.md +++ /dev/null @@ -1,225 +0,0 @@ -# News Crawler Microservice - -A standalone microservice that crawls full article content from RSS feeds and stores it in MongoDB. - -## Features - -- πŸ” Extracts full article content from RSS feed links -- πŸ“Š Calculates word count -- πŸ”„ Avoids re-crawling already processed articles -- ⏱️ Rate limiting (1 second delay between requests) -- 🎯 Smart content extraction using multiple selectors -- 🧹 Cleans up scripts, styles, and navigation elements - -## Installation - -1. Create a virtual environment: -```bash -python -m venv venv -source venv/bin/activate # On Windows: venv\Scripts\activate -``` - -2. Install dependencies: -```bash -pip install -r requirements.txt -``` - -3. Configure environment variables: -Create a `.env` file in the project root (or use the backend's `.env`): -```env -MONGODB_URI=mongodb://localhost:27017/ -``` - -## Usage - -### Standalone Execution - -Run the crawler directly: - -```bash -# Crawl up to 10 articles per feed (default) -python crawler_service.py - -# Crawl up to 20 articles per feed -python crawler_service.py 20 -``` - -### As a Module - -```python -from crawler_service import crawl_all_feeds, crawl_rss_feed - -# Crawl all active feeds -result = crawl_all_feeds(max_articles_per_feed=10) -print(result) - -# Crawl a specific feed -crawl_rss_feed( - feed_url='https://example.com/rss', - feed_name='Example News', - max_articles=10 -) -``` - -### Via Backend API - -The backend has integrated endpoints: - -```bash -# Start crawler -curl -X POST http://localhost:5001/api/crawler/start - -# Check status -curl http://localhost:5001/api/crawler/status - -# Crawl specific feed -curl -X POST http://localhost:5001/api/crawler/feed/ -``` - -## How It Works - -1. **Fetch RSS Feeds**: Gets all active RSS feeds from MongoDB -2. **Parse Feed**: Extracts article links from each feed -3. **Crawl Content**: For each article: - - Fetches HTML page - - Removes scripts, styles, navigation - - Extracts main content using smart selectors - - Calculates word count -4. **Store Data**: Saves to MongoDB with metadata -5. **Skip Duplicates**: Avoids re-crawling articles with existing content - -## Content Extraction Strategy - -The crawler tries multiple selectors in order: - -1. `
` tag -2. Elements with class containing "article-content", "article-body" -3. Elements with class containing "post-content", "entry-content" -4. `
` tag -5. Fallback to all `

` tags in body - -## Database Schema - -Articles are stored with these fields: - -```javascript -{ - title: String, // Article title - link: String, // Article URL (unique) - summary: String, // Short summary - full_content: String, // Full article text (max 10,000 chars) - word_count: Number, // Number of words - source: String, // RSS feed name - published_at: String, // Publication date - crawled_at: DateTime, // When content was crawled - created_at: DateTime // When added to database -} -``` - -## Scheduling - -### Using Cron (Linux/Mac) - -```bash -# Run every 6 hours -0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py -``` - -### Using systemd Timer (Linux) - -Create `/etc/systemd/system/news-crawler.service`: -```ini -[Unit] -Description=News Crawler Service - -[Service] -Type=oneshot -WorkingDirectory=/path/to/news_crawler -ExecStart=/path/to/venv/bin/python crawler_service.py -User=your-user -``` - -Create `/etc/systemd/system/news-crawler.timer`: -```ini -[Unit] -Description=Run News Crawler every 6 hours - -[Timer] -OnBootSec=5min -OnUnitActiveSec=6h - -[Install] -WantedBy=timers.target -``` - -Enable and start: -```bash -sudo systemctl enable news-crawler.timer -sudo systemctl start news-crawler.timer -``` - -### Using Docker - -Create `Dockerfile`: -```dockerfile -FROM python:3.11-slim - -WORKDIR /app - -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -COPY crawler_service.py . - -CMD ["python", "crawler_service.py"] -``` - -Build and run: -```bash -docker build -t news-crawler . -docker run --env-file ../.env news-crawler -``` - -## Configuration - -Environment variables: - -- `MONGODB_URI` - MongoDB connection string (default: `mongodb://localhost:27017/`) - -## Rate Limiting - -- 1 second delay between article requests -- Respects server resources -- User-Agent header included - -## Troubleshooting - -**Issue: Can't extract content** -- Some sites block scrapers -- Try adjusting User-Agent header -- Some sites require JavaScript (consider Selenium) - -**Issue: Timeout errors** -- Increase timeout in `extract_article_content()` -- Check network connectivity - -**Issue: Memory usage** -- Reduce `max_articles_per_feed` -- Content limited to 10,000 characters per article - -## Architecture - -This is a standalone microservice that: -- Can run independently of the main backend -- Shares the same MongoDB database -- Can be deployed separately -- Can be scheduled independently - -## Next Steps - -Once articles are crawled, you can: -- Use Ollama to summarize articles -- Perform sentiment analysis -- Extract keywords and topics -- Generate newsletter content -- Create article recommendations diff --git a/news_crawler/docker-compose.yml b/news_crawler/docker-compose.yml deleted file mode 100644 index 026a95d..0000000 --- a/news_crawler/docker-compose.yml +++ /dev/null @@ -1,33 +0,0 @@ -version: '3.8' - -services: - crawler: - build: . - container_name: news-crawler - environment: - - MONGODB_URI=mongodb://mongodb:27017/ - networks: - - munich-news-network - depends_on: - - mongodb - # Run once and exit - restart: "no" - - mongodb: - image: mongo:7.0 - container_name: munich-news-mongodb - restart: unless-stopped - ports: - - "27017:27017" - volumes: - - mongodb_data:/data/db - networks: - - munich-news-network - -volumes: - mongodb_data: - driver: local - -networks: - munich-news-network: - driver: bridge diff --git a/news_crawler/requirements.txt b/news_crawler/requirements.txt index 07b84ea..19b585e 100644 --- a/news_crawler/requirements.txt +++ b/news_crawler/requirements.txt @@ -4,3 +4,5 @@ requests==2.31.0 feedparser==6.0.10 pymongo==4.6.1 python-dotenv==1.0.0 +schedule==1.2.0 +pytz==2023.3 diff --git a/news_crawler/scheduled_crawler.py b/news_crawler/scheduled_crawler.py new file mode 100755 index 0000000..73b04db --- /dev/null +++ b/news_crawler/scheduled_crawler.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +""" +Scheduled crawler that runs daily at 6 AM Berlin time +""" +import schedule +import time +from datetime import datetime +import pytz +from crawler_service import crawl_all_feeds + +# Berlin timezone +BERLIN_TZ = pytz.timezone('Europe/Berlin') + +def run_crawler(): + """Run the crawler and log the execution""" + berlin_time = datetime.now(BERLIN_TZ) + print(f"\n{'='*60}") + print(f"πŸ• Scheduled crawler started at {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}") + print(f"{'='*60}\n") + + try: + # Run crawler with max 20 articles per feed + result = crawl_all_feeds(max_articles_per_feed=20) + + print(f"\n{'='*60}") + print(f"βœ“ Scheduled crawler completed successfully") + print(f" Articles crawled: {result['total_articles_crawled']}") + print(f" Duration: {result['duration_seconds']}s") + print(f"{'='*60}\n") + + except Exception as e: + print(f"\n{'='*60}") + print(f"βœ— Scheduled crawler failed: {e}") + print(f"{'='*60}\n") + +def main(): + """Main scheduler loop""" + print("πŸ€– Munich News Crawler Scheduler") + print("="*60) + print("Schedule: Daily at 6:00 AM Berlin time") + print("Timezone: Europe/Berlin (CET/CEST)") + print("="*60) + + # Schedule the crawler to run at 6 AM Berlin time + schedule.every().day.at("06:00").do(run_crawler) + + # Show next run time + berlin_time = datetime.now(BERLIN_TZ) + print(f"\nCurrent time (Berlin): {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}") + + # Get next scheduled run + next_run = schedule.next_run() + if next_run: + # Convert to Berlin time for display + next_run_berlin = next_run.astimezone(BERLIN_TZ) + print(f"Next scheduled run: {next_run_berlin.strftime('%Y-%m-%d %H:%M:%S %Z')}") + + print("\n⏳ Scheduler is running... (Press Ctrl+C to stop)\n") + + # Run immediately on startup (optional - comment out if you don't want this) + print("πŸš€ Running initial crawl on startup...") + run_crawler() + + # Keep the scheduler running + while True: + schedule.run_pending() + time.sleep(60) # Check every minute + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + print("\n\nπŸ‘‹ Scheduler stopped by user") + except Exception as e: + print(f"\n\nβœ— Scheduler error: {e}") diff --git a/news_sender/Dockerfile b/news_sender/Dockerfile new file mode 100644 index 0000000..5e8adb9 --- /dev/null +++ b/news_sender/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy sender files +COPY . . + +# Copy backend files (needed for tracking and config) +COPY ../backend/services /app/backend/services +COPY ../backend/.env /app/.env + +# Make the scheduler executable +RUN chmod +x scheduled_sender.py + +# Set timezone to Berlin +ENV TZ=Europe/Berlin +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +# Run the scheduled sender +CMD ["python", "-u", "scheduled_sender.py"] diff --git a/news_sender/README.md b/news_sender/README.md deleted file mode 100644 index 74649b9..0000000 --- a/news_sender/README.md +++ /dev/null @@ -1,303 +0,0 @@ -# News Sender Microservice - -Standalone service for sending Munich News Daily newsletters to subscribers. - -## Features - -- πŸ“§ Sends beautiful HTML newsletters -- πŸ€– Uses AI-generated article summaries -- πŸ“Š Tracks sending statistics -- πŸ§ͺ Test mode for development -- πŸ“ Preview generation -- πŸ”„ Fetches data from shared MongoDB - -## Installation - -```bash -cd news_sender -pip install -r requirements.txt -``` - -## Configuration - -The service uses the same `.env` file as the backend (`../backend/.env`): - -```env -# MongoDB -MONGODB_URI=mongodb://localhost:27017/ - -# Email (Gmail example) -SMTP_SERVER=smtp.gmail.com -SMTP_PORT=587 -EMAIL_USER=your-email@gmail.com -EMAIL_PASSWORD=your-app-password - -# Newsletter Settings (optional) -NEWSLETTER_MAX_ARTICLES=10 -WEBSITE_URL=http://localhost:3000 -``` - -**Gmail Setup:** -1. Enable 2-factor authentication -2. Generate an App Password: https://support.google.com/accounts/answer/185833 -3. Use the App Password (not your regular password) - -## Usage - -### 1. Preview Newsletter - -Generate HTML preview without sending: - -```bash -python sender_service.py preview -``` - -This creates `newsletter_preview.html` - open it in your browser to see how the newsletter looks. - -### 2. Send Test Email - -Send to a single email address for testing: - -```bash -python sender_service.py test your-email@example.com -``` - -### 3. Send to All Subscribers - -Send newsletter to all active subscribers: - -```bash -# Send with default article count (10) -python sender_service.py send - -# Send with custom article count -python sender_service.py send 15 -``` - -### 4. Use as Python Module - -```python -from sender_service import send_newsletter, preview_newsletter - -# Send newsletter -result = send_newsletter(max_articles=10) -print(f"Sent to {result['sent_count']} subscribers") - -# Generate preview -html = preview_newsletter(max_articles=5) -``` - -## How It Works - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ 1. Fetch Articles from MongoDB β”‚ -β”‚ - Get latest articles with AI summaries β”‚ -β”‚ - Sort by creation date (newest first) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - ↓ -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ 2. Fetch Active Subscribers β”‚ -β”‚ - Get all subscribers with status='active' β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - ↓ -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ 3. Render Newsletter HTML β”‚ -β”‚ - Load newsletter_template.html β”‚ -β”‚ - Populate with articles and metadata β”‚ -β”‚ - Generate beautiful HTML email β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - ↓ -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ 4. Send Emails β”‚ -β”‚ - Connect to SMTP server β”‚ -β”‚ - Send to each subscriber β”‚ -β”‚ - Track success/failure β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - ↓ -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ 5. Report Statistics β”‚ -β”‚ - Total sent β”‚ -β”‚ - Failed sends β”‚ -β”‚ - Error details β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -## Output Example - -``` -====================================================================== -πŸ“§ Munich News Daily - Newsletter Sender -====================================================================== - -Fetching latest 10 articles with AI summaries... -βœ“ Found 10 articles - -Fetching active subscribers... -βœ“ Found 150 active subscriber(s) - -Rendering newsletter HTML... -βœ“ Newsletter rendered - -Sending newsletter: 'Munich News Daily - November 10, 2024' ----------------------------------------------------------------------- -[1/150] Sending to user1@example.com... βœ“ -[2/150] Sending to user2@example.com... βœ“ -[3/150] Sending to user3@example.com... βœ“ -... - -====================================================================== -πŸ“Š Sending Complete -====================================================================== -βœ“ Successfully sent: 148 -βœ— Failed: 2 -πŸ“° Articles included: 10 -====================================================================== -``` - -## Scheduling - -### Using Cron (Linux/Mac) - -Send newsletter daily at 8 AM: - -```bash -# Edit crontab -crontab -e - -# Add this line -0 8 * * * cd /path/to/news_sender && /path/to/venv/bin/python sender_service.py send -``` - -### Using systemd Timer (Linux) - -Create `/etc/systemd/system/news-sender.service`: - -```ini -[Unit] -Description=Munich News Sender - -[Service] -Type=oneshot -WorkingDirectory=/path/to/news_sender -ExecStart=/path/to/venv/bin/python sender_service.py send -User=your-user -``` - -Create `/etc/systemd/system/news-sender.timer`: - -```ini -[Unit] -Description=Send Munich News Daily at 8 AM - -[Timer] -OnCalendar=daily -OnCalendar=*-*-* 08:00:00 - -[Install] -WantedBy=timers.target -``` - -Enable and start: - -```bash -sudo systemctl enable news-sender.timer -sudo systemctl start news-sender.timer -``` - -### Using Docker - -Create `Dockerfile`: - -```dockerfile -FROM python:3.11-slim - -WORKDIR /app - -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -COPY sender_service.py newsletter_template.html ./ - -CMD ["python", "sender_service.py", "send"] -``` - -Build and run: - -```bash -docker build -t news-sender . -docker run --env-file ../backend/.env news-sender -``` - -## Troubleshooting - -### "Email credentials not configured" -- Check that `EMAIL_USER` and `EMAIL_PASSWORD` are set in `.env` -- For Gmail, use an App Password, not your regular password - -### "No articles with summaries found" -- Run the crawler first: `cd ../news_crawler && python crawler_service.py 10` -- Make sure Ollama is enabled and working -- Check MongoDB has articles with `summary` field - -### "No active subscribers found" -- Add subscribers via the backend API -- Check subscriber status is 'active' in MongoDB - -### SMTP Connection Errors -- Verify SMTP server and port are correct -- Check firewall isn't blocking SMTP port -- For Gmail, ensure "Less secure app access" is enabled or use App Password - -### Emails Going to Spam -- Set up SPF, DKIM, and DMARC records for your domain -- Use a verified email address -- Avoid spam trigger words in subject/content -- Include unsubscribe link (already included in template) - -## Architecture - -This is a standalone microservice that: -- Runs independently of the backend -- Shares the same MongoDB database -- Can be deployed separately -- Can be scheduled independently -- Has no dependencies on backend code - -## Integration with Other Services - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Backend β”‚ β”‚ Crawler β”‚ β”‚ Sender β”‚ -β”‚ (Flask) β”‚ β”‚ (Scraper) β”‚ β”‚ (Email) β”‚ -β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ β”‚ β”‚ - β”‚ β”‚ β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ MongoDB β”‚ - β”‚ (Shared DB) β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -## Next Steps - -1. **Test the newsletter:** - ```bash - python sender_service.py test your-email@example.com - ``` - -2. **Schedule daily sending:** - - Set up cron job or systemd timer - - Choose appropriate time (e.g., 8 AM) - -3. **Monitor sending:** - - Check logs for errors - - Track open rates (requires email tracking service) - - Monitor spam complaints - -4. **Optimize:** - - Add email tracking pixels - - A/B test subject lines - - Personalize content per subscriber diff --git a/news_sender/newsletter_template.html b/news_sender/newsletter_template.html index 134eb81..a2db610 100644 --- a/news_sender/newsletter_template.html +++ b/news_sender/newsletter_template.html @@ -146,6 +146,14 @@ Unsubscribe

+ {% if tracking_enabled %} + +

+ This email contains tracking to measure engagement and improve our content.
+ We respect your privacy and anonymize data after 90 days. +

+ {% endif %} +

Β© {{ year }} Munich News Daily. All rights reserved.

diff --git a/news_sender/requirements.txt b/news_sender/requirements.txt index 267e0bd..14d0870 100644 --- a/news_sender/requirements.txt +++ b/news_sender/requirements.txt @@ -1,3 +1,6 @@ pymongo==4.6.1 python-dotenv==1.0.0 Jinja2==3.1.2 +beautifulsoup4==4.12.2 +schedule==1.2.0 +pytz==2023.3 diff --git a/news_sender/scheduled_sender.py b/news_sender/scheduled_sender.py new file mode 100755 index 0000000..cca166a --- /dev/null +++ b/news_sender/scheduled_sender.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Scheduled newsletter sender that runs daily at 7 AM Berlin time +Waits for crawler to finish before sending to ensure fresh content +""" +import schedule +import time +from datetime import datetime, timedelta +import pytz +from pathlib import Path +import sys + +# Add current directory to path +sys.path.insert(0, str(Path(__file__).parent)) + +from sender_service import send_newsletter, get_latest_articles, Config + +# Berlin timezone +BERLIN_TZ = pytz.timezone('Europe/Berlin') + +# Maximum time to wait for crawler (in minutes) +MAX_WAIT_TIME = 30 + +def check_crawler_finished(): + """ + Check if crawler has finished by looking for recent articles + Returns: (bool, str) - (is_finished, message) + """ + try: + # Check if we have articles from today + articles = get_latest_articles(max_articles=1, hours=2) + + if articles: + # Check if the most recent article was crawled recently (within last 2 hours) + latest_article = articles[0] + crawled_at = latest_article.get('crawled_at') + + if crawled_at: + time_since_crawl = datetime.utcnow() - crawled_at + minutes_since = time_since_crawl.total_seconds() / 60 + + if minutes_since < 120: # Within last 2 hours + return True, f"Crawler finished {int(minutes_since)} minutes ago" + + return False, "No recent articles found" + + except Exception as e: + return False, f"Error checking crawler status: {e}" + + +def wait_for_crawler(max_wait_minutes=30): + """ + Wait for crawler to finish before sending newsletter + + Args: + max_wait_minutes: Maximum time to wait in minutes + + Returns: + bool: True if crawler finished, False if timeout + """ + berlin_time = datetime.now(BERLIN_TZ) + print(f"\n⏳ Waiting for crawler to finish...") + print(f" Current time: {berlin_time.strftime('%H:%M:%S %Z')}") + print(f" Max wait time: {max_wait_minutes} minutes") + + start_time = time.time() + check_interval = 30 # Check every 30 seconds + + while True: + elapsed_minutes = (time.time() - start_time) / 60 + + # Check if crawler finished + is_finished, message = check_crawler_finished() + + if is_finished: + print(f" βœ“ {message}") + return True + + # Check if we've exceeded max wait time + if elapsed_minutes >= max_wait_minutes: + print(f" ⚠ Timeout after {max_wait_minutes} minutes") + print(f" Proceeding with available articles...") + return False + + # Show progress + remaining = max_wait_minutes - elapsed_minutes + print(f" ⏳ Still waiting... ({remaining:.1f} minutes remaining) - {message}") + + # Wait before next check + time.sleep(check_interval) + + +def run_sender(): + """Run the newsletter sender with crawler coordination""" + berlin_time = datetime.now(BERLIN_TZ) + print(f"\n{'='*70}") + print(f"πŸ“§ Scheduled newsletter sender started") + print(f" Time: {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}") + print(f"{'='*70}\n") + + try: + # Wait for crawler to finish (max 30 minutes) + crawler_finished = wait_for_crawler(max_wait_minutes=MAX_WAIT_TIME) + + if not crawler_finished: + print(f"\n⚠ Crawler may still be running, but proceeding anyway...") + + print(f"\n{'='*70}") + print(f"πŸ“§ Starting newsletter send...") + print(f"{'='*70}\n") + + # Send newsletter to all subscribers + result = send_newsletter(max_articles=Config.MAX_ARTICLES) + + if result['success']: + print(f"\n{'='*70}") + print(f"βœ… Newsletter sent successfully!") + print(f" Sent: {result['sent_count']}/{result['total_subscribers']}") + print(f" Articles: {result['article_count']}") + print(f" Failed: {result['failed_count']}") + print(f"{'='*70}\n") + else: + print(f"\n{'='*70}") + print(f"❌ Newsletter send failed: {result.get('error', 'Unknown error')}") + print(f"{'='*70}\n") + + except Exception as e: + print(f"\n{'='*70}") + print(f"❌ Scheduled sender error: {e}") + print(f"{'='*70}\n") + import traceback + traceback.print_exc() + + +def main(): + """Main scheduler loop""" + print("πŸ“§ Munich News Newsletter Scheduler") + print("="*70) + print("Schedule: Daily at 7:00 AM Berlin time") + print("Timezone: Europe/Berlin (CET/CEST)") + print("Coordination: Waits for crawler to finish (max 30 min)") + print("="*70) + + # Schedule the sender to run at 7 AM Berlin time + schedule.every().day.at("07:00").do(run_sender) + + # Show next run time + berlin_time = datetime.now(BERLIN_TZ) + print(f"\nCurrent time (Berlin): {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}") + + # Get next scheduled run + next_run = schedule.next_run() + if next_run: + # Convert to Berlin time for display + next_run_berlin = next_run.astimezone(BERLIN_TZ) + print(f"Next scheduled run: {next_run_berlin.strftime('%Y-%m-%d %H:%M:%S %Z')}") + + print("\n⏳ Scheduler is running... (Press Ctrl+C to stop)\n") + + # Optional: Run immediately on startup (comment out if you don't want this) + # print("πŸš€ Running initial send on startup...") + # run_sender() + + # Keep the scheduler running + while True: + schedule.run_pending() + time.sleep(60) # Check every minute + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + print("\n\nπŸ‘‹ Scheduler stopped by user") + except Exception as e: + print(f"\n\n❌ Scheduler error: {e}") + import traceback + traceback.print_exc() diff --git a/news_sender/sender_service.py b/news_sender/sender_service.py index cfdc24d..036cb32 100644 --- a/news_sender/sender_service.py +++ b/news_sender/sender_service.py @@ -11,8 +11,17 @@ from pathlib import Path from jinja2 import Template from pymongo import MongoClient import os +import sys from dotenv import load_dotenv +# Add backend directory to path for importing tracking service +backend_dir = Path(__file__).parent.parent / 'backend' +sys.path.insert(0, str(backend_dir)) + +# Import tracking modules +from services import tracking_service +from tracking_integration import inject_tracking_pixel, replace_article_links, generate_tracking_urls + # Load environment variables from backend/.env backend_dir = Path(__file__).parent.parent / 'backend' env_path = backend_dir / '.env' @@ -40,6 +49,11 @@ class Config: MAX_ARTICLES = int(os.getenv('NEWSLETTER_MAX_ARTICLES', '10')) HOURS_LOOKBACK = int(os.getenv('NEWSLETTER_HOURS_LOOKBACK', '24')) WEBSITE_URL = os.getenv('WEBSITE_URL', 'http://localhost:3000') + + # Tracking + TRACKING_ENABLED = os.getenv('TRACKING_ENABLED', 'true').lower() == 'true' + TRACKING_API_URL = os.getenv('TRACKING_API_URL', 'http://localhost:5001') + TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90')) # MongoDB connection @@ -117,15 +131,20 @@ def get_active_subscribers(): return [doc['email'] for doc in cursor] -def render_newsletter_html(articles): +def render_newsletter_html(articles, tracking_enabled=False, pixel_tracking_id=None, + link_tracking_map=None, api_url=None): """ - Render newsletter HTML from template + Render newsletter HTML from template with optional tracking integration Args: articles: List of article dictionaries + tracking_enabled: Whether to inject tracking pixel and replace links + pixel_tracking_id: Tracking ID for the email open pixel + link_tracking_map: Dictionary mapping original URLs to tracking IDs + api_url: Base URL for the tracking API Returns: - str: Rendered HTML content + str: Rendered HTML content with tracking injected if enabled """ # Load template template_path = Path(__file__).parent / 'newsletter_template.html' @@ -142,11 +161,23 @@ def render_newsletter_html(articles): 'article_count': len(articles), 'articles': articles, 'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe', - 'website_link': Config.WEBSITE_URL + 'website_link': Config.WEBSITE_URL, + 'tracking_enabled': tracking_enabled } # Render HTML - return template.render(**template_data) + html = template.render(**template_data) + + # Inject tracking if enabled + if tracking_enabled and pixel_tracking_id and api_url: + # Inject tracking pixel + html = inject_tracking_pixel(html, pixel_tracking_id, api_url) + + # Replace article links with tracking URLs + if link_tracking_map: + html = replace_article_links(html, link_tracking_map, api_url) + + return html def send_email(to_email, subject, html_content): @@ -246,14 +277,14 @@ def send_newsletter(max_articles=None, test_email=None): 'error': 'No active subscribers' } - # Render newsletter - print("\nRendering newsletter HTML...") - html_content = render_newsletter_html(articles) - print("βœ“ Newsletter rendered") + # Generate newsletter ID (date-based) + newsletter_id = f"newsletter-{datetime.now().strftime('%Y-%m-%d')}" # Send to subscribers subject = f"Munich News Daily - {datetime.now().strftime('%B %d, %Y')}" print(f"\nSending newsletter: '{subject}'") + print(f"Newsletter ID: {newsletter_id}") + print(f"Tracking enabled: {Config.TRACKING_ENABLED}") print("-" * 70) sent_count = 0 @@ -262,6 +293,34 @@ def send_newsletter(max_articles=None, test_email=None): for i, email in enumerate(subscribers, 1): print(f"[{i}/{len(subscribers)}] Sending to {email}...", end=' ') + + # Generate tracking data for this subscriber if tracking is enabled + if Config.TRACKING_ENABLED: + try: + tracking_data = generate_tracking_urls( + articles=articles, + newsletter_id=newsletter_id, + subscriber_email=email, + tracking_service=tracking_service + ) + + # Render newsletter with tracking + html_content = render_newsletter_html( + articles=articles, + tracking_enabled=True, + pixel_tracking_id=tracking_data['pixel_tracking_id'], + link_tracking_map=tracking_data['link_tracking_map'], + api_url=Config.TRACKING_API_URL + ) + except Exception as e: + print(f"⚠ Tracking error: {e}, sending without tracking...", end=' ') + # Fallback: send without tracking + html_content = render_newsletter_html(articles) + else: + # Render newsletter without tracking + html_content = render_newsletter_html(articles) + + # Send email success, error = send_email(email, subject, html_content) if success: @@ -310,12 +369,11 @@ def preview_newsletter(max_articles=None, hours=None): today_date = datetime.now().strftime('%B %d, %Y') return f"

No articles from today found

No articles published today ({today_date}). Run the crawler with Ollama enabled to get fresh content.

" - return render_newsletter_html(articles) + # Preview without tracking + return render_newsletter_html(articles, tracking_enabled=False) if __name__ == '__main__': - import sys - # Parse command line arguments if len(sys.argv) > 1: command = sys.argv[1] diff --git a/news_sender/tracking_integration.py b/news_sender/tracking_integration.py new file mode 100644 index 0000000..6b8d385 --- /dev/null +++ b/news_sender/tracking_integration.py @@ -0,0 +1,150 @@ +""" +Tracking integration module for Munich News Daily newsletter system. +Handles injection of tracking pixels and replacement of article links with tracking URLs. +""" + +import re +from typing import Dict, List +from bs4 import BeautifulSoup + + +def inject_tracking_pixel(html: str, tracking_id: str, api_url: str) -> str: + """ + Inject tracking pixel into newsletter HTML before closing tag. + + The tracking pixel is a 1x1 transparent image that loads when the email is opened, + allowing us to track email opens. + + Args: + html: Original newsletter HTML content + tracking_id: Unique tracking ID for this newsletter send (None if tracking disabled) + api_url: Base URL for the tracking API (e.g., http://localhost:5001) + + Returns: + str: HTML with tracking pixel injected (unchanged if tracking_id is None) + + Example: + >>> html = '

Content

' + >>> inject_tracking_pixel(html, 'abc-123', 'http://api.example.com') + '

Content

' + """ + # Skip tracking if no tracking_id provided (subscriber opted out) + if not tracking_id: + return html + + # Construct tracking pixel URL + pixel_url = f"{api_url}/api/track/pixel/{tracking_id}" + + # Create tracking pixel HTML + pixel_html = f'' + + # Inject pixel before closing tag + if '' in html: + html = html.replace('', f'{pixel_html}') + else: + # Fallback: append to end if no tag found + html += pixel_html + + return html + + +def replace_article_links( + html: str, + link_tracking_map: Dict[str, str], + api_url: str +) -> str: + """ + Replace article links in newsletter HTML with tracking URLs. + + Finds all article links in the HTML and replaces them with tracking redirect URLs + that log clicks before redirecting to the original article. + + Args: + html: Original newsletter HTML content + link_tracking_map: Dictionary mapping original URLs to tracking IDs (empty if tracking disabled) + api_url: Base URL for the tracking API (e.g., http://localhost:5001) + + Returns: + str: HTML with article links replaced by tracking URLs (unchanged if map is empty) + + Example: + >>> html = 'Read' + >>> mapping = {'https://example.com/article': 'track-123'} + >>> replace_article_links(html, mapping, 'http://api.example.com') + 'Read' + """ + # Skip tracking if no tracking map provided (subscriber opted out) + if not link_tracking_map: + return html + + # Parse HTML with BeautifulSoup + soup = BeautifulSoup(html, 'html.parser') + + # Find all tags with href attributes + for link in soup.find_all('a', href=True): + original_url = link['href'] + + # Check if this URL should be tracked + if original_url in link_tracking_map: + tracking_id = link_tracking_map[original_url] + tracking_url = f"{api_url}/api/track/click/{tracking_id}" + + # Replace the href with tracking URL + link['href'] = tracking_url + + # Return modified HTML + return str(soup) + + +def generate_tracking_urls( + articles: List[Dict], + newsletter_id: str, + subscriber_email: str, + tracking_service +) -> Dict[str, str]: + """ + Generate tracking records for all article links and return URL mapping. + + Creates tracking records in the database for each article link and returns + a mapping of original URLs to tracking IDs. + + Args: + articles: List of article dictionaries with 'link' and 'title' keys + newsletter_id: Unique identifier for the newsletter batch + subscriber_email: Email address of the recipient + tracking_service: Tracking service module with create_newsletter_tracking function + + Returns: + dict: Dictionary containing: + - pixel_tracking_id: ID for the tracking pixel + - link_tracking_map: Dict mapping original URLs to tracking IDs + + Example: + >>> articles = [{'link': 'https://example.com/1', 'title': 'Article 1'}] + >>> generate_tracking_urls(articles, 'news-2024-01-01', 'user@example.com', tracking_service) + { + 'pixel_tracking_id': 'uuid-for-pixel', + 'link_tracking_map': {'https://example.com/1': 'uuid-for-link'} + } + """ + # Prepare article links for tracking + article_links = [] + for article in articles: + if 'link' in article and article['link']: + article_links.append({ + 'url': article['link'], + 'title': article.get('title', '') + }) + + # Create tracking records using the tracking service + tracking_data = tracking_service.create_newsletter_tracking( + newsletter_id=newsletter_id, + subscriber_email=subscriber_email, + article_links=article_links + ) + + return { + 'pixel_tracking_id': tracking_data['pixel_tracking_id'], + 'link_tracking_map': tracking_data['link_tracking_map'], + 'tracking_enabled': tracking_data.get('tracking_enabled', True) + } diff --git a/tests/backend/test_analytics.py b/tests/backend/test_analytics.py new file mode 100644 index 0000000..49acb8f --- /dev/null +++ b/tests/backend/test_analytics.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python +""" +Test analytics functionality for email tracking +Run from backend directory with venv activated: + cd backend + source venv/bin/activate # or venv\Scripts\activate on Windows + python test_analytics.py +""" + +import sys +import os +from datetime import datetime, timedelta + +# Add backend directory to path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from services.analytics_service import ( + get_open_rate, + get_click_rate, + get_newsletter_metrics, + get_article_performance, + get_subscriber_activity_status, + update_subscriber_activity_statuses +) +from database import ( + newsletter_sends_collection, + link_clicks_collection, + subscriber_activity_collection +) +from app import app + +print("\n" + "="*80) +print("Analytics Service Tests") +print("="*80) + +# Test counters +tests_passed = 0 +tests_failed = 0 + +def test_result(test_name, passed, message=""): + """Print test result""" + global tests_passed, tests_failed + if passed: + tests_passed += 1 + print(f"βœ“ {test_name}") + if message: + print(f" {message}") + else: + tests_failed += 1 + print(f"❌ {test_name}") + if message: + print(f" {message}") + + +# Setup test data +print("\n" + "-"*80) +print("Setting up test data...") +print("-"*80) + +try: + # Clean up existing test data + newsletter_sends_collection.delete_many({'newsletter_id': {'$regex': '^test-analytics-'}}) + link_clicks_collection.delete_many({'newsletter_id': {'$regex': '^test-analytics-'}}) + subscriber_activity_collection.delete_many({'email': {'$regex': '^test-analytics-'}}) + + # Create test newsletter sends + test_newsletter_id = 'test-analytics-newsletter-001' + + # Create 10 newsletter sends: 7 opened, 3 not opened + for i in range(10): + opened = i < 7 # First 7 are opened + doc = { + 'newsletter_id': test_newsletter_id, + 'subscriber_email': f'test-analytics-user{i}@example.com', + 'tracking_id': f'test-pixel-{i}', + 'sent_at': datetime.utcnow(), + 'opened': opened, + 'first_opened_at': datetime.utcnow() if opened else None, + 'last_opened_at': datetime.utcnow() if opened else None, + 'open_count': 1 if opened else 0, + 'created_at': datetime.utcnow() + } + newsletter_sends_collection.insert_one(doc) + + # Create test link clicks for an article + test_article_url = 'https://example.com/test-analytics-article' + + # Create 10 link tracking records: 4 clicked, 6 not clicked + for i in range(10): + clicked = i < 4 # First 4 are clicked + doc = { + 'tracking_id': f'test-link-{i}', + 'newsletter_id': test_newsletter_id, + 'subscriber_email': f'test-analytics-user{i}@example.com', + 'article_url': test_article_url, + 'article_title': 'Test Analytics Article', + 'clicked': clicked, + 'clicked_at': datetime.utcnow() if clicked else None, + 'user_agent': 'Test Agent' if clicked else None, + 'created_at': datetime.utcnow() + } + link_clicks_collection.insert_one(doc) + + print("βœ“ Test data created") + +except Exception as e: + print(f"❌ Error setting up test data: {str(e)}") + import traceback + traceback.print_exc() + sys.exit(1) + + +# Test 1: Open Rate Calculation +print("\n" + "-"*80) +print("Test 1: Open Rate Calculation") +print("-"*80) + +try: + open_rate = get_open_rate(test_newsletter_id) + + # Expected: 7 out of 10 = 70% + is_correct = open_rate == 70.0 + test_result("Calculate open rate", is_correct, f"Open rate: {open_rate}% (expected 70%)") + + # Test with non-existent newsletter + open_rate_empty = get_open_rate('non-existent-newsletter') + handles_empty = open_rate_empty == 0.0 + test_result("Handle non-existent newsletter", handles_empty, + f"Open rate: {open_rate_empty}% (expected 0%)") + +except Exception as e: + test_result("Open rate calculation", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 2: Click Rate Calculation +print("\n" + "-"*80) +print("Test 2: Click Rate Calculation") +print("-"*80) + +try: + click_rate = get_click_rate(test_article_url) + + # Expected: 4 out of 10 = 40% + is_correct = click_rate == 40.0 + test_result("Calculate click rate", is_correct, f"Click rate: {click_rate}% (expected 40%)") + + # Test with non-existent article + click_rate_empty = get_click_rate('https://example.com/non-existent') + handles_empty = click_rate_empty == 0.0 + test_result("Handle non-existent article", handles_empty, + f"Click rate: {click_rate_empty}% (expected 0%)") + +except Exception as e: + test_result("Click rate calculation", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 3: Newsletter Metrics +print("\n" + "-"*80) +print("Test 3: Newsletter Metrics") +print("-"*80) + +try: + metrics = get_newsletter_metrics(test_newsletter_id) + + # Verify all expected fields + has_all_fields = all(key in metrics for key in [ + 'newsletter_id', 'total_sent', 'total_opened', 'open_rate', + 'total_clicks', 'unique_clickers', 'click_through_rate' + ]) + test_result("Returns all required fields", has_all_fields) + + # Verify values + correct_sent = metrics['total_sent'] == 10 + test_result("Correct total_sent", correct_sent, f"Total sent: {metrics['total_sent']}") + + correct_opened = metrics['total_opened'] == 7 + test_result("Correct total_opened", correct_opened, f"Total opened: {metrics['total_opened']}") + + correct_open_rate = metrics['open_rate'] == 70.0 + test_result("Correct open_rate", correct_open_rate, f"Open rate: {metrics['open_rate']}%") + + correct_clicks = metrics['total_clicks'] == 4 + test_result("Correct total_clicks", correct_clicks, f"Total clicks: {metrics['total_clicks']}") + + correct_unique_clickers = metrics['unique_clickers'] == 4 + test_result("Correct unique_clickers", correct_unique_clickers, + f"Unique clickers: {metrics['unique_clickers']}") + + correct_ctr = metrics['click_through_rate'] == 40.0 + test_result("Correct click_through_rate", correct_ctr, + f"CTR: {metrics['click_through_rate']}%") + +except Exception as e: + test_result("Newsletter metrics", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 4: Article Performance +print("\n" + "-"*80) +print("Test 4: Article Performance") +print("-"*80) + +try: + performance = get_article_performance(test_article_url) + + # Verify all expected fields + has_all_fields = all(key in performance for key in [ + 'article_url', 'total_sent', 'total_clicks', 'click_rate', + 'unique_clickers', 'newsletters' + ]) + test_result("Returns all required fields", has_all_fields) + + # Verify values + correct_sent = performance['total_sent'] == 10 + test_result("Correct total_sent", correct_sent, f"Total sent: {performance['total_sent']}") + + correct_clicks = performance['total_clicks'] == 4 + test_result("Correct total_clicks", correct_clicks, f"Total clicks: {performance['total_clicks']}") + + correct_click_rate = performance['click_rate'] == 40.0 + test_result("Correct click_rate", correct_click_rate, f"Click rate: {performance['click_rate']}%") + + correct_unique = performance['unique_clickers'] == 4 + test_result("Correct unique_clickers", correct_unique, + f"Unique clickers: {performance['unique_clickers']}") + + has_newsletters = len(performance['newsletters']) > 0 + test_result("Returns newsletter list", has_newsletters, + f"Newsletters: {performance['newsletters']}") + +except Exception as e: + test_result("Article performance", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 5: Activity Status Classification +print("\n" + "-"*80) +print("Test 5: Activity Status Classification") +print("-"*80) + +try: + # Create test data for activity classification + now = datetime.utcnow() + + # Active user (opened 10 days ago) + newsletter_sends_collection.insert_one({ + 'newsletter_id': 'test-analytics-activity', + 'subscriber_email': 'test-analytics-active@example.com', + 'tracking_id': 'test-active-pixel', + 'sent_at': now - timedelta(days=10), + 'opened': True, + 'first_opened_at': now - timedelta(days=10), + 'last_opened_at': now - timedelta(days=10), + 'open_count': 1, + 'created_at': now - timedelta(days=10) + }) + + # Inactive user (opened 45 days ago) + newsletter_sends_collection.insert_one({ + 'newsletter_id': 'test-analytics-activity', + 'subscriber_email': 'test-analytics-inactive@example.com', + 'tracking_id': 'test-inactive-pixel', + 'sent_at': now - timedelta(days=45), + 'opened': True, + 'first_opened_at': now - timedelta(days=45), + 'last_opened_at': now - timedelta(days=45), + 'open_count': 1, + 'created_at': now - timedelta(days=45) + }) + + # Dormant user (opened 90 days ago) + newsletter_sends_collection.insert_one({ + 'newsletter_id': 'test-analytics-activity', + 'subscriber_email': 'test-analytics-dormant@example.com', + 'tracking_id': 'test-dormant-pixel', + 'sent_at': now - timedelta(days=90), + 'opened': True, + 'first_opened_at': now - timedelta(days=90), + 'last_opened_at': now - timedelta(days=90), + 'open_count': 1, + 'created_at': now - timedelta(days=90) + }) + + # New user (never opened) + newsletter_sends_collection.insert_one({ + 'newsletter_id': 'test-analytics-activity', + 'subscriber_email': 'test-analytics-new@example.com', + 'tracking_id': 'test-new-pixel', + 'sent_at': now - timedelta(days=5), + 'opened': False, + 'first_opened_at': None, + 'last_opened_at': None, + 'open_count': 0, + 'created_at': now - timedelta(days=5) + }) + + # Test classifications + active_status = get_subscriber_activity_status('test-analytics-active@example.com') + is_active = active_status == 'active' + test_result("Classify active user", is_active, f"Status: {active_status}") + + inactive_status = get_subscriber_activity_status('test-analytics-inactive@example.com') + is_inactive = inactive_status == 'inactive' + test_result("Classify inactive user", is_inactive, f"Status: {inactive_status}") + + dormant_status = get_subscriber_activity_status('test-analytics-dormant@example.com') + is_dormant = dormant_status == 'dormant' + test_result("Classify dormant user", is_dormant, f"Status: {dormant_status}") + + new_status = get_subscriber_activity_status('test-analytics-new@example.com') + is_new = new_status == 'new' + test_result("Classify new user", is_new, f"Status: {new_status}") + +except Exception as e: + test_result("Activity status classification", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 6: Batch Update Activity Statuses +print("\n" + "-"*80) +print("Test 6: Batch Update Activity Statuses") +print("-"*80) + +try: + updated_count = update_subscriber_activity_statuses() + + # Should update all test subscribers + has_updates = updated_count > 0 + test_result("Updates subscriber records", has_updates, + f"Updated {updated_count} subscribers") + + # Verify a record was created + activity_record = subscriber_activity_collection.find_one({ + 'email': 'test-analytics-active@example.com' + }) + + record_exists = activity_record is not None + test_result("Creates activity record", record_exists) + + if activity_record: + has_required_fields = all(key in activity_record for key in [ + 'email', 'status', 'total_opens', 'total_clicks', + 'newsletters_received', 'newsletters_opened', 'updated_at' + ]) + test_result("Activity record has required fields", has_required_fields) + + correct_status = activity_record['status'] == 'active' + test_result("Activity record has correct status", correct_status, + f"Status: {activity_record['status']}") + +except Exception as e: + test_result("Batch update activity statuses", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 7: Analytics API Endpoints +print("\n" + "-"*80) +print("Test 7: Analytics API Endpoints") +print("-"*80) + +try: + with app.test_client() as client: + # Test newsletter analytics endpoint + response = client.get(f'/api/analytics/newsletter/{test_newsletter_id}') + + is_200 = response.status_code == 200 + test_result("Newsletter endpoint returns 200", is_200, f"Status: {response.status_code}") + + if is_200: + data = response.get_json() + has_data = data is not None and 'open_rate' in data + test_result("Newsletter endpoint returns data", has_data) + + # Test article analytics endpoint + response = client.get(f'/api/analytics/article/{test_article_url}') + + is_200 = response.status_code == 200 + test_result("Article endpoint returns 200", is_200, f"Status: {response.status_code}") + + if is_200: + data = response.get_json() + has_data = data is not None and 'click_rate' in data + test_result("Article endpoint returns data", has_data) + + # Test subscriber analytics endpoint + response = client.get('/api/analytics/subscriber/test-analytics-active@example.com') + + is_200 = response.status_code == 200 + test_result("Subscriber endpoint returns 200", is_200, f"Status: {response.status_code}") + + if is_200: + data = response.get_json() + has_data = data is not None and 'status' in data + test_result("Subscriber endpoint returns data", has_data) + + # Test update activity endpoint + response = client.post('/api/analytics/update-activity') + + is_200 = response.status_code == 200 + test_result("Update activity endpoint returns 200", is_200, f"Status: {response.status_code}") + + if is_200: + data = response.get_json() + has_count = data is not None and 'updated_count' in data + test_result("Update activity endpoint returns count", has_count) + +except Exception as e: + test_result("Analytics API endpoints", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Clean up test data +print("\n" + "-"*80) +print("Cleaning up test data...") +print("-"*80) + +try: + newsletter_sends_collection.delete_many({'newsletter_id': {'$regex': '^test-analytics-'}}) + link_clicks_collection.delete_many({'newsletter_id': {'$regex': '^test-analytics-'}}) + subscriber_activity_collection.delete_many({'email': {'$regex': '^test-analytics-'}}) + print("βœ“ Test data cleaned up") +except Exception as e: + print(f"⚠ Error cleaning up: {str(e)}") + + +# Summary +print("\n" + "="*80) +print("TEST SUMMARY") +print("="*80) +print(f"Total tests: {tests_passed + tests_failed}") +print(f"βœ“ Passed: {tests_passed}") +print(f"❌ Failed: {tests_failed}") + +if tests_failed == 0: + print("\nπŸŽ‰ All tests passed!") +else: + print(f"\n⚠ {tests_failed} test(s) failed") + +print("="*80 + "\n") + +# Exit with appropriate code +sys.exit(0 if tests_failed == 0 else 1) diff --git a/tests/backend/test_privacy.py b/tests/backend/test_privacy.py new file mode 100644 index 0000000..167e3c6 --- /dev/null +++ b/tests/backend/test_privacy.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python +""" +Test privacy compliance features for email tracking +Run from backend directory with venv activated: + cd backend + source venv/bin/activate # or venv\Scripts\activate on Windows + python test_privacy.py +""" + +import sys +import os +from datetime import datetime, timedelta +from pymongo import MongoClient + +# Add backend directory to path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from config import Config +from services.tracking_service import ( + create_newsletter_tracking, + anonymize_old_tracking_data, + delete_subscriber_tracking_data +) +from database import ( + newsletter_sends_collection, + link_clicks_collection, + subscriber_activity_collection, + subscribers_collection +) +from app import app + +print("\n" + "="*80) +print("Privacy Compliance Tests") +print("="*80) + +# Test counters +tests_passed = 0 +tests_failed = 0 + +def test_result(test_name, passed, message=""): + """Print test result""" + global tests_passed, tests_failed + if passed: + tests_passed += 1 + print(f"βœ“ {test_name}") + if message: + print(f" {message}") + else: + tests_failed += 1 + print(f"❌ {test_name}") + if message: + print(f" {message}") + + +# Setup: Clean up test data +print("\n" + "-"*80) +print("Setup: Cleaning test data") +print("-"*80) + +test_newsletter_id = 'privacy-test-newsletter' +test_email = 'privacy-test@example.com' +test_email_opted_out = 'opted-out@example.com' + +newsletter_sends_collection.delete_many({'newsletter_id': test_newsletter_id}) +link_clicks_collection.delete_many({'newsletter_id': test_newsletter_id}) +subscriber_activity_collection.delete_many({'email': {'$in': [test_email, test_email_opted_out]}}) +subscribers_collection.delete_many({'email': {'$in': [test_email, test_email_opted_out]}}) + +print("βœ“ Test data cleaned") + + +# Test 1: Data Anonymization +print("\n" + "-"*80) +print("Test 1: Data Anonymization") +print("-"*80) + +try: + # Create old tracking records (older than 90 days) + old_date = datetime.utcnow() - timedelta(days=100) + + old_newsletter_doc = { + 'newsletter_id': test_newsletter_id, + 'subscriber_email': 'old-user@example.com', + 'tracking_id': 'old-tracking-id-1', + 'sent_at': old_date, + 'opened': True, + 'first_opened_at': old_date, + 'last_opened_at': old_date, + 'open_count': 3, + 'created_at': old_date + } + newsletter_sends_collection.insert_one(old_newsletter_doc) + + old_link_doc = { + 'tracking_id': 'old-link-tracking-id-1', + 'newsletter_id': test_newsletter_id, + 'subscriber_email': 'old-user@example.com', + 'article_url': 'https://example.com/old-article', + 'article_title': 'Old Article', + 'clicked': True, + 'clicked_at': old_date, + 'created_at': old_date + } + link_clicks_collection.insert_one(old_link_doc) + + # Create recent tracking records (within 90 days) + recent_date = datetime.utcnow() - timedelta(days=30) + + recent_newsletter_doc = { + 'newsletter_id': test_newsletter_id, + 'subscriber_email': 'recent-user@example.com', + 'tracking_id': 'recent-tracking-id-1', + 'sent_at': recent_date, + 'opened': True, + 'first_opened_at': recent_date, + 'last_opened_at': recent_date, + 'open_count': 1, + 'created_at': recent_date + } + newsletter_sends_collection.insert_one(recent_newsletter_doc) + + # Run anonymization + result = anonymize_old_tracking_data(retention_days=90) + + # Check that old records were anonymized + old_newsletter_after = newsletter_sends_collection.find_one({'tracking_id': 'old-tracking-id-1'}) + old_anonymized = old_newsletter_after and old_newsletter_after['subscriber_email'] == 'anonymized' + test_result("Anonymizes old newsletter records", old_anonymized, + f"Email: {old_newsletter_after.get('subscriber_email', 'N/A') if old_newsletter_after else 'N/A'}") + + old_link_after = link_clicks_collection.find_one({'tracking_id': 'old-link-tracking-id-1'}) + link_anonymized = old_link_after and old_link_after['subscriber_email'] == 'anonymized' + test_result("Anonymizes old link click records", link_anonymized, + f"Email: {old_link_after.get('subscriber_email', 'N/A') if old_link_after else 'N/A'}") + + # Check that aggregated metrics are preserved + metrics_preserved = ( + old_newsletter_after and + old_newsletter_after['open_count'] == 3 and + old_newsletter_after['opened'] == True + ) + test_result("Preserves aggregated metrics", metrics_preserved, + f"Open count: {old_newsletter_after.get('open_count', 0) if old_newsletter_after else 0}") + + # Check that recent records were NOT anonymized + recent_newsletter_after = newsletter_sends_collection.find_one({'tracking_id': 'recent-tracking-id-1'}) + recent_not_anonymized = ( + recent_newsletter_after and + recent_newsletter_after['subscriber_email'] == 'recent-user@example.com' + ) + test_result("Does not anonymize recent records", recent_not_anonymized, + f"Email: {recent_newsletter_after.get('subscriber_email', 'N/A') if recent_newsletter_after else 'N/A'}") + + # Check return counts + correct_counts = result['newsletter_sends_anonymized'] >= 1 and result['link_clicks_anonymized'] >= 1 + test_result("Returns correct anonymization counts", correct_counts, + f"Newsletter: {result['newsletter_sends_anonymized']}, Links: {result['link_clicks_anonymized']}") + +except Exception as e: + test_result("Data anonymization", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 2: User Data Deletion +print("\n" + "-"*80) +print("Test 2: User Data Deletion") +print("-"*80) + +try: + # Create tracking records for a specific user + article_links = [ + {'url': 'https://example.com/article1', 'title': 'Article 1'}, + {'url': 'https://example.com/article2', 'title': 'Article 2'} + ] + + tracking_data = create_newsletter_tracking( + newsletter_id=test_newsletter_id, + subscriber_email=test_email, + article_links=article_links + ) + + # Create subscriber activity record + subscriber_activity_collection.insert_one({ + 'email': test_email, + 'status': 'active', + 'last_opened_at': datetime.utcnow(), + 'total_opens': 5, + 'total_clicks': 3 + }) + + # Verify records exist + newsletter_count_before = newsletter_sends_collection.count_documents({'subscriber_email': test_email}) + link_count_before = link_clicks_collection.count_documents({'subscriber_email': test_email}) + activity_count_before = subscriber_activity_collection.count_documents({'email': test_email}) + + records_exist = newsletter_count_before > 0 and link_count_before > 0 and activity_count_before > 0 + test_result("Creates test tracking records", records_exist, + f"Newsletter: {newsletter_count_before}, Links: {link_count_before}, Activity: {activity_count_before}") + + # Delete all tracking data for the user + delete_result = delete_subscriber_tracking_data(test_email) + + # Verify all records were deleted + newsletter_count_after = newsletter_sends_collection.count_documents({'subscriber_email': test_email}) + link_count_after = link_clicks_collection.count_documents({'subscriber_email': test_email}) + activity_count_after = subscriber_activity_collection.count_documents({'email': test_email}) + + all_deleted = newsletter_count_after == 0 and link_count_after == 0 and activity_count_after == 0 + test_result("Deletes all tracking records", all_deleted, + f"Remaining - Newsletter: {newsletter_count_after}, Links: {link_count_after}, Activity: {activity_count_after}") + + # Check return counts + correct_delete_counts = ( + delete_result['newsletter_sends_deleted'] == newsletter_count_before and + delete_result['link_clicks_deleted'] == link_count_before and + delete_result['subscriber_activity_deleted'] == activity_count_before + ) + test_result("Returns correct deletion counts", correct_delete_counts, + f"Deleted - Newsletter: {delete_result['newsletter_sends_deleted']}, Links: {delete_result['link_clicks_deleted']}, Activity: {delete_result['subscriber_activity_deleted']}") + +except Exception as e: + test_result("User data deletion", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 3: Tracking Opt-Out +print("\n" + "-"*80) +print("Test 3: Tracking Opt-Out") +print("-"*80) + +try: + # Create subscriber with tracking disabled + subscribers_collection.insert_one({ + 'email': test_email_opted_out, + 'subscribed_at': datetime.utcnow(), + 'tracking_enabled': False + }) + + # Try to create tracking for opted-out subscriber + article_links = [ + {'url': 'https://example.com/article1', 'title': 'Article 1'} + ] + + tracking_data_opted_out = create_newsletter_tracking( + newsletter_id=test_newsletter_id, + subscriber_email=test_email_opted_out, + article_links=article_links + ) + + # Check that no tracking was created + no_pixel_id = tracking_data_opted_out['pixel_tracking_id'] is None + test_result("Does not create pixel tracking for opted-out users", no_pixel_id, + f"Pixel ID: {tracking_data_opted_out['pixel_tracking_id']}") + + empty_link_map = len(tracking_data_opted_out['link_tracking_map']) == 0 + test_result("Does not create link tracking for opted-out users", empty_link_map, + f"Link map size: {len(tracking_data_opted_out['link_tracking_map'])}") + + tracking_disabled_flag = tracking_data_opted_out.get('tracking_enabled') == False + test_result("Returns tracking_enabled=False for opted-out users", tracking_disabled_flag) + + # Verify no database records were created + newsletter_count = newsletter_sends_collection.count_documents({'subscriber_email': test_email_opted_out}) + link_count = link_clicks_collection.count_documents({'subscriber_email': test_email_opted_out}) + + no_db_records = newsletter_count == 0 and link_count == 0 + test_result("Does not create database records for opted-out users", no_db_records, + f"Newsletter records: {newsletter_count}, Link records: {link_count}") + + # Test opt-in/opt-out endpoints + with app.test_client() as client: + # Create a subscriber with tracking enabled + subscribers_collection.insert_one({ + 'email': test_email, + 'subscribed_at': datetime.utcnow(), + 'tracking_enabled': True + }) + + # Opt out + response = client.post(f'/api/tracking/subscriber/{test_email}/opt-out') + opt_out_success = response.status_code == 200 and response.json.get('success') == True + test_result("Opt-out endpoint works", opt_out_success, + f"Status: {response.status_code}") + + # Verify tracking is disabled + subscriber = subscribers_collection.find_one({'email': test_email}) + tracking_disabled = subscriber and subscriber.get('tracking_enabled') == False + test_result("Opt-out disables tracking in database", tracking_disabled) + + # Opt back in + response = client.post(f'/api/tracking/subscriber/{test_email}/opt-in') + opt_in_success = response.status_code == 200 and response.json.get('success') == True + test_result("Opt-in endpoint works", opt_in_success, + f"Status: {response.status_code}") + + # Verify tracking is enabled + subscriber = subscribers_collection.find_one({'email': test_email}) + tracking_enabled = subscriber and subscriber.get('tracking_enabled') == True + test_result("Opt-in enables tracking in database", tracking_enabled) + +except Exception as e: + test_result("Tracking opt-out", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 4: Privacy API Endpoints +print("\n" + "-"*80) +print("Test 4: Privacy API Endpoints") +print("-"*80) + +try: + with app.test_client() as client: + # Create test tracking data + article_links = [{'url': 'https://example.com/test', 'title': 'Test'}] + create_newsletter_tracking( + newsletter_id=test_newsletter_id, + subscriber_email='api-test@example.com', + article_links=article_links + ) + + # Test deletion endpoint + response = client.delete('/api/tracking/subscriber/api-test@example.com') + + delete_endpoint_works = response.status_code == 200 and response.json.get('success') == True + test_result("Deletion endpoint returns success", delete_endpoint_works, + f"Status: {response.status_code}") + + # Verify data was deleted + remaining_records = newsletter_sends_collection.count_documents({'subscriber_email': 'api-test@example.com'}) + data_deleted = remaining_records == 0 + test_result("Deletion endpoint removes data", data_deleted, + f"Remaining records: {remaining_records}") + + # Test anonymization endpoint + response = client.post('/api/tracking/anonymize', json={'retention_days': 90}) + + anonymize_endpoint_works = response.status_code == 200 and response.json.get('success') == True + test_result("Anonymization endpoint returns success", anonymize_endpoint_works, + f"Status: {response.status_code}") + + has_counts = 'anonymized_counts' in response.json + test_result("Anonymization endpoint returns counts", has_counts) + +except Exception as e: + test_result("Privacy API endpoints", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Clean up test data +print("\n" + "-"*80) +print("Cleaning up test data...") +print("-"*80) + +try: + newsletter_sends_collection.delete_many({'newsletter_id': test_newsletter_id}) + link_clicks_collection.delete_many({'newsletter_id': test_newsletter_id}) + subscriber_activity_collection.delete_many({'email': {'$in': [test_email, test_email_opted_out, 'api-test@example.com']}}) + subscribers_collection.delete_many({'email': {'$in': [test_email, test_email_opted_out, 'api-test@example.com']}}) + + # Clean up anonymized records + newsletter_sends_collection.delete_many({'subscriber_email': 'anonymized'}) + link_clicks_collection.delete_many({'subscriber_email': 'anonymized'}) + + print("βœ“ Test data cleaned up") +except Exception as e: + print(f"⚠ Error cleaning up: {str(e)}") + + +# Summary +print("\n" + "="*80) +print("TEST SUMMARY") +print("="*80) +print(f"Total tests: {tests_passed + tests_failed}") +print(f"βœ“ Passed: {tests_passed}") +print(f"❌ Failed: {tests_failed}") + +if tests_failed == 0: + print("\nπŸŽ‰ All privacy compliance tests passed!") +else: + print(f"\n⚠ {tests_failed} test(s) failed") + +print("="*80 + "\n") + +# Exit with appropriate code +sys.exit(0 if tests_failed == 0 else 1) diff --git a/backend/test_rss_extraction.py b/tests/backend/test_rss_extraction.py similarity index 100% rename from backend/test_rss_extraction.py rename to tests/backend/test_rss_extraction.py diff --git a/tests/backend/test_tracking.py b/tests/backend/test_tracking.py new file mode 100644 index 0000000..9a15764 --- /dev/null +++ b/tests/backend/test_tracking.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python +""" +Test email tracking functionality +Run from backend directory with venv activated: + cd backend + source venv/bin/activate # or venv\Scripts\activate on Windows + python test_tracking.py +""" + +import sys +import os +from datetime import datetime +from pymongo import MongoClient + +# Add backend directory to path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from config import Config +from services.tracking_service import generate_tracking_id, create_newsletter_tracking +from database import newsletter_sends_collection, link_clicks_collection +from app import app + +print("\n" + "="*80) +print("Email Tracking System Tests") +print("="*80) + +# Test counters +tests_passed = 0 +tests_failed = 0 + +def test_result(test_name, passed, message=""): + """Print test result""" + global tests_passed, tests_failed + if passed: + tests_passed += 1 + print(f"βœ“ {test_name}") + if message: + print(f" {message}") + else: + tests_failed += 1 + print(f"❌ {test_name}") + if message: + print(f" {message}") + + +# Test 1: Tracking ID Generation +print("\n" + "-"*80) +print("Test 1: Tracking ID Generation") +print("-"*80) + +try: + tracking_id = generate_tracking_id() + + # Check format (UUID4) + is_valid_uuid = len(tracking_id) == 36 and tracking_id.count('-') == 4 + test_result("Generate tracking ID", is_valid_uuid, f"Generated ID: {tracking_id}") + + # Check uniqueness + tracking_id2 = generate_tracking_id() + is_unique = tracking_id != tracking_id2 + test_result("Tracking IDs are unique", is_unique, f"ID1: {tracking_id[:8]}... ID2: {tracking_id2[:8]}...") + +except Exception as e: + test_result("Generate tracking ID", False, f"Error: {str(e)}") + + +# Test 2: Create Newsletter Tracking +print("\n" + "-"*80) +print("Test 2: Create Newsletter Tracking") +print("-"*80) + +try: + # Clean up test data first + newsletter_sends_collection.delete_many({'newsletter_id': 'test-newsletter-001'}) + link_clicks_collection.delete_many({'newsletter_id': 'test-newsletter-001'}) + + # Create tracking with article links + article_links = [ + {'url': 'https://example.com/article1', 'title': 'Test Article 1'}, + {'url': 'https://example.com/article2', 'title': 'Test Article 2'} + ] + + tracking_data = create_newsletter_tracking( + newsletter_id='test-newsletter-001', + subscriber_email='test@example.com', + article_links=article_links + ) + + # Verify return data structure + has_pixel_id = 'pixel_tracking_id' in tracking_data + test_result("Returns pixel tracking ID", has_pixel_id) + + has_link_map = 'link_tracking_map' in tracking_data + test_result("Returns link tracking map", has_link_map) + + correct_link_count = len(tracking_data.get('link_tracking_map', {})) == 2 + test_result("Creates tracking for all links", correct_link_count, + f"Created {len(tracking_data.get('link_tracking_map', {}))} link tracking records") + + # Verify database records + newsletter_record = newsletter_sends_collection.find_one({ + 'tracking_id': tracking_data['pixel_tracking_id'] + }) + + record_exists = newsletter_record is not None + test_result("Creates newsletter_sends record", record_exists) + + if newsletter_record: + correct_initial_state = ( + newsletter_record['opened'] == False and + newsletter_record['open_count'] == 0 and + newsletter_record['first_opened_at'] is None + ) + test_result("Newsletter record has correct initial state", correct_initial_state) + + # Verify link click records + link_records = list(link_clicks_collection.find({'newsletter_id': 'test-newsletter-001'})) + correct_link_records = len(link_records) == 2 + test_result("Creates link_clicks records", correct_link_records, + f"Created {len(link_records)} link click records") + +except Exception as e: + test_result("Create newsletter tracking", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 3: Tracking Pixel Endpoint +print("\n" + "-"*80) +print("Test 3: Tracking Pixel Endpoint") +print("-"*80) + +try: + with app.test_client() as client: + # Test with valid tracking ID + pixel_tracking_id = tracking_data['pixel_tracking_id'] + response = client.get(f'/api/track/pixel/{pixel_tracking_id}') + + is_png = response.content_type == 'image/png' + test_result("Returns PNG for valid tracking_id", is_png, + f"Content-Type: {response.content_type}") + + is_200 = response.status_code == 200 + test_result("Returns 200 status", is_200, f"Status: {response.status_code}") + + # Verify database was updated + updated_record = newsletter_sends_collection.find_one({ + 'tracking_id': pixel_tracking_id + }) + + was_logged = ( + updated_record and + updated_record['opened'] == True and + updated_record['open_count'] == 1 and + updated_record['first_opened_at'] is not None + ) + test_result("Logs email open event", was_logged, + f"Open count: {updated_record.get('open_count', 0) if updated_record else 0}") + + # Test multiple opens + response2 = client.get(f'/api/track/pixel/{pixel_tracking_id}') + updated_record2 = newsletter_sends_collection.find_one({ + 'tracking_id': pixel_tracking_id + }) + + handles_multiple = ( + updated_record2 and + updated_record2['open_count'] == 2 and + updated_record2['last_opened_at'] != updated_record2['first_opened_at'] + ) + test_result("Handles multiple opens", handles_multiple, + f"Open count: {updated_record2.get('open_count', 0) if updated_record2 else 0}") + + # Test with invalid tracking ID + response3 = client.get('/api/track/pixel/invalid-tracking-id-12345') + + fails_silently = response3.status_code == 200 and response3.content_type == 'image/png' + test_result("Returns PNG for invalid tracking_id (fails silently)", fails_silently) + +except Exception as e: + test_result("Tracking pixel endpoint", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Test 4: Link Redirect Endpoint +print("\n" + "-"*80) +print("Test 4: Link Redirect Endpoint") +print("-"*80) + +try: + with app.test_client() as client: + # Test with valid tracking ID + article_url = 'https://example.com/article1' + link_tracking_id = tracking_data['link_tracking_map'][article_url] + + response = client.get(f'/api/track/click/{link_tracking_id}', follow_redirects=False) + + is_redirect = response.status_code == 302 + test_result("Returns 302 redirect", is_redirect, f"Status: {response.status_code}") + + correct_location = response.location == article_url + test_result("Redirects to correct URL", correct_location, + f"Location: {response.location}") + + # Verify database was updated + click_record = link_clicks_collection.find_one({ + 'tracking_id': link_tracking_id + }) + + was_logged = ( + click_record and + click_record['clicked'] == True and + click_record['clicked_at'] is not None + ) + test_result("Logs click event", was_logged) + + # Test with invalid tracking ID + response2 = client.get('/api/track/click/invalid-tracking-id-12345', follow_redirects=False) + + redirects_on_invalid = response2.status_code == 302 + test_result("Redirects on invalid tracking_id", redirects_on_invalid, + f"Redirects to: {response2.location}") + +except Exception as e: + test_result("Link redirect endpoint", False, f"Error: {str(e)}") + import traceback + traceback.print_exc() + + +# Clean up test data +print("\n" + "-"*80) +print("Cleaning up test data...") +print("-"*80) + +try: + newsletter_sends_collection.delete_many({'newsletter_id': 'test-newsletter-001'}) + link_clicks_collection.delete_many({'newsletter_id': 'test-newsletter-001'}) + print("βœ“ Test data cleaned up") +except Exception as e: + print(f"⚠ Error cleaning up: {str(e)}") + + +# Summary +print("\n" + "="*80) +print("TEST SUMMARY") +print("="*80) +print(f"Total tests: {tests_passed + tests_failed}") +print(f"βœ“ Passed: {tests_passed}") +print(f"❌ Failed: {tests_failed}") + +if tests_failed == 0: + print("\nπŸŽ‰ All tests passed!") +else: + print(f"\n⚠ {tests_failed} test(s) failed") + +print("="*80 + "\n") + +# Exit with appropriate code +sys.exit(0 if tests_failed == 0 else 1) diff --git a/news_crawler/test_crawler.py b/tests/crawler/test_crawler.py similarity index 100% rename from news_crawler/test_crawler.py rename to tests/crawler/test_crawler.py diff --git a/news_crawler/test_ollama.py b/tests/crawler/test_ollama.py similarity index 100% rename from news_crawler/test_ollama.py rename to tests/crawler/test_ollama.py diff --git a/news_crawler/test_rss_feeds.py b/tests/crawler/test_rss_feeds.py similarity index 100% rename from news_crawler/test_rss_feeds.py rename to tests/crawler/test_rss_feeds.py diff --git a/tests/sender/test_newsletter_tracking.py b/tests/sender/test_newsletter_tracking.py new file mode 100644 index 0000000..a29afd5 --- /dev/null +++ b/tests/sender/test_newsletter_tracking.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python +""" +Integration test for newsletter with tracking. +Tests the full flow of generating a newsletter with tracking enabled. +""" + +import sys +from pathlib import Path +from datetime import datetime + +# Add backend directory to path +backend_dir = Path(__file__).parent.parent / 'backend' +sys.path.insert(0, str(backend_dir)) + +# Mock the tracking service to avoid database dependency +class MockTrackingService: + """Mock tracking service for testing""" + + @staticmethod + def create_newsletter_tracking(newsletter_id, subscriber_email, article_links=None): + """Mock create_newsletter_tracking function""" + link_tracking_map = {} + + if article_links: + for i, article in enumerate(article_links): + link_tracking_map[article['url']] = f"mock-link-{i}" + + return { + 'pixel_tracking_id': 'mock-pixel-123', + 'link_tracking_map': link_tracking_map, + 'newsletter_id': newsletter_id, + 'subscriber_email': subscriber_email + } + +# Import after setting up path +from tracking_integration import inject_tracking_pixel, replace_article_links, generate_tracking_urls +from jinja2 import Template + + +def test_newsletter_with_tracking(): + """Test generating a newsletter with tracking enabled""" + print("\n" + "="*70) + print("NEWSLETTER TRACKING INTEGRATION TEST") + print("="*70) + + # Mock article data + articles = [ + { + 'title': 'Munich Tech Summit Announces 2025 Dates', + 'author': 'Tech Reporter', + 'link': 'https://example.com/tech-summit', + 'summary': 'The annual Munich Tech Summit will return in 2025 with exciting new features.', + 'source': 'Munich Tech News', + 'published_at': datetime.now() + }, + { + 'title': 'New Public Transport Routes Launched', + 'author': 'Transport Desk', + 'link': 'https://example.com/transport-routes', + 'summary': 'MVG announces three new bus routes connecting suburban areas.', + 'source': 'Munich Transport', + 'published_at': datetime.now() + } + ] + + # Configuration + newsletter_id = 'test-newsletter-2025-11-11' + subscriber_email = 'test@example.com' + api_url = 'http://localhost:5001' + + print(f"\nNewsletter ID: {newsletter_id}") + print(f"Subscriber: {subscriber_email}") + print(f"Articles: {len(articles)}") + print(f"API URL: {api_url}") + + # Step 1: Generate tracking URLs + print("\n" + "-"*70) + print("Step 1: Generate tracking data") + print("-"*70) + + tracking_data = generate_tracking_urls( + articles=articles, + newsletter_id=newsletter_id, + subscriber_email=subscriber_email, + tracking_service=MockTrackingService + ) + + print(f"βœ“ Pixel tracking ID: {tracking_data['pixel_tracking_id']}") + print(f"βœ“ Link tracking map: {len(tracking_data['link_tracking_map'])} links") + for url, tracking_id in tracking_data['link_tracking_map'].items(): + print(f" - {url} β†’ {tracking_id}") + + # Step 2: Load and render template + print("\n" + "-"*70) + print("Step 2: Render newsletter template") + print("-"*70) + + template_path = Path(__file__).parent / 'newsletter_template.html' + with open(template_path, 'r', encoding='utf-8') as f: + template_content = f.read() + + template = Template(template_content) + + now = datetime.now() + template_data = { + 'date': now.strftime('%A, %B %d, %Y'), + 'year': now.year, + 'article_count': len(articles), + 'articles': articles, + 'unsubscribe_link': 'http://localhost:3000/unsubscribe', + 'website_link': 'http://localhost:3000', + 'tracking_enabled': True + } + + html = template.render(**template_data) + print("βœ“ Template rendered") + + # Step 3: Inject tracking pixel + print("\n" + "-"*70) + print("Step 3: Inject tracking pixel") + print("-"*70) + + html = inject_tracking_pixel( + html, + tracking_data['pixel_tracking_id'], + api_url + ) + + pixel_url = f"{api_url}/api/track/pixel/{tracking_data['pixel_tracking_id']}" + if pixel_url in html: + print(f"βœ“ Tracking pixel injected: {pixel_url}") + else: + print(f"βœ— Tracking pixel NOT found") + return False + + # Step 4: Replace article links + print("\n" + "-"*70) + print("Step 4: Replace article links with tracking URLs") + print("-"*70) + + html = replace_article_links( + html, + tracking_data['link_tracking_map'], + api_url + ) + + # Verify all article links were replaced + success = True + for article in articles: + original_url = article['link'] + tracking_id = tracking_data['link_tracking_map'].get(original_url) + + if tracking_id: + tracking_url = f"{api_url}/api/track/click/{tracking_id}" + if tracking_url in html: + print(f"βœ“ Link replaced: {original_url}") + print(f" β†’ {tracking_url}") + else: + print(f"βœ— Link NOT replaced: {original_url}") + success = False + + # Verify original URL is NOT in the HTML (should be replaced) + if f'href="{original_url}"' in html: + print(f"βœ— Original URL still present: {original_url}") + success = False + + # Step 5: Verify privacy notice + print("\n" + "-"*70) + print("Step 5: Verify privacy notice") + print("-"*70) + + if "This email contains tracking to measure engagement" in html: + print("βœ“ Privacy notice present in footer") + else: + print("βœ— Privacy notice NOT found") + success = False + + # Step 6: Save output for inspection + print("\n" + "-"*70) + print("Step 6: Save test output") + print("-"*70) + + output_file = 'test_newsletter_with_tracking.html' + with open(output_file, 'w', encoding='utf-8') as f: + f.write(html) + + print(f"βœ“ Test newsletter saved to: {output_file}") + print(f" Open it in your browser to inspect the tracking integration") + + return success + + +if __name__ == '__main__': + print("\n" + "="*70) + print("TESTING NEWSLETTER WITH TRACKING") + print("="*70) + + success = test_newsletter_with_tracking() + + print("\n" + "="*70) + if success: + print("βœ“ ALL TESTS PASSED") + print("="*70 + "\n") + sys.exit(0) + else: + print("βœ— SOME TESTS FAILED") + print("="*70 + "\n") + sys.exit(1) diff --git a/tests/sender/test_newsletter_with_tracking.html b/tests/sender/test_newsletter_with_tracking.html new file mode 100644 index 0000000..a36bfe9 --- /dev/null +++ b/tests/sender/test_newsletter_with_tracking.html @@ -0,0 +1,179 @@ + + + + + + + +Munich News Daily + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

+ Munich News Daily +

+

+ Tuesday, November 11, 2025 +

+
+

+ Good morning β˜€οΈ +

+

+ Here's what's happening in Munich today. We've summarized 2 stories using AI so you can stay informed in under 5 minutes. +

+
+
+
+ + + + + +
+ + 1 + +
+ +

+ Munich Tech Summit Announces 2025 Dates +

+ +

+Munich Tech News + β€’ Tech Reporter +

+ +

+ The annual Munich Tech Summit will return in 2025 with exciting new features. +

+ + + Read more β†’ + +
+
+
+ + + + + +
+ + 2 + +
+ +

+ New Public Transport Routes Launched +

+ +

+Munich Transport + β€’ Transport Desk +

+ +

+ MVG announces three new bus routes connecting suburban areas. +

+ + + Read more β†’ + +
+
+
+ + + + +
+

+ Today's Digest +

+

+ 2 +

+

+ stories β€’ AI-summarized β€’ 5 min read +

+
+
+

+ Munich News Daily +

+

+ AI-powered news summaries for busy people.
+ Delivered daily to your inbox. +

+ +

+Visit Website + β€’ +Unsubscribe +

+ +

+ This email contains tracking to measure engagement and improve our content.
+ We respect your privacy and anonymize data after 90 days. +

+

+ Β© 2025 Munich News Daily. All rights reserved. +

+
+ +
+ + + \ No newline at end of file diff --git a/tests/sender/test_tracking_integration.py b/tests/sender/test_tracking_integration.py new file mode 100644 index 0000000..0f552b4 --- /dev/null +++ b/tests/sender/test_tracking_integration.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python +""" +Test script for tracking integration in newsletter sender. +Tests tracking pixel injection and link replacement. +""" + +import sys +from pathlib import Path + +# Add backend directory to path +backend_dir = Path(__file__).parent.parent / 'backend' +sys.path.insert(0, str(backend_dir)) + +from tracking_integration import inject_tracking_pixel, replace_article_links + + +def test_inject_tracking_pixel(): + """Test that tracking pixel is correctly injected into HTML""" + print("\n" + "="*70) + print("TEST 1: Inject Tracking Pixel") + print("="*70) + + # Test HTML + html = """ + +

Newsletter content

+ +""" + + tracking_id = "test-tracking-123" + api_url = "http://localhost:5001" + + # Inject pixel + result = inject_tracking_pixel(html, tracking_id, api_url) + + # Verify pixel is present + expected_pixel = f'' + + if expected_pixel in result: + print("βœ“ Tracking pixel correctly injected") + print(f" Pixel URL: {api_url}/api/track/pixel/{tracking_id}") + return True + else: + print("βœ— Tracking pixel NOT found in HTML") + print(f" Expected: {expected_pixel}") + print(f" Result: {result}") + return False + + +def test_replace_article_links(): + """Test that article links are correctly replaced with tracking URLs""" + print("\n" + "="*70) + print("TEST 2: Replace Article Links") + print("="*70) + + # Test HTML with article links + html = """ + +
Article 1 +Article 2 +Untracked Link + +""" + + # Tracking map + link_tracking_map = { + "https://example.com/article1": "track-id-1", + "https://example.com/article2": "track-id-2" + } + + api_url = "http://localhost:5001" + + # Replace links + result = replace_article_links(html, link_tracking_map, api_url) + + # Verify replacements + success = True + + # Check article 1 link + expected_url_1 = f"{api_url}/api/track/click/track-id-1" + if expected_url_1 in result: + print(f"βœ“ Article 1 link replaced: {expected_url_1}") + else: + print(f"βœ— Article 1 link NOT replaced") + success = False + + # Check article 2 link + expected_url_2 = f"{api_url}/api/track/click/track-id-2" + if expected_url_2 in result: + print(f"βœ“ Article 2 link replaced: {expected_url_2}") + else: + print(f"βœ— Article 2 link NOT replaced") + success = False + + # Check untracked link remains unchanged + if "https://example.com/untracked" in result: + print(f"βœ“ Untracked link preserved: https://example.com/untracked") + else: + print(f"βœ— Untracked link was modified (should remain unchanged)") + success = False + + return success + + +def test_full_integration(): + """Test full integration: pixel + link replacement""" + print("\n" + "="*70) + print("TEST 3: Full Integration (Pixel + Links)") + print("="*70) + + # Test HTML + html = """ + +

Newsletter

+Read Article + +""" + + api_url = "http://localhost:5001" + pixel_tracking_id = "pixel-123" + link_tracking_map = { + "https://example.com/article": "link-456" + } + + # First inject pixel + html = inject_tracking_pixel(html, pixel_tracking_id, api_url) + + # Then replace links + html = replace_article_links(html, link_tracking_map, api_url) + + # Verify both are present + success = True + + pixel_url = f"{api_url}/api/track/pixel/{pixel_tracking_id}" + if pixel_url in html: + print(f"βœ“ Tracking pixel present: {pixel_url}") + else: + print(f"βœ— Tracking pixel NOT found") + success = False + + link_url = f"{api_url}/api/track/click/link-456" + if link_url in html: + print(f"βœ“ Tracking link present: {link_url}") + else: + print(f"βœ— Tracking link NOT found") + success = False + + if success: + print("\nβœ“ Full integration successful!") + print("\nFinal HTML:") + print("-" * 70) + print(html) + print("-" * 70) + + return success + + +if __name__ == '__main__': + print("\n" + "="*70) + print("TRACKING INTEGRATION TEST SUITE") + print("="*70) + + results = [] + + # Run tests + results.append(("Inject Tracking Pixel", test_inject_tracking_pixel())) + results.append(("Replace Article Links", test_replace_article_links())) + results.append(("Full Integration", test_full_integration())) + + # Summary + print("\n" + "="*70) + print("TEST SUMMARY") + print("="*70) + + passed = sum(1 for _, result in results if result) + total = len(results) + + for test_name, result in results: + status = "βœ“ PASS" if result else "βœ— FAIL" + print(f"{status}: {test_name}") + + print("-" * 70) + print(f"Results: {passed}/{total} tests passed") + print("="*70 + "\n") + + # Exit with appropriate code + sys.exit(0 if passed == total else 1) diff --git a/test_feeds_quick.py b/tests/test_feeds_quick.py similarity index 100% rename from test_feeds_quick.py rename to tests/test_feeds_quick.py