Compare commits: master...7346ee9de2 (5 commits)

Commits: 7346ee9de2, 6e9fbe44c4, 4e8b60f77c, 50b9888004, 6c8d6d0940
Jenkinsfile (vendored, new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
pipeline {
|
||||
agent any
|
||||
|
||||
stages {
|
||||
stage('Security Scan') {
|
||||
steps {
|
||||
withCredentials([string(credentialsId: 'nvd-api-key', variable: 'NVD_API_KEY')]) {
|
||||
// Run OWASP Dependency Check using the specific installation configured in Jenkins
|
||||
// Using NVD API Key to avoid rate limiting
|
||||
dependencyCheck additionalArguments: "--scan ./ --format ALL --nvdApiKey ${NVD_API_KEY}", odcInstallation: 'depcheck'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
post {
|
||||
always {
|
||||
// Publish the results
|
||||
dependencyCheckPublisher pattern: 'dependency-check-report.xml'
|
||||
|
||||
// Archive the reports
|
||||
archiveArtifacts allowEmptyArchive: true, artifacts: 'dependency-check-report.html'
|
||||
}
|
||||
}
|
||||
}
|
||||
QUICKSTART.md (143 lines changed)
@@ -1,56 +1,36 @@
|
||||
# Quick Start Guide
|
||||
# ⚡ Quick Start Guide
|
||||
|
||||
Get Munich News Daily running in 5 minutes!
|
||||
|
||||
## Prerequisites
|
||||
## 📋 Prerequisites
|
||||
- **Docker** & **Docker Compose** installed
|
||||
- **4GB+ RAM** (for AI models)
|
||||
- *(Optional)* NVIDIA GPU for faster processing
|
||||
|
||||
- Docker & Docker Compose installed
|
||||
- 4GB+ RAM (for Ollama AI models)
|
||||
- (Optional) NVIDIA GPU for 5-10x faster AI processing
|
||||
|
||||
## Setup
|
||||
## 🚀 Setup Steps
|
||||
|
||||
### 1. Configure Environment
|
||||
|
||||
```bash
|
||||
# Copy example environment file
|
||||
cp backend/.env.example backend/.env
|
||||
|
||||
# Edit with your settings (required: email configuration)
|
||||
nano backend/.env
|
||||
```
|
||||
**Required:** Update `SMTP_SERVER`, `EMAIL_USER`, and `EMAIL_PASSWORD`.
|
||||
|
||||
**Minimum required settings:**
|
||||
```env
|
||||
SMTP_SERVER=smtp.gmail.com
|
||||
SMTP_PORT=587
|
||||
EMAIL_USER=your-email@gmail.com
|
||||
EMAIL_PASSWORD=your-app-password
|
||||
```
|
||||
|
||||
### 2. Start System
|
||||
|
||||
### 2. Start the System
|
||||
```bash
|
||||
# Option 1: Auto-detect GPU and start (recommended)
|
||||
# Auto-detects GPU capabilities and starts services
|
||||
./start-with-gpu.sh
|
||||
|
||||
# Option 2: Start without GPU
|
||||
docker-compose up -d
|
||||
|
||||
# View logs
|
||||
docker-compose logs -f
|
||||
|
||||
# Wait for Ollama model download (first time only, ~2-5 minutes)
|
||||
# Watch installation progress (first time model download ~2GB)
|
||||
docker-compose logs -f ollama-setup
|
||||
```
|
||||
|
||||
**Note:** First startup downloads the phi3:latest AI model (2.2GB). This happens automatically.
|
||||
|
||||
### 3. Add RSS Feeds
|
||||
|
||||
### 3. Add News Sources
|
||||
```bash
|
||||
mongosh munich_news
|
||||
# Connect to database
|
||||
docker-compose exec mongodb mongosh munich_news
|
||||
|
||||
# Paste this into the mongo shell:
|
||||
db.rss_feeds.insertMany([
|
||||
{
|
||||
name: "Süddeutsche Zeitung München",
|
||||
@@ -65,11 +45,9 @@ db.rss_feeds.insertMany([
|
||||
])
|
||||
```
|
||||
|
||||
### 4. Add Subscribers
|
||||
|
||||
### 4. Add Yourself as Subscriber
|
||||
```bash
|
||||
mongosh munich_news
|
||||
|
||||
# Still in mongo shell:
|
||||
db.subscribers.insertOne({
|
||||
email: "your-email@example.com",
|
||||
active: true,
|
||||
@@ -78,90 +56,35 @@ db.subscribers.insertOne({
|
||||
})
|
||||
```
|
||||
|
||||
### 5. Test It
|
||||
|
||||
### 5. Verify Installation
|
||||
```bash
|
||||
# Test crawler
|
||||
# 1. Run the crawler manually to fetch news
|
||||
docker-compose exec crawler python crawler_service.py 5
|
||||
|
||||
# Test newsletter
|
||||
# 2. Send a test email to yourself
|
||||
docker-compose exec sender python sender_service.py test your-email@example.com
|
||||
```
|
||||
|
||||
## What Happens Next?
|
||||
## 🎮 Dashboard Access
|
||||
|
||||
The system will automatically:
|
||||
- **Backend API**: Runs continuously at http://localhost:5001 for tracking and analytics
|
||||
- **6:00 AM Berlin time**: Crawl news articles
|
||||
- **7:00 AM Berlin time**: Send newsletter to subscribers
|
||||
Once running, access the services:
|
||||
- **Dashboard**: [http://localhost:3000](http://localhost:3000)
|
||||
- **API**: [http://localhost:5001](http://localhost:5001)
|
||||
|
||||
## View Results
|
||||
## ⏭️ What's Next?
|
||||
|
||||
The system is now fully automated:
|
||||
1. **6:00 AM**: Crawls news and generates AI summaries.
|
||||
2. **7:00 AM**: Sends the daily newsletter.
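
Under the hood, each service uses the `schedule` library. Conceptually, it works like the simplified sketch below (not the actual service code; the job functions are stand-ins):

```python
import time
import schedule

def run_crawler():   # stand-in for the crawler job (news crawl + AI summaries)
    print("crawling news ...")

def run_sender():    # stand-in for the newsletter job
    print("sending newsletter ...")

schedule.every().day.at("06:00").do(run_crawler)
schedule.every().day.at("07:00").do(run_sender)

while True:
    schedule.run_pending()
    time.sleep(60)   # the real services run with Europe/Berlin as local time
```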
|
||||
|
||||
### Useful Commands
|
||||
```bash
|
||||
# Check articles
|
||||
mongosh munich_news
|
||||
db.articles.find().sort({ crawled_at: -1 }).limit(5)
|
||||
|
||||
# Check logs
|
||||
docker-compose logs -f crawler
|
||||
docker-compose logs -f sender
|
||||
```
|
||||
|
||||
## Common Commands
|
||||
|
||||
```bash
|
||||
# Stop system
|
||||
# Stop everything
|
||||
docker-compose down
|
||||
|
||||
# Restart system
|
||||
docker-compose restart
|
||||
# View logs for a service
|
||||
docker-compose logs -f crawler
|
||||
|
||||
# View logs
|
||||
docker-compose logs -f
|
||||
|
||||
# Rebuild after changes
|
||||
# Update code & rebuild
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
## New Features
|
||||
|
||||
### GPU Acceleration (5-10x Faster)
|
||||
Enable GPU support for faster AI processing:
|
||||
```bash
|
||||
./check-gpu.sh # Check if GPU is available
|
||||
./start-with-gpu.sh # Start with GPU support
|
||||
```
|
||||
See [docs/GPU_SETUP.md](docs/GPU_SETUP.md) for details.
|
||||
|
||||
### Send Newsletter to All Subscribers
|
||||
```bash
|
||||
# Send newsletter to all active subscribers
|
||||
curl -X POST http://localhost:5001/api/admin/send-newsletter \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"max_articles": 10}'
|
||||
```
|
||||
|
||||
### Security Features
|
||||
- ✅ Only Backend API exposed (port 5001)
|
||||
- ✅ MongoDB internal-only (secure)
|
||||
- ✅ Ollama internal-only (secure)
|
||||
- ✅ All services communicate via internal Docker network
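
To double-check this from the host, here is a small Python sketch (a hypothetical helper, not part of the repository; it assumes the default ports listed above and a local Docker setup):

```python
# Port probe sketch: with the default compose file, only the Backend API (5001)
# should answer from the host; MongoDB (27017) and Ollama (11434) should not.
import socket

PORTS = {5001: "Backend API", 27017: "MongoDB", 11434: "Ollama"}

for port, name in PORTS.items():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        reachable = sock.connect_ex(("127.0.0.1", port)) == 0
    print(f"{name} ({port}): {'reachable' if reachable else 'blocked'}")
```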
|
||||
|
||||
## Need Help?
|
||||
|
||||
- **Documentation Index**: [docs/INDEX.md](docs/INDEX.md)
|
||||
- **GPU Setup**: [docs/GPU_SETUP.md](docs/GPU_SETUP.md)
|
||||
- **API Reference**: [docs/ADMIN_API.md](docs/ADMIN_API.md)
|
||||
- **Security Guide**: [docs/SECURITY_NOTES.md](docs/SECURITY_NOTES.md)
|
||||
- **Full Documentation**: [README.md](README.md)
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. ✅ **Enable GPU acceleration** - [docs/GPU_SETUP.md](docs/GPU_SETUP.md)
|
||||
2. Set up tracking API (optional)
|
||||
3. Customize newsletter template
|
||||
4. Add more RSS feeds
|
||||
5. Monitor engagement metrics
|
||||
6. Review security settings - [docs/SECURITY_NOTES.md](docs/SECURITY_NOTES.md)
|
||||
|
||||
That's it! Your automated news system is running. 🎉
|
||||
|
||||
README.md (527 lines changed)
@@ -1,460 +1,193 @@
|
||||
# Munich News Daily - Automated Newsletter System
|
||||
|
||||
A fully automated news aggregation and newsletter system that crawls Munich news sources, generates AI summaries, and sends daily newsletters with engagement tracking.
|
||||
A fully automated news aggregation system that crawls Munich news sources, generates AI-powered summaries, tracks local transport disruptions, and delivers personalized daily newsletters.
|
||||
|
||||

|
||||
|
||||
## ✨ Key Features
|
||||
|
||||
- **🤖 AI-Powered Clustering** - Automatically detects duplicate stories from different sources
|
||||
- **📰 Neutral Summaries** - Combines multiple perspectives into balanced coverage
|
||||
- **🎯 Smart Prioritization** - Shows most important stories first (multi-source coverage)
|
||||
- **🎨 Personalized Newsletters** - AI-powered content recommendations based on user interests
|
||||
- **📊 Engagement Tracking** - Open rates, click tracking, and analytics
|
||||
- **⚡ GPU Acceleration** - 5-10x faster AI processing with GPU support
|
||||
- **🔒 GDPR Compliant** - Privacy-first with data retention controls
|
||||
|
||||
**🚀 NEW:** GPU acceleration support for 5-10x faster AI processing! See [docs/GPU_SETUP.md](docs/GPU_SETUP.md)
|
||||
- **🤖 AI-Powered Clustering** - Smartly detects duplicate stories and groups related articles using ChromaDB vector search.
|
||||
- **📝 Neutral Summaries** - Generates balanced, multi-perspective summaries using local LLMs (Ollama).
|
||||
- **🚇 Transport Updates** - Real-time tracking of Munich public transport (MVG) disruptions.
|
||||
- **🎯 Smart Prioritization** - Ranks stories based on relevance and user preferences.
|
||||
- **🎨 Personalized Newsletters** - Tailors newsletter content to each subscriber's interests.
|
||||
- **📊 Engagement Analytics** - Detailed tracking of open rates, click-throughs, and user interests.
|
||||
- **⚡ GPU Acceleration** - Integrated support for NVIDIA GPUs for faster AI processing.
|
||||
- **🔒 Privacy First** - GDPR-compliant with automatic data retention policies and anonymization.
|
||||
|
||||
## 🚀 Quick Start
|
||||
|
||||
For a detailed 5-minute setup guide, see [QUICKSTART.md](QUICKSTART.md).
|
||||
|
||||
```bash
|
||||
# 1. Configure environment
|
||||
cp backend/.env.example backend/.env
|
||||
# Edit backend/.env with your email settings
|
||||
|
||||
# 2. Start everything
|
||||
docker-compose up -d
|
||||
# 2. Start everything (Auto-detects GPU)
|
||||
./start-with-gpu.sh
|
||||
|
||||
# 3. View logs
|
||||
docker-compose logs -f
|
||||
# Questions?
|
||||
# See logs: docker-compose logs -f
|
||||
```
|
||||
|
||||
That's it! The system will automatically:
|
||||
- **Frontend**: Web interface and admin dashboard (http://localhost:3000)
|
||||
- **Backend API**: Runs continuously for tracking and analytics (http://localhost:5001)
|
||||
- **6:00 AM Berlin time**: Crawl news articles and generate summaries
|
||||
- **7:00 AM Berlin time**: Send newsletter to all subscribers
|
||||
The system will automatically:
|
||||
1. **6:00 AM**: Crawl news & transport updates.
|
||||
2. **6:30 AM**: Generate AI summaries & clusters.
|
||||
3. **7:00 AM**: Send personalized newsletters.
|
||||
|
||||
### Access Points
|
||||
## 📋 System Architecture
|
||||
|
||||
- **Newsletter Page**: http://localhost:3000
|
||||
- **Admin Dashboard**: http://localhost:3000/admin.html
|
||||
- **Backend API**: http://localhost:5001
|
||||
The system is built as a set of microservices orchestrated by Docker Compose.
|
||||
|
||||
📖 **New to the project?** See [QUICKSTART.md](QUICKSTART.md) for a detailed 5-minute setup guide.
|
||||
```mermaid
|
||||
graph TD
|
||||
User[Subscribers] -->|Email| Sender[Newsletter Sender]
|
||||
User -->|Web| Frontend[React Frontend]
|
||||
Frontend -->|API| Backend[Backend API]
|
||||
|
||||
🚀 **GPU Acceleration:** Enable 5-10x faster AI processing with [GPU Setup Guide](docs/GPU_SETUP.md)
|
||||
subgraph "Core Services"
|
||||
Crawler[News Crawler]
|
||||
Transport[Transport Crawler]
|
||||
Sender
|
||||
Backend
|
||||
end
|
||||
|
||||
## 📋 System Overview
|
||||
subgraph "Data & AI"
|
||||
Mongo[(MongoDB)]
|
||||
Redis[(Redis)]
|
||||
Chroma[(ChromaDB)]
|
||||
Ollama[Ollama AI]
|
||||
end
|
||||
|
||||
```
|
||||
6:00 AM → News Crawler
|
||||
↓
|
||||
Fetches articles from RSS feeds
|
||||
Extracts full content
|
||||
Generates AI summaries
|
||||
Saves to MongoDB
|
||||
↓
|
||||
7:00 AM → Newsletter Sender
|
||||
↓
|
||||
Waits for crawler to finish
|
||||
Fetches today's articles
|
||||
Generates newsletter with tracking
|
||||
Sends to all subscribers
|
||||
↓
|
||||
✅ Done! Repeat tomorrow
|
||||
Crawler -->|Save| Mongo
|
||||
Crawler -->|Embeddings| Chroma
|
||||
Crawler -->|Summarize| Ollama
|
||||
|
||||
Transport -->|Save| Mongo
|
||||
|
||||
Sender -->|Read| Mongo
|
||||
Sender -->|Track| Backend
|
||||
|
||||
Backend -->|Read/Write| Mongo
|
||||
Backend -->|Cache| Redis
|
||||
```
|
||||
|
||||
## 🏗️ Architecture
|
||||
### Core Components
|
||||
|
||||
### Components
|
||||
| Service | Description | Port |
|
||||
|---------|-------------|------|
|
||||
| **Frontend** | React-based user dashboard and admin interface. | 3000 |
|
||||
| **Backend API** | Flask API for tracking, analytics, and management. | 5001 |
|
||||
| **News Crawler** | Fetches RSS feeds, extracts content, and runs AI clustering. | - |
|
||||
| **Transport Crawler** | Monitors MVG (Munich Transport) for delays and disruptions. | - |
|
||||
| **Newsletter Sender** | Manages subscribers, generates templates, and sends emails. | - |
|
||||
| **Ollama** | Local LLM runner for on-premise AI (Phi-3, Llama3, etc.). | - |
|
||||
| **ChromaDB** | Vector database for semantic search and article clustering. | - |
|
||||
|
||||
- **Ollama**: AI service for summarization and translation (internal only, GPU-accelerated)
|
||||
- **MongoDB**: Data storage (articles, subscribers, tracking) (internal only)
|
||||
- **Backend API**: Flask API for tracking and analytics (port 5001 - only exposed service)
|
||||
- **News Crawler**: Automated RSS feed crawler with AI summarization (internal only)
|
||||
- **Newsletter Sender**: Automated email sender with tracking (internal only)
|
||||
- **Frontend**: React dashboard (optional)
|
||||
## 📂 Project Structure
|
||||
|
||||
### Technology Stack
|
||||
```text
|
||||
munich-news/
|
||||
├── backend/ # Flask API for tracking & analytics
|
||||
├── frontend/ # React dashboard & admin UI
|
||||
├── news_crawler/ # RSS fetcher & AI summarizer service
|
||||
├── news_sender/ # Email generation & dispatch service
|
||||
├── transport_crawler/ # MVG transport disruption monitor
|
||||
├── docker-compose.yml # Main service orchestration
|
||||
└── docs/ # Detailed documentation
|
||||
```
|
||||
|
||||
- Python 3.11
|
||||
- MongoDB 7.0
|
||||
- Ollama (phi3:latest model for AI)
|
||||
- Docker & Docker Compose
|
||||
- Flask (API)
|
||||
- Schedule (automation)
|
||||
- Jinja2 (email templates)
|
||||
## 🛠️ Installation & Setup
|
||||
|
||||
## 📦 Installation
|
||||
1. **Clone the repository**
|
||||
```bash
|
||||
git clone https://github.com/yourusername/munich-news.git
|
||||
cd munich-news
|
||||
```
|
||||
|
||||
### Prerequisites
|
||||
2. **Environment Configuration**
|
||||
```bash
|
||||
cp backend/.env.example backend/.env
|
||||
nano backend/.env
|
||||
```
|
||||
*Critical settings:* `SMTP_SERVER`, `EMAIL_USER`, `EMAIL_PASSWORD`.
|
||||
|
||||
- Docker & Docker Compose
|
||||
- 4GB+ RAM (for Ollama AI models)
|
||||
- (Optional) NVIDIA GPU for 5-10x faster AI processing
|
||||
3. **Start the System**
|
||||
```bash
|
||||
# Recommended: Helper script (handles GPU & Model setup)
|
||||
./start-with-gpu.sh
|
||||
|
||||
### Setup
|
||||
# Alternative: Standard Docker Compose
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
1. **Clone the repository**
|
||||
```bash
|
||||
git clone <repository-url>
|
||||
cd munich-news
|
||||
```
|
||||
|
||||
2. **Configure environment**
|
||||
```bash
|
||||
cp backend/.env.example backend/.env
|
||||
# Edit backend/.env with your settings
|
||||
```
|
||||
|
||||
3. **Configure Ollama (AI features)**
|
||||
```bash
|
||||
# Option 1: Use integrated Docker Compose Ollama (recommended)
|
||||
./configure-ollama.sh
|
||||
# Select option 1
|
||||
|
||||
# Option 2: Use external Ollama server
|
||||
# Install from https://ollama.ai/download
|
||||
# Then run: ollama pull phi3:latest
|
||||
```
|
||||
|
||||
4. **Start the system**
|
||||
```bash
|
||||
# Auto-detect GPU and start (recommended)
|
||||
./start-with-gpu.sh
|
||||
|
||||
# Or start manually
|
||||
docker-compose up -d
|
||||
|
||||
# First time: Wait for Ollama model download (2-5 minutes)
|
||||
docker-compose logs -f ollama-setup
|
||||
```
|
||||
|
||||
📖 **For detailed Ollama setup & GPU acceleration:** See [docs/OLLAMA_SETUP.md](docs/OLLAMA_SETUP.md)
|
||||
|
||||
💡 **To change AI model:** Edit `OLLAMA_MODEL` in `.env`, then run `./pull-ollama-model.sh`. See [docs/CHANGING_AI_MODEL.md](docs/CHANGING_AI_MODEL.md)
|
||||
4. **Initial Setup (First Run)**
|
||||
* The system needs to download the AI model (approx. 2GB).
|
||||
* Watch progress: `docker-compose logs -f ollama-setup`
|
||||
|
||||
## ⚙️ Configuration
|
||||
|
||||
Edit `backend/.env`:
|
||||
Key configuration options in `backend/.env`:
|
||||
|
||||
```env
|
||||
# MongoDB
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
| Category | Variable | Description |
|
||||
|----------|----------|-------------|
|
||||
| **Email** | `SMTP_SERVER` | SMTP Server (e.g., smtp.gmail.com) |
|
||||
| | `EMAIL_USER` | Your sending email address |
|
||||
| **AI** | `OLLAMA_MODEL` | Model to use (default: phi3:latest) |
|
||||
| **Schedule** | `CRAWLER_TIME` | Time to start crawling (e.g., "06:00") |
|
||||
| | `SENDER_TIME` | Time to send emails (e.g., "07:00") |
|
||||
|
||||
# Email (SMTP)
|
||||
SMTP_SERVER=smtp.gmail.com
|
||||
SMTP_PORT=587
|
||||
EMAIL_USER=your-email@gmail.com
|
||||
EMAIL_PASSWORD=your-app-password
|
||||
## 📊 Usage & Monitoring
|
||||
|
||||
# Newsletter
|
||||
NEWSLETTER_MAX_ARTICLES=10
|
||||
NEWSLETTER_HOURS_LOOKBACK=24
|
||||
### Access Points
|
||||
* **Web Dashboard**: [http://localhost:3000](http://localhost:3000) (or configured domain)
|
||||
* **API**: [http://localhost:5001](http://localhost:5001)
|
||||
|
||||
# Tracking
|
||||
TRACKING_ENABLED=true
|
||||
TRACKING_API_URL=http://localhost:5001
|
||||
TRACKING_DATA_RETENTION_DAYS=90
|
||||
### Useful Commands
|
||||
|
||||
# Ollama (AI Summarization)
|
||||
OLLAMA_ENABLED=true
|
||||
OLLAMA_BASE_URL=http://127.0.0.1:11434
|
||||
OLLAMA_MODEL=phi3:latest
|
||||
**View Logs**
|
||||
```bash
|
||||
docker-compose logs -f [service_name]
|
||||
# e.g., docker-compose logs -f crawler
|
||||
```
|
||||
|
||||
## 📊 Usage
|
||||
|
||||
### View Logs
|
||||
|
||||
**Manual Trigger**
|
||||
```bash
|
||||
# All services
|
||||
docker-compose logs -f
|
||||
|
||||
# Specific service
|
||||
docker-compose logs -f crawler
|
||||
docker-compose logs -f sender
|
||||
docker-compose logs -f mongodb
|
||||
```
|
||||
|
||||
### Manual Operations
|
||||
|
||||
```bash
|
||||
# Run crawler manually
|
||||
# Run News Crawler immediately
|
||||
docker-compose exec crawler python crawler_service.py 10
|
||||
|
||||
# Send test newsletter
|
||||
docker-compose exec sender python sender_service.py test your-email@example.com
|
||||
# Run Transport Crawler immediately
|
||||
docker-compose exec transport-crawler python transport_service.py
|
||||
|
||||
# Preview newsletter
|
||||
docker-compose exec sender python sender_service.py preview
|
||||
# Send Test Newsletter
|
||||
docker-compose exec sender python sender_service.py test user@example.com
|
||||
```
|
||||
|
||||
### Database Access
|
||||
|
||||
**Database Access**
|
||||
```bash
|
||||
# Connect to MongoDB
|
||||
docker-compose exec mongodb mongosh munich_news
|
||||
|
||||
# View articles
|
||||
db.articles.find().sort({ crawled_at: -1 }).limit(5).pretty()
|
||||
|
||||
# View subscribers
|
||||
db.subscribers.find({ active: true }).pretty()
|
||||
|
||||
# View tracking data
|
||||
db.newsletter_sends.find().sort({ created_at: -1 }).limit(10).pretty()
|
||||
```
|
||||
|
||||
## 🔧 Management
|
||||
## 🌐 Production Deployment (Traefik)
|
||||
|
||||
### Add RSS Feeds
|
||||
This project is configured to work with **Traefik** as a reverse proxy.
|
||||
The `docker-compose.yml` includes labels for:
|
||||
- `news.dongho.kim` (Frontend)
|
||||
- `news-api.dongho.kim` (Backend)
|
||||
|
||||
```bash
|
||||
mongosh munich_news
|
||||
|
||||
db.rss_feeds.insertOne({
|
||||
name: "Source Name",
|
||||
url: "https://example.com/rss",
|
||||
active: true
|
||||
})
|
||||
To use this locally, add these to your `/etc/hosts`:
|
||||
```text
|
||||
127.0.0.1 news.dongho.kim news-api.dongho.kim
|
||||
```
|
||||
|
||||
### Add Subscribers
|
||||
|
||||
```bash
|
||||
mongosh munich_news
|
||||
|
||||
db.subscribers.insertOne({
|
||||
email: "user@example.com",
|
||||
active: true,
|
||||
tracking_enabled: true,
|
||||
subscribed_at: new Date()
|
||||
})
|
||||
```
|
||||
|
||||
### View Analytics
|
||||
|
||||
```bash
|
||||
# Newsletter metrics
|
||||
curl http://localhost:5001/api/analytics/newsletter/2024-01-15
|
||||
|
||||
# Article performance
|
||||
curl http://localhost:5001/api/analytics/article/https://example.com/article
|
||||
|
||||
# Subscriber activity
|
||||
curl http://localhost:5001/api/analytics/subscriber/user@example.com
|
||||
```
|
||||
|
||||
## ⏰ Schedule Configuration
|
||||
|
||||
### Change Crawler Time (default: 6:00 AM)
|
||||
|
||||
Edit `news_crawler/scheduled_crawler.py`:
|
||||
```python
|
||||
schedule.every().day.at("06:00").do(run_crawler) # Change time
|
||||
```
|
||||
|
||||
### Change Sender Time (default: 7:00 AM)
|
||||
|
||||
Edit `news_sender/scheduled_sender.py`:
|
||||
```python
|
||||
schedule.every().day.at("07:00").do(run_sender) # Change time
|
||||
```
|
||||
|
||||
After changes:
|
||||
```bash
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
## 📈 Monitoring
|
||||
|
||||
### Container Status
|
||||
|
||||
```bash
|
||||
docker-compose ps
|
||||
```
|
||||
|
||||
### Check Next Scheduled Runs
|
||||
|
||||
```bash
|
||||
# Crawler
|
||||
docker-compose logs crawler | grep "Next scheduled run"
|
||||
|
||||
# Sender
|
||||
docker-compose logs sender | grep "Next scheduled run"
|
||||
```
|
||||
|
||||
### Engagement Metrics
|
||||
|
||||
```bash
|
||||
mongosh munich_news
|
||||
|
||||
// Open rate
|
||||
var sent = db.newsletter_sends.countDocuments({ newsletter_id: "2024-01-15" })
|
||||
var opened = db.newsletter_sends.countDocuments({ newsletter_id: "2024-01-15", opened: true })
|
||||
print("Open Rate: " + ((opened / sent) * 100).toFixed(2) + "%")
|
||||
|
||||
// Click rate
|
||||
var clicks = db.link_clicks.countDocuments({ newsletter_id: "2024-01-15" })
|
||||
print("Click Rate: " + ((clicks / sent) * 100).toFixed(2) + "%")
|
||||
```
|
||||
|
||||
## 🐛 Troubleshooting
|
||||
|
||||
### Crawler Not Finding Articles
|
||||
|
||||
```bash
|
||||
# Check RSS feeds
|
||||
mongosh munich_news --eval "db.rss_feeds.find({ active: true })"
|
||||
|
||||
# Test manually
|
||||
docker-compose exec crawler python crawler_service.py 5
|
||||
```
|
||||
|
||||
### Newsletter Not Sending
|
||||
|
||||
```bash
|
||||
# Check email config
|
||||
docker-compose exec sender python -c "from sender_service import Config; print(Config.SMTP_SERVER)"
|
||||
|
||||
# Test email
|
||||
docker-compose exec sender python sender_service.py test your-email@example.com
|
||||
```
|
||||
|
||||
### Containers Not Starting
|
||||
|
||||
```bash
|
||||
# Check logs
|
||||
docker-compose logs
|
||||
|
||||
# Rebuild
|
||||
docker-compose up -d --build
|
||||
|
||||
# Reset everything
|
||||
docker-compose down -v
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
## 🔐 Privacy & Compliance
|
||||
|
||||
### GDPR Features
|
||||
|
||||
- **Data Retention**: Automatic anonymization after 90 days
|
||||
- **Opt-Out**: Subscribers can disable tracking
|
||||
- **Data Deletion**: Full data removal on request
|
||||
- **Transparency**: Privacy notice in all emails
|
||||
|
||||
### Privacy Endpoints
|
||||
|
||||
```bash
|
||||
# Delete subscriber data
|
||||
curl -X DELETE http://localhost:5001/api/tracking/subscriber/user@example.com
|
||||
|
||||
# Anonymize old data
|
||||
curl -X POST http://localhost:5001/api/tracking/anonymize
|
||||
|
||||
# Opt out of tracking
|
||||
curl -X POST http://localhost:5001/api/tracking/subscriber/user@example.com/opt-out
|
||||
```
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
### Getting Started
|
||||
- **[QUICKSTART.md](QUICKSTART.md)** - 5-minute setup guide
|
||||
- **[CONTRIBUTING.md](CONTRIBUTING.md)** - Contribution guidelines
|
||||
|
||||
### Core Features
|
||||
- **[docs/AI_NEWS_AGGREGATION.md](docs/AI_NEWS_AGGREGATION.md)** - AI-powered clustering & neutral summaries
|
||||
- **[docs/PERSONALIZATION.md](docs/PERSONALIZATION.md)** - Personalized newsletter system
|
||||
- **[docs/PERSONALIZATION_COMPLETE.md](docs/PERSONALIZATION_COMPLETE.md)** - Personalization implementation guide
|
||||
- **[docs/FEATURES.md](docs/FEATURES.md)** - Complete feature list
|
||||
- **[docs/API.md](docs/API.md)** - API endpoints reference
|
||||
|
||||
### Technical Documentation
|
||||
- **[docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)** - System architecture
|
||||
- **[docs/SETUP.md](docs/SETUP.md)** - Detailed setup guide
|
||||
- **[docs/OLLAMA_SETUP.md](docs/OLLAMA_SETUP.md)** - AI/Ollama configuration
|
||||
- **[docs/GPU_SETUP.md](docs/GPU_SETUP.md)** - GPU acceleration setup
|
||||
- **[docs/DEPLOYMENT.md](docs/DEPLOYMENT.md)** - Production deployment
|
||||
- **[docs/SECURITY.md](docs/SECURITY.md)** - Security best practices
|
||||
- **[docs/REFERENCE.md](docs/REFERENCE.md)** - Complete reference
|
||||
- **[docs/DEPLOYMENT.md](docs/DEPLOYMENT.md)** - Deployment guide
|
||||
- **[docs/API.md](docs/API.md)** - API reference
|
||||
- **[docs/DATABASE_SCHEMA.md](docs/DATABASE_SCHEMA.md)** - Database structure
|
||||
- **[docs/BACKEND_STRUCTURE.md](docs/BACKEND_STRUCTURE.md)** - Backend organization
|
||||
|
||||
### Component Documentation
|
||||
- **[docs/CRAWLER_HOW_IT_WORKS.md](docs/CRAWLER_HOW_IT_WORKS.md)** - Crawler internals
|
||||
- **[docs/EXTRACTION_STRATEGIES.md](docs/EXTRACTION_STRATEGIES.md)** - Content extraction
|
||||
- **[docs/RSS_URL_EXTRACTION.md](docs/RSS_URL_EXTRACTION.md)** - RSS parsing
|
||||
|
||||
## 🧪 Testing
|
||||
|
||||
All test files are organized in the `tests/` directory:
|
||||
|
||||
```bash
|
||||
# Run crawler tests
|
||||
docker-compose exec crawler python tests/crawler/test_crawler.py
|
||||
|
||||
# Run sender tests
|
||||
docker-compose exec sender python tests/sender/test_tracking_integration.py
|
||||
|
||||
# Run backend tests
|
||||
docker-compose exec backend python tests/backend/test_tracking.py
|
||||
|
||||
# Test personalization system (all 4 phases)
|
||||
docker exec munich-news-local-backend python test_personalization_system.py
|
||||
```
|
||||
|
||||
## 🚀 Production Deployment
|
||||
|
||||
### Environment Setup
|
||||
|
||||
1. Update `backend/.env` with production values
|
||||
2. Set strong MongoDB password
|
||||
3. Use HTTPS for tracking URLs
|
||||
4. Configure proper SMTP server
|
||||
|
||||
### Security
|
||||
|
||||
```bash
|
||||
# Use production compose file
|
||||
docker-compose -f docker-compose.prod.yml up -d
|
||||
|
||||
# Set MongoDB password
|
||||
export MONGO_PASSWORD=your-secure-password
|
||||
```
|
||||
|
||||
### Monitoring
|
||||
|
||||
- Set up log rotation
|
||||
- Configure health checks
|
||||
- Set up alerts for failures
|
||||
- Monitor database size
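
As a starting point for the last two items, here is a small sketch (a hypothetical helper script, not in the repo; the MongoDB hostname and installed packages are assumptions) that polls the health endpoint and reports database size:

```python
# Monitoring sketch: check the backend /health endpoint and MongoDB storage use.
# Assumes it runs where both services are reachable (e.g. inside the backend
# container) and that `requests` and `pymongo` are installed.
import requests
from pymongo import MongoClient

resp = requests.get("http://localhost:5001/health", timeout=5)
print("API health:", resp.status_code)

client = MongoClient("mongodb://mongodb:27017/")  # hostname assumed from docker-compose
stats = client["munich_news"].command("dbStats")
print("Data size (MB):", round(stats["dataSize"] / 1024 / 1024, 2))
```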
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
Complete documentation available in the [docs/](docs/) directory:
|
||||
|
||||
- **[Documentation Index](docs/INDEX.md)** - Complete documentation guide
|
||||
- **[GPU Setup](docs/GPU_SETUP.md)** - 5-10x faster with GPU acceleration
|
||||
- **[Admin API](docs/ADMIN_API.md)** - API endpoints reference
|
||||
- **[Security Guide](docs/SECURITY_NOTES.md)** - Security best practices
|
||||
- **[System Architecture](docs/SYSTEM_ARCHITECTURE.md)** - Technical overview
|
||||
|
||||
## 📝 License
|
||||
|
||||
[Your License Here]
|
||||
For production, ensure your Traefik proxy network is named `proxy` or update the `docker-compose.yml` accordingly.
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
Contributions welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) first.
|
||||
We welcome contributions! Please check [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
||||
|
||||
## 📧 Support
|
||||
## 📄 License
|
||||
|
||||
For issues or questions, please open a GitHub issue.
|
||||
|
||||
---
|
||||
|
||||
**Built with ❤️ for Munich News Daily**
|
||||
MIT License - see [LICENSE](LICENSE) for details.
|
||||
|
||||
@@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp
|
||||
from routes.transport_routes import transport_bp
|
||||
from routes.interests_routes import interests_bp
|
||||
from routes.personalization_routes import personalization_bp
|
||||
from routes.search_routes import search_bp
|
||||
|
||||
# Initialize Flask app
|
||||
app = Flask(__name__)
|
||||
@@ -33,6 +34,7 @@ app.register_blueprint(admin_bp)
|
||||
app.register_blueprint(transport_bp)
|
||||
app.register_blueprint(interests_bp)
|
||||
app.register_blueprint(personalization_bp)
|
||||
app.register_blueprint(search_bp)
|
||||
|
||||
# Health check endpoint
|
||||
@app.route('/health')
|
||||
|
||||
backend/chroma_client.py (new file, 166 lines)
@@ -0,0 +1,166 @@
|
||||
"""
|
||||
ChromaDB Client for storing and retrieving document embeddings
|
||||
"""
|
||||
import chromadb
|
||||
from chromadb.config import Settings
|
||||
from chromadb.utils import embedding_functions
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
class ChromaClient:
|
||||
"""
|
||||
Client for interacting with ChromaDB vector database.
|
||||
Uses Ollama for generating embeddings if available, otherwise falls back to default.
|
||||
"""
|
||||
|
||||
def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None):
|
||||
"""
|
||||
Initialize ChromaDB client
|
||||
|
||||
Args:
|
||||
host: ChromaDB host (e.g. 'localhost' or 'chromadb')
|
||||
port: ChromaDB port (default 8000)
|
||||
collection_name: Name of the collection to use
|
||||
ollama_base_url: Optional URL for Ollama embedding function
|
||||
"""
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.collection_name = collection_name
|
||||
self.client = None
|
||||
self.collection = None
|
||||
|
||||
# Setup embedding function
|
||||
# We prefer using a local embedding model compatible with Ollama or SentenceTransformers
|
||||
# For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2)
|
||||
# which is downloaded automatically by chromadb utils.
|
||||
# Alternatively, we could define a custom function using Ollama's /api/embeddings
|
||||
self.embedding_function = embedding_functions.DefaultEmbeddingFunction()
|
||||
|
||||
def connect(self):
|
||||
"""Establish connection to ChromaDB"""
|
||||
try:
|
||||
self.client = chromadb.HttpClient(
|
||||
host=self.host,
|
||||
port=self.port,
|
||||
settings=Settings(allow_reset=True, anonymized_telemetry=False)
|
||||
)
|
||||
|
||||
# Create or get collection
|
||||
self.collection = self.client.get_or_create_collection(
|
||||
name=self.collection_name,
|
||||
embedding_function=self.embedding_function,
|
||||
metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
print(f"✓ Connected to ChromaDB at {self.host}:{self.port}")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"⚠ Could not connect to ChromaDB: {e}")
|
||||
return False
|
||||
|
||||
def add_articles(self, articles):
|
||||
"""
|
||||
Add articles to the vector database
|
||||
|
||||
Args:
|
||||
articles: List of dictionaries containing article data.
|
||||
Must have 'link' (used as ID), 'title', 'content', etc.
|
||||
"""
|
||||
if not self.client or not self.collection:
|
||||
if not self.connect():
|
||||
return False
|
||||
|
||||
if not articles:
|
||||
return True
|
||||
|
||||
ids = []
|
||||
documents = []
|
||||
metadatas = []
|
||||
|
||||
for article in articles:
|
||||
# Skip if critical data missing
|
||||
if not article.get('link') or not article.get('content'):
|
||||
continue
|
||||
|
||||
# Use link as unique ID
|
||||
article_id = article.get('link')
|
||||
|
||||
# Prepare text for embedding (Title + Summary + Start of Content)
|
||||
# This gives semantic search a good overview
|
||||
# Use English title if available, otherwise original
|
||||
title = article.get('title_en') if article.get('title_en') else article.get('title', '')
|
||||
summary = article.get('summary') or ''
|
||||
content_snippet = article.get('content', '')[:1000]
|
||||
|
||||
text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}"
|
||||
|
||||
# robust metadata (flat dict, no nested objects)
|
||||
metadata = {
|
||||
"title": title[:100], # Truncate for metadata limits
|
||||
"url": article_id,
|
||||
"source": article.get('source', 'unknown'),
|
||||
"category": article.get('category', 'general'),
|
||||
"published_at": str(article.get('published_at', '')),
|
||||
"mongo_id": str(article.get('_id', ''))
|
||||
}
|
||||
|
||||
ids.append(article_id)
|
||||
documents.append(text_to_embed)
|
||||
metadatas.append(metadata)
|
||||
|
||||
if not ids:
|
||||
return True
|
||||
|
||||
try:
|
||||
self.collection.upsert(
|
||||
ids=ids,
|
||||
documents=documents,
|
||||
metadatas=metadatas
|
||||
)
|
||||
print(f"✓ Indexed {len(ids)} articles in ChromaDB")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Failed to index in ChromaDB: {e}")
|
||||
return False
|
||||
|
||||
def search(self, query_text, n_results=5, where=None):
|
||||
"""
|
||||
Search for relevant articles
|
||||
|
||||
Args:
|
||||
query_text: The search query
|
||||
n_results: Number of results to return
|
||||
where: Metadata filter dict (e.g. {"category": "sports"})
|
||||
"""
|
||||
if not self.client or not self.collection:
|
||||
if not self.connect():
|
||||
return []
|
||||
|
||||
try:
|
||||
results = self.collection.query(
|
||||
query_texts=[query_text],
|
||||
n_results=n_results,
|
||||
where=where
|
||||
)
|
||||
|
||||
# Format results into a nice list of dicts
|
||||
formatted_results = []
|
||||
if results and results['ids']:
|
||||
for i, id in enumerate(results['ids'][0]):
|
||||
item = {
|
||||
'id': id,
|
||||
'document': results['documents'][0][i] if results['documents'] else None,
|
||||
'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
|
||||
'distance': results['distances'][0][i] if results['distances'] else 0
|
||||
}
|
||||
formatted_results.append(item)
|
||||
|
||||
return formatted_results
|
||||
except Exception as e:
|
||||
print(f"✗ Search failed: {e}")
|
||||
return []
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Test client
|
||||
client = ChromaClient(host='localhost', port=8000)
|
||||
client.connect()
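
For reference, a minimal usage sketch of the `ChromaClient` defined above (illustrative only; it assumes a ChromaDB instance reachable on localhost:8000 and uses made-up article values):

```python
from chroma_client import ChromaClient

client = ChromaClient(host="localhost", port=8000)

# add_articles() skips entries that lack 'link' or 'content'; 'link' becomes the vector ID
client.add_articles([{
    "link": "https://example.com/article-1",
    "title": "S-Bahn Stammstrecke gesperrt",
    "title_en": "S-Bahn trunk line closed",   # preferred for embedding and metadata
    "content": "Wegen einer Signalstoerung faehrt die Stammstrecke heute nicht ...",
    "source": "example",
    "category": "transport",
}])

# Semantic search, optionally filtered on flat metadata fields
for hit in client.search("public transport disruption", n_results=3,
                         where={"category": "transport"}):
    print(hit["metadata"]["title"], "| score:", round(1.0 - hit["distance"], 3))
```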
|
||||
@@ -45,6 +45,11 @@ class Config:
|
||||
TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
|
||||
TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))
|
||||
|
||||
# ChromaDB
|
||||
CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
|
||||
CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
|
||||
CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles')
|
||||
|
||||
@classmethod
|
||||
def print_config(cls):
|
||||
"""Print configuration (without sensitive data)"""
|
||||
@@ -57,3 +62,5 @@ class Config:
|
||||
print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}")
|
||||
print(f" Tracking Enabled: {cls.TRACKING_ENABLED}")
|
||||
print(f" Tracking API URL: {cls.TRACKING_API_URL}")
|
||||
print(f" ChromaDB Host: {cls.CHROMA_HOST}")
|
||||
print(f" ChromaDB Port: {cls.CHROMA_PORT}")
|
||||
|
||||
@@ -7,3 +7,5 @@ requests==2.31.0
|
||||
Jinja2==3.1.2
|
||||
redis==5.0.1
|
||||
|
||||
chromadb>=0.4.0
|
||||
sentence-transformers>=2.2.2
|
||||
|
||||
@@ -24,8 +24,11 @@ def get_news():
|
||||
|
||||
db_articles = []
|
||||
for doc in cursor:
|
||||
# Use English title if available, otherwise fallback to original
|
||||
title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
|
||||
|
||||
article = {
|
||||
'title': doc.get('title', ''),
|
||||
'title': title,
|
||||
'author': doc.get('author'),
|
||||
'link': doc.get('link', ''),
|
||||
'source': doc.get('source', ''),
|
||||
@@ -114,8 +117,10 @@ def get_clustered_news_internal():
|
||||
# Use cluster_articles from aggregation (already fetched)
|
||||
cluster_articles = doc.get('cluster_articles', [])
|
||||
|
||||
title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
|
||||
|
||||
article = {
|
||||
'title': doc.get('title', ''),
|
||||
'title': title,
|
||||
'link': doc.get('link', ''),
|
||||
'source': doc.get('source', ''),
|
||||
'published': doc.get('published_at', ''),
|
||||
@@ -173,7 +178,7 @@ def get_article_by_url(article_url):
|
||||
return jsonify({'error': 'Article not found'}), 404
|
||||
|
||||
return jsonify({
|
||||
'title': article.get('title', ''),
|
||||
'title': article.get('title_en') if article.get('title_en') else article.get('title', ''),
|
||||
'author': article.get('author'),
|
||||
'link': article.get('link', ''),
|
||||
'content': article.get('content', ''),
|
||||
|
||||
backend/routes/search_routes.py (new file, 88 lines)
@@ -0,0 +1,88 @@
|
||||
from flask import Blueprint, jsonify, request
|
||||
from config import Config
|
||||
from chroma_client import ChromaClient
|
||||
import logging
|
||||
|
||||
search_bp = Blueprint('search', __name__)
|
||||
|
||||
# Initialize ChromaDB client
|
||||
# Note: We use the hostname 'chromadb' as defined in docker-compose for the backend
|
||||
chroma_client = ChromaClient(
|
||||
host=Config.CHROMA_HOST,
|
||||
port=Config.CHROMA_PORT,
|
||||
collection_name=Config.CHROMA_COLLECTION
|
||||
)
|
||||
|
||||
@search_bp.route('/api/search', methods=['GET'])
|
||||
def search_news():
|
||||
"""
|
||||
Semantic search for news articles using ChromaDB.
|
||||
Query parameters:
|
||||
- q: Search query (required)
|
||||
- limit: Number of results (default: 10)
|
||||
- category: Filter by category (optional)
|
||||
"""
|
||||
try:
|
||||
query = request.args.get('q')
|
||||
if not query:
|
||||
return jsonify({'error': 'Missing search query'}), 400
|
||||
|
||||
limit = int(request.args.get('limit', 10))
|
||||
category = request.args.get('category')
|
||||
|
||||
# Build filter if category provided
|
||||
where_filter = None
|
||||
if category:
|
||||
where_filter = {"category": category}
|
||||
|
||||
# Perform search
|
||||
results = chroma_client.search(
|
||||
query_text=query,
|
||||
n_results=limit,
|
||||
where=where_filter
|
||||
)
|
||||
|
||||
# Format for frontend
|
||||
formatted_response = []
|
||||
for item in results:
|
||||
metadata = item.get('metadata', {})
|
||||
# Use the title stored in Chroma metadata. ChromaClient already prefers the
# English title (title_en, when available) for the flat 'title' field, so
# search results surface English titles without an extra DB lookup.
title = metadata.get('title', 'Unknown Title')
|
||||
|
||||
formatted_response.append({
|
||||
'title': title,
|
||||
'link': metadata.get('url', ''),
|
||||
'source': metadata.get('source', 'Unknown'),
|
||||
'category': metadata.get('category', 'general'),
|
||||
'published_at': metadata.get('published_at', ''),
|
||||
'relevance_score': 1.0 - item.get('distance', 1.0), # Convert distance to score (approx)
|
||||
'snippet': item.get('document', '')[:200] + '...' # Preview
|
||||
})
|
||||
|
||||
return jsonify({
|
||||
'query': query,
|
||||
'count': len(formatted_response),
|
||||
'results': formatted_response
|
||||
}), 200
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Search error: {str(e)}")
|
||||
return jsonify({'error': str(e)}), 500
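
A quick way to exercise this endpoint once the stack is up (a sketch; it assumes the backend is reachable on localhost:5001 and that `requests` is installed):

```python
import requests

# GET /api/search with the query parameters handled above (q, limit, category)
resp = requests.get(
    "http://localhost:5001/api/search",
    params={"q": "public transport disruption", "limit": 5, "category": "transport"},
    timeout=10,
)
resp.raise_for_status()

data = resp.json()
print(f"{data['count']} results for {data['query']!r}")
for result in data["results"]:
    print(f"- {result['title']} ({result['source']}) score={result['relevance_score']:.2f}")
```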
|
||||
@@ -1,20 +1,3 @@
|
||||
# Munich News Daily - Docker Compose Configuration
|
||||
#
|
||||
# GPU Support:
|
||||
# To enable GPU acceleration for Ollama (5-10x faster):
|
||||
# 1. Check GPU availability: ./check-gpu.sh
|
||||
# 2. Start with GPU: ./start-with-gpu.sh
|
||||
# Or manually: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
|
||||
#
|
||||
# Security:
|
||||
# - Only Backend API (port 5001) is exposed to host
|
||||
# - MongoDB is internal-only (not exposed to host)
|
||||
# - Ollama is internal-only (not exposed to host)
|
||||
# - Crawler and Sender are internal-only
|
||||
# All services communicate via internal Docker network
|
||||
#
|
||||
# See docs/OLLAMA_SETUP.md for detailed setup instructions
|
||||
|
||||
services:
|
||||
# Ollama AI Service (Internal only - not exposed to host)
|
||||
ollama:
|
||||
@@ -29,16 +12,8 @@ services:
|
||||
dns:
|
||||
- 8.8.8.8
|
||||
- 1.1.1.1
|
||||
# GPU support (uncomment if you have NVIDIA GPU)
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: all
|
||||
# capabilities: [gpu]
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "ollama list || exit 1"]
|
||||
test: [ "CMD-SHELL", "ollama list || exit 1" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -72,7 +47,7 @@ services:
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
test: [ "CMD", "redis-cli", "ping" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -100,6 +75,24 @@ services:
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ChromaDB - Vector Database for AI features
|
||||
chromadb:
|
||||
image: chromadb/chroma:latest
|
||||
container_name: munich-news-chromadb
|
||||
restart: unless-stopped
|
||||
# No ports exposed - only accessible within Docker network
|
||||
environment:
|
||||
- IS_PERSISTENT=TRUE
|
||||
volumes:
|
||||
- chromadb_data:/chroma/chroma
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# News Crawler - Runs at 6 AM Berlin time
|
||||
crawler:
|
||||
build:
|
||||
@@ -120,7 +113,7 @@ services:
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
|
||||
test: [ "CMD", "python", "-c", "import sys; sys.exit(0)" ]
|
||||
interval: 1m
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -149,7 +142,7 @@ services:
|
||||
- munich-news-network
|
||||
- proxy
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
|
||||
test: [ "CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -188,7 +181,7 @@ services:
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"]
|
||||
test: [ "CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -214,7 +207,7 @@ services:
|
||||
networks:
|
||||
- munich-news-network
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
|
||||
test: [ "CMD", "python", "-c", "import sys; sys.exit(0)" ]
|
||||
interval: 1m
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -239,7 +232,7 @@ services:
|
||||
- munich-news-network
|
||||
- proxy
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000"]
|
||||
test: [ "CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -264,6 +257,8 @@ volumes:
|
||||
driver: local
|
||||
ollama_data:
|
||||
driver: local
|
||||
chromadb_data:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
munich-news-network:
|
||||
|
||||
@@ -113,10 +113,15 @@ function setupInfiniteScroll() {
|
||||
}
|
||||
|
||||
// Search functionality
|
||||
function handleSearch() {
|
||||
let searchTimeout;
|
||||
|
||||
async function handleSearch() {
|
||||
const searchInput = document.getElementById('searchInput');
|
||||
const clearBtn = document.getElementById('clearSearch');
|
||||
searchQuery = searchInput.value.trim().toLowerCase();
|
||||
const searchStats = document.getElementById('searchStats');
|
||||
const newsGrid = document.getElementById('newsGrid');
|
||||
|
||||
searchQuery = searchInput.value.trim();
|
||||
|
||||
// Show/hide clear button
|
||||
if (searchQuery) {
|
||||
@@ -125,41 +130,68 @@ function handleSearch() {
|
||||
clearBtn.classList.add('hidden');
|
||||
}
|
||||
|
||||
// Filter articles
|
||||
// Clear previous timeout
|
||||
if (searchTimeout) clearTimeout(searchTimeout);
|
||||
|
||||
// If empty query, reset to all articles
|
||||
if (searchQuery === '') {
|
||||
filteredArticles = allArticles;
|
||||
} else {
|
||||
filteredArticles = allArticles.filter(article => {
|
||||
const title = article.title.toLowerCase();
|
||||
const summary = (article.summary || '').toLowerCase().replace(/<[^>]*>/g, '');
|
||||
const source = formatSourceName(article.source).toLowerCase();
|
||||
|
||||
return title.includes(searchQuery) ||
|
||||
summary.includes(searchQuery) ||
|
||||
source.includes(searchQuery);
|
||||
});
|
||||
}
|
||||
|
||||
// Reset display
|
||||
displayedCount = 0;
|
||||
const newsGrid = document.getElementById('newsGrid');
|
||||
newsGrid.innerHTML = '';
|
||||
|
||||
// Update stats
|
||||
updateSearchStats();
|
||||
|
||||
// Load filtered articles
|
||||
if (filteredArticles.length > 0) {
|
||||
displayedCount = 0;
|
||||
newsGrid.innerHTML = '';
|
||||
updateSearchStats();
|
||||
loadMoreArticles();
|
||||
} else {
|
||||
newsGrid.innerHTML = `
|
||||
<div class="text-center py-16">
|
||||
<div class="text-6xl mb-4">🔍</div>
|
||||
<p class="text-xl text-gray-600 mb-2">No articles found</p>
|
||||
<p class="text-gray-400">Try a different search term</p>
|
||||
</div>
|
||||
`;
|
||||
return;
|
||||
}
|
||||
|
||||
// Debounce search API call
|
||||
searchTimeout = setTimeout(async () => {
|
||||
// Show searching state
|
||||
newsGrid.innerHTML = '<div class="text-center py-10 text-gray-500">Searching...</div>';
|
||||
|
||||
try {
|
||||
const response = await fetch(`/api/search?q=${encodeURIComponent(searchQuery)}&limit=20`);
|
||||
|
||||
// Check if response is ok
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
throw new Error(`Server returned ${response.status}: ${errorText}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.results && data.results.length > 0) {
|
||||
// Map results to match card format
|
||||
filteredArticles = data.results.map(item => ({
|
||||
title: item.title,
|
||||
link: item.link,
|
||||
source: item.source,
|
||||
summary: item.snippet, // Map snippet to summary
|
||||
published_at: item.published_at,
|
||||
score: item.relevance_score
|
||||
}));
|
||||
|
||||
displayedCount = 0;
|
||||
newsGrid.innerHTML = '';
|
||||
|
||||
// Update stats
|
||||
searchStats.textContent = `Found ${filteredArticles.length} relevant articles`;
|
||||
|
||||
loadMoreArticles();
|
||||
} else {
|
||||
newsGrid.innerHTML = `
|
||||
<div class="text-center py-16">
|
||||
<div class="text-6xl mb-4">🔍</div>
|
||||
<p class="text-xl text-gray-600 mb-2">No relevant articles found</p>
|
||||
<p class="text-gray-400">Try different keywords or concepts</p>
|
||||
</div>
|
||||
`;
|
||||
searchStats.textContent = 'No results found';
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Search failed:', error);
|
||||
newsGrid.innerHTML = `<div class="text-center py-10 text-red-400">Search failed: ${error.message}</div>`;
|
||||
}
|
||||
}, 500); // 500ms debounce
|
||||
}
|
||||
|
||||
function clearSearch() {
|
||||
@@ -423,7 +455,7 @@ async function unsubscribe() {
|
||||
}
|
||||
|
||||
// Close modal when clicking outside
|
||||
window.onclick = function(event) {
|
||||
window.onclick = function (event) {
|
||||
const modal = document.getElementById('unsubscribeModal');
|
||||
if (event.target === modal) {
|
||||
closeUnsubscribe();
|
||||
|
||||
@@ -204,6 +204,31 @@ app.get('/api/ollama/config', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
app.get('/api/search', async (req, res) => {
|
||||
try {
|
||||
const { q, limit, category } = req.query;
|
||||
const response = await axios.get(`${API_URL}/api/search`, {
|
||||
params: { q, limit, category }
|
||||
});
|
||||
res.json(response.data);
|
||||
} catch (error) {
|
||||
if (error.response) {
|
||||
// The request was made and the server responded with a status code
|
||||
// that falls out of the range of 2xx
|
||||
console.error('Search API Error:', error.response.status, error.response.data);
|
||||
res.status(error.response.status).json(error.response.data);
|
||||
} else if (error.request) {
|
||||
// The request was made but no response was received
|
||||
console.error('Search API No Response:', error.request);
|
||||
res.status(502).json({ error: 'Search service unavailable (timeout/connection)' });
|
||||
} else {
|
||||
// Something happened in setting up the request that triggered an Error
|
||||
console.error('Search API Request Error:', error.message);
|
||||
res.status(500).json({ error: 'Internal proxy error' });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
app.listen(PORT, () => {
|
||||
console.log(`Frontend server running on http://localhost:${PORT}`);
|
||||
console.log(`Admin dashboard: http://localhost:${PORT}/admin.html`);
|
||||
|
||||
news_crawler/chroma_client.py (new file, 166 lines)
@@ -0,0 +1,166 @@
|
||||
"""
|
||||
ChromaDB Client for storing and retrieving document embeddings
|
||||
"""
|
||||
import chromadb
|
||||
from chromadb.config import Settings
|
||||
from chromadb.utils import embedding_functions
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
class ChromaClient:
|
||||
"""
|
||||
Client for interacting with ChromaDB vector database.
|
||||
Uses Ollama for generating embeddings if available, otherwise falls back to default.
|
||||
"""
|
||||
|
||||
def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None):
|
||||
"""
|
||||
Initialize ChromaDB client
|
||||
|
||||
Args:
|
||||
host: ChromaDB host (e.g. 'localhost' or 'chromadb')
|
||||
port: ChromaDB port (default 8000)
|
||||
collection_name: Name of the collection to use
|
||||
ollama_base_url: Optional URL for Ollama embedding function
|
||||
"""
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.collection_name = collection_name
|
||||
self.client = None
|
||||
self.collection = None
|
||||
|
||||
# Setup embedding function
|
||||
# We prefer using a local embedding model compatible with Ollama or SentenceTransformers
|
||||
# For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2)
|
||||
# which is downloaded automatically by chromadb utils.
|
||||
# Alternatively, we could define a custom function using Ollama's /api/embeddings
|
||||
self.embedding_function = embedding_functions.DefaultEmbeddingFunction()
|
||||
|
||||
def connect(self):
|
||||
"""Establish connection to ChromaDB"""
|
||||
try:
|
||||
self.client = chromadb.HttpClient(
|
||||
host=self.host,
|
||||
port=self.port,
|
||||
settings=Settings(allow_reset=True, anonymized_telemetry=False)
|
||||
)
|
||||
|
||||
# Create or get collection
|
||||
self.collection = self.client.get_or_create_collection(
|
||||
name=self.collection_name,
|
||||
embedding_function=self.embedding_function,
|
||||
metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
print(f"✓ Connected to ChromaDB at {self.host}:{self.port}")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"⚠ Could not connect to ChromaDB: {e}")
|
||||
return False
|
||||
|
||||
def add_articles(self, articles):
|
||||
"""
|
||||
Add articles to the vector database
|
||||
|
||||
Args:
|
||||
articles: List of dictionaries containing article data.
|
||||
Must have 'link' (used as ID), 'title', 'content', etc.
|
||||
"""
|
||||
if not self.client or not self.collection:
|
||||
if not self.connect():
|
||||
return False
|
||||
|
||||
if not articles:
|
||||
return True
|
||||
|
||||
ids = []
|
||||
documents = []
|
||||
metadatas = []
|
||||
|
||||
for article in articles:
|
||||
# Skip if critical data missing
|
||||
if not article.get('link') or not article.get('content'):
|
||||
continue
|
||||
|
||||
# Use link as unique ID
|
||||
article_id = article.get('link')
|
||||
|
||||
# Prepare text for embedding (Title + Summary + Start of Content)
|
||||
# This gives semantic search a good overview
|
||||
# Use English title if available, otherwise original
|
||||
title = article.get('title_en') if article.get('title_en') else article.get('title', '')
|
||||
summary = article.get('summary') or ''
|
||||
content_snippet = article.get('content', '')[:1000]
|
||||
|
||||
text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}"
|
||||
|
||||
# robust metadata (flat dict, no nested objects)
|
||||
metadata = {
|
||||
"title": title[:100], # Truncate for metadata limits
|
||||
"url": article_id,
|
||||
"source": article.get('source', 'unknown'),
|
||||
"category": article.get('category', 'general'),
|
||||
"published_at": str(article.get('published_at', '')),
|
||||
"mongo_id": str(article.get('_id', ''))
|
||||
}
|
||||
|
||||
ids.append(article_id)
|
||||
documents.append(text_to_embed)
|
||||
metadatas.append(metadata)
|
||||
|
||||
if not ids:
|
||||
return True
|
||||
|
||||
try:
|
||||
self.collection.upsert(
|
||||
ids=ids,
|
||||
documents=documents,
|
||||
metadatas=metadatas
|
||||
)
|
||||
print(f"✓ Indexed {len(ids)} articles in ChromaDB")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Failed to index in ChromaDB: {e}")
|
||||
return False
|
||||
|
||||
def search(self, query_text, n_results=5, where=None):
|
||||
"""
|
||||
Search for relevant articles
|
||||
|
||||
Args:
|
||||
query_text: The search query
|
||||
n_results: Number of results to return
|
||||
where: Metadata filter dict (e.g. {"category": "sports"})
|
||||
"""
|
||||
if not self.client or not self.collection:
|
||||
if not self.connect():
|
||||
return []
|
||||
|
||||
try:
|
||||
results = self.collection.query(
|
||||
query_texts=[query_text],
|
||||
n_results=n_results,
|
||||
where=where
|
||||
)
|
||||
|
||||
# Format results into a nice list of dicts
|
||||
formatted_results = []
|
||||
if results and results['ids']:
|
||||
for i, id in enumerate(results['ids'][0]):
|
||||
item = {
|
||||
'id': id,
|
||||
'document': results['documents'][0][i] if results['documents'] else None,
|
||||
'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
|
||||
'distance': results['distances'][0][i] if results['distances'] else 0
|
||||
}
|
||||
formatted_results.append(item)
|
||||
|
||||
return formatted_results
|
||||
except Exception as e:
|
||||
print(f"✗ Search failed: {e}")
|
||||
return []
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Test client
|
||||
client = ChromaClient(host='localhost', port=8000)
|
||||
client.connect()
|
||||
@@ -34,6 +34,11 @@ class Config:
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
# ChromaDB Configuration
|
||||
CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
|
||||
CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
|
||||
CHROMA_COLLECTION = 'munich_news_articles'
|
||||
|
||||
# Ollama Configuration
|
||||
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
||||
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
|
||||
|
||||
@@ -14,7 +14,9 @@ from rss_utils import extract_article_url, extract_article_summary, extract_publ
|
||||
from config import Config
|
||||
from ollama_client import OllamaClient
|
||||
from article_clustering import ArticleClusterer
|
||||
from article_clustering import ArticleClusterer
|
||||
from cluster_summarizer import create_cluster_summaries
|
||||
from chroma_client import ChromaClient
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
@@ -38,6 +40,13 @@ ollama_client = OllamaClient(
|
||||
# Initialize Article Clusterer (will be initialized after ollama_client)
|
||||
article_clusterer = None
|
||||
|
||||
# Initialize ChromaDB client
|
||||
chroma_client = ChromaClient(
|
||||
host=Config.CHROMA_HOST,
|
||||
port=Config.CHROMA_PORT,
|
||||
collection_name=Config.CHROMA_COLLECTION
|
||||
)
|
||||
|
||||
# Print configuration on startup
|
||||
if __name__ != '__main__':
|
||||
Config.print_config()
|
||||
@@ -331,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
|
||||
|
||||
if not feed.entries:
|
||||
print(f" ⚠ No entries found in feed")
|
||||
return 0
|
||||
return {
|
||||
'crawled': 0,
|
||||
'summarized': 0,
|
||||
'failed_summaries': 0
|
||||
}
|
||||
|
||||
crawled_count = 0
|
||||
summarized_count = 0
|
||||
@@ -440,6 +453,17 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
|
||||
crawled_count += 1
|
||||
print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
|
||||
|
||||
# Index in ChromaDB
|
||||
try:
|
||||
# Add mongo _id to article doc for reference
|
||||
saved_article = articles_collection.find_one({'link': article_url})
|
||||
if saved_article:
|
||||
article_doc['_id'] = str(saved_article['_id'])
|
||||
chroma_client.add_articles([article_doc])
|
||||
except Exception as e:
|
||||
print(f" ⚠ Failed to index in ChromaDB: {e}")
|
||||
|
||||
|
||||
except DuplicateKeyError:
|
||||
print(f" ⚠ Duplicate key error")
|
||||
except Exception as e:
|
||||
|
||||
@@ -7,3 +7,4 @@ python-dotenv==1.0.0
|
||||
schedule==1.2.0
|
||||
pytz==2023.3
|
||||
redis==5.0.1
|
||||
chromadb>=0.4.0
|
||||
|
||||
@@ -37,12 +37,12 @@ def main():
|
||||
"""Main scheduler loop"""
|
||||
print("🤖 Munich News Crawler Scheduler")
|
||||
print("="*60)
|
||||
print("Schedule: Daily at 6:00 AM Berlin time")
|
||||
print("Schedule: Every 3 hours")
|
||||
print("Timezone: Europe/Berlin (CET/CEST)")
|
||||
print("="*60)
|
||||
|
||||
# Schedule the crawler to run at 6 AM Berlin time
|
||||
schedule.every().day.at("06:00").do(run_crawler)
|
||||
# Schedule the crawler to run every 3 hours
|
||||
schedule.every(3).hours.do(run_crawler)
|
||||
|
||||
# Show next run time
|
||||
berlin_time = datetime.now(BERLIN_TZ)
|
||||
|
||||