update
40
.dockerignore
Normal file
@@ -0,0 +1,40 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
.venv

# Node
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Environment variables
.env
.env.local

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Git
.git/
.gitignore

# Documentation
*.md
!README.md

187
.gitignore
vendored
Normal file
@@ -0,0 +1,187 @@
# ===================================
# Python
# ===================================
__pycache__/
*.py[cod]
*$py.class
*.so
.Python

# Virtual Environments
env/
venv/
ENV/
.venv
.virtualenv
backend/env/
backend/venv/
news_crawler/env/
news_crawler/venv/
news_sender/env/
news_sender/venv/

# Python Distribution / Packaging
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# ===================================
# Node.js
# ===================================
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.npm
.yarn-integrity
package-lock.json
yarn.lock
.pnp
.pnp.js

# ===================================
# Environment Variables & Secrets
# ===================================
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
*.env

# ===================================
# Database
# ===================================
*.db
*.sqlite
*.sqlite3
*.db-journal

# MongoDB
data/
mongodb_data/

# ===================================
# IDE & Editors
# ===================================
# VSCode
.vscode/
.vscode-test/
*.code-workspace

# PyCharm / IntelliJ
.idea/
*.iml
*.iws
*.ipr
out/

# Sublime Text
*.sublime-project
*.sublime-workspace

# Vim
*.swp
*.swo
*~
.vim/

# Emacs
*~
\#*\#
.\#*

# ===================================
# OS Files
# ===================================
# macOS
.DS_Store
.AppleDouble
.LSOverride
._*
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Windows
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
*.stackdump
[Dd]esktop.ini
$RECYCLE.BIN/
*.cab
*.msi
*.msix
*.msm
*.msp
*.lnk

# Linux
.directory
.Trash-*

# ===================================
# Project Specific
# ===================================
# Generated files
newsletter_preview.html
*.log

# Temporary files
*.tmp
*.temp
*.bak
*.backup

# Docker volumes
mongodb_data/
ollama_data/

# Spec artifacts (optional - uncomment if you don't want to track specs)
# .kiro/specs/

# Test outputs
test-results/
coverage/

487
.kiro/specs/ai-article-summarization/design.md
Normal file
@@ -0,0 +1,487 @@
# Design Document - AI Article Summarization

## Overview

This design integrates Ollama AI into the news crawler workflow to automatically generate concise summaries of articles. The system will extract full article content, send it to Ollama for summarization, and store both the original content and the AI-generated summary in MongoDB.

## Architecture

### High-Level Flow

```
RSS Feed → Extract Content → Summarize with Ollama → Store in MongoDB
                 ↓                      ↓                    ↓
         Full Article Text    AI Summary (≤150 words)    Both Stored
```

### Component Diagram

```
┌─────────────────────────────────────────────────────────────┐
│                    News Crawler Service                      │
│                                                              │
│  ┌────────────────┐       ┌──────────────────┐               │
│  │   RSS Parser   │──────→│ Content Extractor│               │
│  └────────────────┘       └──────────────────┘               │
│                                    │                         │
│                                    ↓                         │
│                          ┌──────────────────┐                │
│                          │  Ollama Client   │                │
│                          │  (New Component) │                │
│                          └──────────────────┘                │
│                                    │                         │
│                                    ↓                         │
│                          ┌──────────────────┐                │
│                          │ Database Writer  │                │
│                          └──────────────────┘                │
└─────────────────────────────────────────────────────────────┘
                                     │
                                     ↓
                           ┌──────────────────┐
                           │  Ollama Server   │
                           │    (External)    │
                           └──────────────────┘
                                     │
                                     ↓
                           ┌──────────────────┐
                           │     MongoDB      │
                           └──────────────────┘
```

## Components and Interfaces

### 1. Ollama Client Module

**File:** `news_crawler/ollama_client.py`

**Purpose:** Handle communication with Ollama server for summarization

**Interface:**
```python
class OllamaClient:
    def __init__(self, base_url, model, api_key=None, enabled=True):
        """Initialize Ollama client with configuration"""

    def summarize_article(self, content: str, max_words: int = 150) -> dict:
        """
        Summarize article content using Ollama

        Args:
            content: Full article text
            max_words: Maximum words in summary (default 150)

        Returns:
            {
                'summary': str,        # AI-generated summary
                'word_count': int,     # Summary word count
                'success': bool,       # Whether summarization succeeded
                'error': str or None,  # Error message if failed
                'duration': float      # Time taken in seconds
            }
        """

    def is_available(self) -> bool:
        """Check if Ollama server is reachable"""

    def test_connection(self) -> dict:
        """Test connection and return server info"""
```

**Key Methods:**

1. **summarize_article()**
   - Constructs prompt for Ollama
   - Sends HTTP POST request
   - Handles timeouts and errors
   - Validates response
   - Returns structured result

2. **is_available()**
   - Quick health check
   - Returns True/False
   - Used before attempting summarization

3. **test_connection()**
   - Detailed connection test
   - Returns server info and model list
   - Used for diagnostics

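To make the interface concrete, here is a minimal sketch of how `summarize_article()` and `is_available()` could be implemented with the `requests` library against Ollama's `/api/generate` and `/api/tags` endpoints. The prompt wording, the extra `timeout` constructor argument, and the error handling are illustrative assumptions, not the final implementation.

```python
import time
import requests


class OllamaClient:
    def __init__(self, base_url, model, api_key=None, enabled=True, timeout=30):
        self.base_url = (base_url or 'http://localhost:11434').rstrip('/')
        self.model = model
        self.enabled = enabled
        self.timeout = timeout
        # Optional bearer token, used only if the server sits behind an authenticating proxy
        self.headers = {'Authorization': f'Bearer {api_key}'} if api_key else {}

    def is_available(self) -> bool:
        """Quick health check against the Ollama tags endpoint."""
        try:
            resp = requests.get(f'{self.base_url}/api/tags',
                                headers=self.headers, timeout=5)
            return resp.status_code == 200
        except requests.RequestException:
            return False

    def summarize_article(self, content: str, max_words: int = 150) -> dict:
        """Ask Ollama for a summary and return the structured result dict."""
        prompt = (
            f"Summarize the following article in {max_words} words or less. "
            f"Focus on the key points and main message:\n\n{content}"
        )
        start = time.time()
        try:
            resp = requests.post(
                f'{self.base_url}/api/generate',
                json={'model': self.model, 'prompt': prompt, 'stream': False},
                headers=self.headers,
                timeout=self.timeout,
            )
            resp.raise_for_status()
            summary = (resp.json().get('response') or '').strip()
            if not summary:
                raise ValueError('empty summary returned')
            return {
                'summary': summary,
                'word_count': len(summary.split()),
                'success': True,
                'error': None,
                'duration': time.time() - start,
            }
        except (requests.RequestException, ValueError) as exc:
            return {
                'summary': None,
                'word_count': 0,
                'success': False,
                'error': str(exc),
                'duration': time.time() - start,
            }
```

Returning a structured dict rather than raising keeps the crawler loop simple: every failure mode collapses into `success: False`, and the article is stored without a summary.
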
### 2. Enhanced Crawler Service

**File:** `news_crawler/crawler_service.py`

**Changes:**

```python
# Add Ollama client initialization
from ollama_client import OllamaClient

# Initialize at module level
ollama_client = OllamaClient(
    base_url=os.getenv('OLLAMA_BASE_URL'),
    model=os.getenv('OLLAMA_MODEL'),
    api_key=os.getenv('OLLAMA_API_KEY'),
    enabled=os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
)

# Modify crawl_rss_feed() to include summarization
def crawl_rss_feed(feed_url, feed_name, max_articles=10):
    # ... existing code ...

    # After extracting content
    article_data = extract_article_content(article_url)

    # NEW: Summarize with Ollama
    summary_result = None
    if ollama_client.enabled and article_data.get('content'):
        print(f"  🤖 Summarizing with AI...")
        summary_result = ollama_client.summarize_article(
            article_data['content'],
            max_words=150
        )

        if summary_result['success']:
            print(f"  ✓ Summary generated ({summary_result['word_count']} words)")
        else:
            print(f"  ⚠ Summarization failed: {summary_result['error']}")

    # Build article document with summary
    article_doc = {
        'title': article_data.get('title'),
        'author': article_data.get('author'),
        'link': article_url,
        'content': article_data.get('content'),
        'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
        'word_count': article_data.get('word_count'),
        'summary_word_count': summary_result['word_count'] if summary_result and summary_result['success'] else None,
        'source': feed_name,
        'published_at': extract_published_date(entry),
        'crawled_at': article_data.get('crawled_at'),
        'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
        'created_at': datetime.utcnow()
    }
```

### 3. Configuration Module

**File:** `news_crawler/config.py` (new file)

**Purpose:** Centralize configuration management

```python
import os
from dotenv import load_dotenv

load_dotenv(dotenv_path='../.env')

class Config:
    # MongoDB
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'

    # Ollama
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
    OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
    OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
    OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))

    # Crawler
    RATE_LIMIT_DELAY = 1        # seconds between requests
    MAX_CONTENT_LENGTH = 50000  # characters
```

## Data Models

### Updated Article Schema

```javascript
{
  _id: ObjectId,
  title: String,
  author: String,
  link: String,               // Unique index
  content: String,            // Full article content
  summary: String,            // AI-generated summary (≤150 words)
  word_count: Number,         // Original content word count
  summary_word_count: Number, // Summary word count
  source: String,
  published_at: String,
  crawled_at: DateTime,
  summarized_at: DateTime,    // When AI summary was generated
  created_at: DateTime
}
```

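The `link` field above is marked as a unique index. A short sketch of how that index and the corresponding upsert could be set up with PyMongo; the connection string, database name, and example document are placeholders consistent with the rest of this document.

```python
from datetime import datetime
from pymongo import MongoClient

articles = MongoClient('mongodb://localhost:27017/')['munich_news']['articles']

# The unique index on the article URL prevents duplicate documents
articles.create_index('link', unique=True)

# Upserting on the link means re-crawling updates an article in place
article_doc = {
    'link': 'https://example.com/some-article',
    'title': 'Example article',
    'summary': None,
    'created_at': datetime.utcnow(),
}
articles.update_one({'link': article_doc['link']}, {'$set': article_doc}, upsert=True)
```
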
### Ollama Request Format

```json
{
  "model": "phi3:latest",
  "prompt": "Summarize the following article in 150 words or less. Focus on the key points and main message:\n\n[ARTICLE CONTENT]",
  "stream": false,
  "options": {
    "temperature": 0.7,
    "num_predict": 200
  }
}
```

### Ollama Response Format

```json
{
  "model": "phi3:latest",
  "created_at": "2024-11-10T16:30:00Z",
  "response": "The AI-generated summary text here...",
  "done": true,
  "total_duration": 5000000000
}
```

## Error Handling

### Error Scenarios and Responses

| Scenario | Handling | User Impact |
|----------|----------|-------------|
| Ollama server down | Log warning, store original content | Article saved without summary |
| Ollama timeout (>30s) | Cancel request, store original | Article saved without summary |
| Empty summary returned | Log error, store original | Article saved without summary |
| Invalid response format | Log error, store original | Article saved without summary |
| Network error | Retry once, then store original | Article saved without summary |
| Model not found | Log error, disable Ollama | All articles saved without summaries |

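For the "retry once, then store original" row, a minimal sketch of the HTTP-level behaviour, assuming a `requests`-based client like the one sketched earlier; only plain connection errors are retried, while timeouts propagate so the article falls back to being stored without a summary.

```python
import requests


def post_with_single_retry(url, payload, headers=None, timeout=30):
    """POST to Ollama, retrying exactly once on a plain connection error.

    Timeouts are deliberately not retried: per the table above they cancel
    the request and the article is stored without a summary.
    """
    for attempt in (1, 2):
        try:
            return requests.post(url, json=payload, headers=headers, timeout=timeout)
        except requests.ConnectionError:
            if attempt == 2:
                raise
```
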
### Error Logging Format

```python
{
    'timestamp': datetime.utcnow(),
    'article_url': article_url,
    'error_type': 'timeout|connection|invalid_response|empty_summary',
    'error_message': str(error),
    'ollama_config': {
        'base_url': OLLAMA_BASE_URL,
        'model': OLLAMA_MODEL,
        'enabled': OLLAMA_ENABLED
    }
}
```

## Testing Strategy

### Unit Tests

1. **test_ollama_client.py**
   - Test summarization with mock responses
   - Test timeout handling
   - Test error scenarios
   - Test connection checking

2. **test_crawler_with_ollama.py**
   - Test crawler with Ollama enabled
   - Test crawler with Ollama disabled
   - Test fallback when Ollama fails
   - Test rate limiting

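A sketch of what the mocked-response and timeout cases in `test_ollama_client.py` could look like with `pytest` and `unittest.mock`, assuming the client is implemented along the lines of the earlier sketch (in particular that it calls `requests.post` and counts words with `split()`).

```python
from unittest.mock import patch

import requests

from ollama_client import OllamaClient


def test_summarize_article_success():
    fake_json = {'response': 'A short summary of the article.', 'done': True}
    with patch('requests.post') as mock_post:
        # Simulate a healthy Ollama response
        mock_post.return_value.json.return_value = fake_json
        mock_post.return_value.raise_for_status.return_value = None
        client = OllamaClient('http://localhost:11434', 'phi3:latest')
        result = client.summarize_article('Some long article text...')
    assert result['success'] is True
    assert result['word_count'] == 6


def test_summarize_article_timeout_returns_failure():
    with patch('requests.post', side_effect=requests.Timeout('timed out')):
        client = OllamaClient('http://localhost:11434', 'phi3:latest')
        result = client.summarize_article('Some long article text...')
    assert result['success'] is False
    assert result['summary'] is None
```
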
### Integration Tests

1. **test_end_to_end.py**
   - Crawl real RSS feed
   - Summarize with real Ollama
   - Verify database storage
   - Check all fields populated

### Manual Testing

1. Test with Ollama enabled and working
2. Test with Ollama disabled
3. Test with Ollama unreachable
4. Test with slow Ollama responses
5. Test with various article lengths

## Performance Considerations

### Timing Estimates

- Article extraction: 2-5 seconds
- Ollama summarization: 5-15 seconds (depends on article length and model)
- Database write: <1 second
- **Total per article: 8-21 seconds**

### Optimization Strategies

1. **Sequential Processing**
   - Process one article at a time
   - Prevents overwhelming Ollama
   - Easier to debug

2. **Timeout Management**
   - 30-second timeout per request
   - Prevents hanging on slow responses

3. **Rate Limiting**
   - 1-second delay between articles
   - Respects server resources

4. **Future: Batch Processing**
   - Queue articles for summarization
   - Process in batches
   - Use Celery for async processing

### Resource Usage

- **Memory**: ~100MB per crawler instance
- **Network**: ~1-5KB per article (to Ollama)
- **Storage**: +150 words per article (~1KB)
- **CPU**: Minimal (Ollama does the heavy lifting)

## Security Considerations

1. **API Key Storage**
   - Store in environment variables
   - Never commit to git
   - Use secrets management in production

2. **Content Sanitization**
   - Don't log full article content
   - Sanitize URLs in logs
   - Limit error message detail

3. **Network Security**
   - Support HTTPS for Ollama
   - Validate SSL certificates
   - Use secure connections

4. **Rate Limiting**
   - Prevent abuse of Ollama server
   - Implement backoff on errors
   - Monitor usage patterns

## Deployment Considerations

### Environment Variables

```bash
# Required
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_MODEL=phi3:latest
OLLAMA_ENABLED=true

# Optional
OLLAMA_API_KEY=your-api-key
OLLAMA_TIMEOUT=30
```

### Docker Deployment

```yaml
# docker-compose.yml
services:
  crawler:
    build: ./news_crawler
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - OLLAMA_ENABLED=true
    depends_on:
      - ollama
      - mongodb

  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
```

### Monitoring

1. **Metrics to Track**
   - Summarization success rate
   - Average summarization time
   - Ollama server uptime
   - Error frequency by type

2. **Logging**
   - Log all summarization attempts
   - Log errors with context
   - Log performance metrics

3. **Alerts**
   - Alert if Ollama is down >5 minutes
   - Alert if success rate <80%
   - Alert if average time >20 seconds

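A small sketch of how the success-rate and timing metrics above could be accumulated during a crawl run, using the result dict returned by `summarize_article()`; the class name and report format are illustrative.

```python
class SummarizationStats:
    """Accumulates the metrics listed above over one crawl run."""

    def __init__(self):
        self.succeeded = 0
        self.failed = 0
        self.total_seconds = 0.0

    def record(self, result):
        if result['success']:
            self.succeeded += 1
        else:
            self.failed += 1
        self.total_seconds += result.get('duration', 0.0)

    def report(self):
        attempts = self.succeeded + self.failed
        rate = self.succeeded / attempts * 100 if attempts else 0.0
        avg = self.total_seconds / attempts if attempts else 0.0
        print(f"Summarized {self.succeeded}/{attempts} articles "
              f"({rate:.0f}% success, avg {avg:.1f}s per article)")
```
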
## Migration Plan

### Phase 1: Add Ollama Client (Week 1)
- Create ollama_client.py
- Add configuration
- Write unit tests
- Test with sample articles

### Phase 2: Integrate with Crawler (Week 1)
- Modify crawler_service.py
- Add summarization step
- Update database schema
- Test end-to-end

### Phase 3: Update Backend API (Week 2)
- Update news routes
- Add summary fields to responses
- Update frontend to display summaries
- Deploy to production

### Phase 4: Monitor and Optimize (Ongoing)
- Monitor performance
- Tune prompts for better summaries
- Optimize rate limiting
- Add batch processing if needed

## Rollback Plan

If issues arise:

1. **Immediate**: Set `OLLAMA_ENABLED=false`
2. **Short-term**: Revert crawler code changes
3. **Long-term**: Remove Ollama integration

System will continue to work with original content if Ollama is disabled.

## Success Metrics

- ✅ 95%+ of articles successfully summarized
- ✅ Average summarization time <15 seconds
- ✅ Zero data loss (all articles stored even if summarization fails)
- ✅ Ollama uptime >99%
- ✅ Summary quality: readable and accurate (manual review)

## Future Enhancements

1. **Multi-language Support**
   - Detect article language
   - Use appropriate model
   - Translate summaries

2. **Custom Summary Lengths**
   - Allow configuration per feed
   - Support different lengths for different use cases

3. **Sentiment Analysis**
   - Add sentiment score
   - Categorize as positive/negative/neutral

4. **Keyword Extraction**
   - Extract key topics
   - Enable better search

5. **Batch Processing**
   - Queue articles
   - Process in parallel
   - Use Celery for async

6. **Caching**
   - Cache summaries
   - Avoid re-processing
   - Use Redis for cache

164
.kiro/specs/ai-article-summarization/requirements.md
Normal file
@@ -0,0 +1,164 @@
# Requirements Document

## Introduction

This feature integrates Ollama AI into the news crawler to automatically summarize articles before storing them in the database. Alongside the full article content, the system stores a concise AI-generated summary of at most 150 words, making the content more digestible for newsletter readers.

## Glossary

- **Crawler Service**: The standalone microservice that fetches and processes article content from RSS feeds
- **Ollama Server**: The AI inference server that provides text summarization capabilities
- **Article Content**: The full text extracted from a news article webpage
- **Summary**: A concise AI-generated version of the article content (max 150 words)
- **MongoDB**: The database where articles and summaries are stored

## Requirements

### Requirement 1: Ollama Integration in Crawler

**User Story:** As a system administrator, I want the crawler to use Ollama for summarization, so that articles are automatically condensed before storage.

#### Acceptance Criteria

1. WHEN the crawler extracts article content, THE Crawler Service SHALL send the content to the Ollama Server for summarization
2. WHEN sending content to Ollama, THE Crawler Service SHALL include a prompt requesting a summary of 150 words or less
3. WHEN Ollama returns a summary, THE Crawler Service SHALL validate that the summary is not empty
4. IF the Ollama Server is unavailable, THEN THE Crawler Service SHALL store the original content without summarization and log a warning
5. WHEN summarization fails, THE Crawler Service SHALL continue processing other articles without stopping

### Requirement 2: Configuration Management

**User Story:** As a system administrator, I want to configure Ollama settings, so that I can control the summarization behavior.

#### Acceptance Criteria

1. THE Crawler Service SHALL read Ollama configuration from environment variables
2. THE Crawler Service SHALL support the following configuration options:
   - OLLAMA_BASE_URL (server URL)
   - OLLAMA_MODEL (model name)
   - OLLAMA_ENABLED (enable/disable flag)
   - OLLAMA_API_KEY (optional authentication)
3. WHERE OLLAMA_ENABLED is false, THE Crawler Service SHALL store original content without summarization
4. WHERE OLLAMA_ENABLED is true AND Ollama is unreachable, THE Crawler Service SHALL log an error and store original content

### Requirement 3: Summary Storage

**User Story:** As a developer, I want summaries stored in the database, so that the frontend can display concise article previews.

#### Acceptance Criteria

1. WHEN a summary is generated, THE Crawler Service SHALL store it in the `summary` field in MongoDB
2. WHEN storing an article, THE Crawler Service SHALL include both the original content and the AI summary
3. THE Crawler Service SHALL store the following fields:
   - `content` (original full text)
   - `summary` (AI-generated, max 150 words)
   - `word_count` (original content word count)
   - `summary_word_count` (summary word count)
   - `summarized_at` (timestamp when summarized)
4. WHEN an article already has a summary, THE Crawler Service SHALL not re-summarize it

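To satisfy criterion 3.4 without separate bookkeeping, the crawler can simply select articles that have content but no stored summary; a sketch with PyMongo (collection and field names as described above):

```python
from pymongo import MongoClient

articles = MongoClient('mongodb://localhost:27017/')['munich_news']['articles']

# Criterion 3.4: only articles with content and no summary are candidates,
# so already-summarized articles are never re-processed.
needs_summary = articles.find({
    'content': {'$exists': True, '$ne': None},
    '$or': [{'summary': {'$exists': False}}, {'summary': None}],
})
```
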
### Requirement 4: Error Handling and Resilience

**User Story:** As a system administrator, I want the crawler to handle AI failures gracefully, so that the system remains reliable.

#### Acceptance Criteria

1. IF Ollama returns an error, THEN THE Crawler Service SHALL log the error and store the original content
2. IF Ollama times out (>30 seconds), THEN THE Crawler Service SHALL cancel the request and store the original content
3. IF the summary is empty or invalid, THEN THE Crawler Service SHALL store the original content
4. WHEN an error occurs, THE Crawler Service SHALL include an error indicator in the database record
5. THE Crawler Service SHALL continue processing remaining articles after any summarization failure

### Requirement 5: Performance and Rate Limiting

**User Story:** As a system administrator, I want the crawler to respect rate limits, so that it doesn't overwhelm the Ollama server.

#### Acceptance Criteria

1. THE Crawler Service SHALL wait at least 1 second between Ollama API calls
2. THE Crawler Service SHALL set a timeout of 30 seconds for each Ollama request
3. WHEN processing multiple articles, THE Crawler Service SHALL process them sequentially to avoid overloading Ollama
4. THE Crawler Service SHALL log the time taken for each summarization
5. THE Crawler Service SHALL display progress indicators showing summarization status

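A minimal sketch of the sequential, rate-limited loop these criteria describe: one article at a time, the 30-second timeout handled inside the client, and at least one second between Ollama calls. The function and field names follow the design document but are assumptions here.

```python
import time


def summarize_pending(articles, client, delay_seconds=1):
    """Process articles one at a time, pausing between Ollama calls."""
    for doc in articles:
        started = time.time()
        # The 30-second request timeout (criterion 5.2) lives inside the client
        result = client.summarize_article(doc['content'], max_words=150)
        status = 'ok' if result['success'] else 'failed'
        print(f"{doc['link']}: {status} in {time.time() - started:.1f}s")
        time.sleep(delay_seconds)  # criterion 5.1: at least 1 second between calls
```
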
### Requirement 6: Monitoring and Logging

**User Story:** As a system administrator, I want detailed logs of summarization activity, so that I can monitor and troubleshoot the system.

#### Acceptance Criteria

1. THE Crawler Service SHALL log when summarization starts for each article
2. THE Crawler Service SHALL log the original word count and summary word count
3. THE Crawler Service SHALL log any errors or warnings from Ollama
4. THE Crawler Service SHALL display a summary of total articles summarized at the end
5. THE Crawler Service SHALL include summarization statistics in the final report

### Requirement 7: API Endpoint Updates

**User Story:** As a frontend developer, I want API endpoints to return summaries, so that I can display them to users.

#### Acceptance Criteria

1. WHEN fetching articles via GET /api/news, THE Backend API SHALL include the `summary` field if available
2. WHEN fetching a single article via GET /api/news/<url>, THE Backend API SHALL include both `content` and `summary`
3. THE Backend API SHALL include a `has_summary` boolean field indicating if AI summarization was performed
4. THE Backend API SHALL include `summarized_at` timestamp if available
5. WHERE no summary exists, THE Backend API SHALL return a preview of the original content (first 200 chars)

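A sketch of the response shaping implied by criteria 7.3 and 7.5: a `has_summary` flag plus a 200-character content preview when no summary exists. The helper name and the exact set of returned fields are illustrative.

```python
def to_api_article(doc):
    """Shape a MongoDB article document for the /api/news response."""
    summary = doc.get('summary')
    content = doc.get('content') or ''
    return {
        'title': doc.get('title'),
        'link': doc.get('link'),
        'source': doc.get('source'),
        'has_summary': bool(summary),          # criterion 7.3
        'summary': summary or content[:200],   # criterion 7.5 fallback preview
        'summarized_at': doc.get('summarized_at'),
    }
```
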
### Requirement 8: Backward Compatibility

**User Story:** As a developer, I want the system to work with existing articles, so that no data migration is required.

#### Acceptance Criteria

1. THE Crawler Service SHALL work with articles that don't have summaries
2. THE Backend API SHALL handle articles with or without summaries gracefully
3. WHERE an article has no summary, THE Backend API SHALL generate a preview from the content field
4. THE Crawler Service SHALL not re-process articles that already have summaries
5. THE system SHALL continue to function if Ollama is disabled or unavailable

## Non-Functional Requirements

### Performance
- Summarization SHALL complete within 30 seconds per article
- The crawler SHALL process at least 10 articles per minute (including summarization)
- Database operations SHALL not be significantly slower with summary storage

### Reliability
- The system SHALL maintain 99% uptime even if Ollama is unavailable
- Failed summarizations SHALL not prevent article storage
- The crawler SHALL recover from Ollama errors without manual intervention

### Security
- Ollama API keys SHALL be stored in environment variables, not in code
- Article content SHALL not be logged to prevent sensitive data exposure
- API communication with Ollama SHALL support HTTPS

### Scalability
- The system SHALL support multiple Ollama servers for load balancing (future)
- The crawler SHALL handle articles of any length (up to 50,000 words)
- The database schema SHALL support future enhancements (tags, categories, etc.)

## Dependencies

- Ollama server must be running and accessible
- `requests` Python library for HTTP communication
- Environment variables properly configured
- MongoDB with sufficient storage for both content and summaries

## Assumptions

- Ollama server is already set up and configured
- The phi3:latest model (or configured model) supports summarization tasks
- Network connectivity between crawler and Ollama server is reliable
- Articles are in English or the configured Ollama model supports the article language

## Future Enhancements

- Support for multiple languages
- Customizable summary length
- Sentiment analysis integration
- Keyword extraction
- Category classification
- Batch summarization for improved performance
- Caching of summaries to avoid re-processing

92
.kiro/specs/ai-article-summarization/tasks.md
Normal file
@@ -0,0 +1,92 @@
# Implementation Plan

- [x] 1. Create Ollama client module
  - Create `news_crawler/ollama_client.py` with OllamaClient class
  - Implement `summarize_article()` method with prompt construction and API call
  - Implement `is_available()` method for health checks
  - Implement `test_connection()` method for diagnostics
  - Add timeout handling (30 seconds)
  - Add error handling for connection, timeout, and invalid responses
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 4.1, 4.2, 4.3, 5.2_

- [x] 2. Create configuration module for crawler
  - Create `news_crawler/config.py` with Config class
  - Load environment variables (OLLAMA_BASE_URL, OLLAMA_MODEL, OLLAMA_ENABLED, OLLAMA_API_KEY, OLLAMA_TIMEOUT)
  - Add validation for required configuration
  - Add default values for optional configuration
  - _Requirements: 2.1, 2.2, 2.3, 2.4_

- [x] 3. Integrate Ollama client into crawler service
  - Import OllamaClient in `news_crawler/crawler_service.py`
  - Initialize Ollama client at module level using Config
  - Modify `crawl_rss_feed()` to call summarization after content extraction
  - Add conditional logic to skip summarization if OLLAMA_ENABLED is false
  - Add error handling to continue processing if summarization fails
  - Add logging for summarization start, success, and failure
  - Add rate limiting delay after summarization
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 2.3, 2.4, 4.1, 4.5, 5.1, 5.3, 6.1, 6.2, 6.3_

- [x] 4. Update database schema and storage
  - Modify article document structure in `crawl_rss_feed()` to include:
    - `summary` field (AI-generated summary)
    - `summary_word_count` field
    - `summarized_at` field (timestamp)
  - Update MongoDB upsert logic to handle new fields
  - Add check to skip re-summarization if article already has summary
  - _Requirements: 3.1, 3.2, 3.3, 3.4, 8.4_

- [x] 5. Update backend API to return summaries
  - Modify `backend/routes/news_routes.py` GET /api/news endpoint
  - Add `summary`, `summary_word_count`, `summarized_at` fields to response
  - Add `has_summary` boolean field to indicate if AI summarization was performed
  - Modify GET /api/news/<url> endpoint to include summary fields
  - Add fallback to content preview if no summary exists
  - _Requirements: 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3_

- [x] 6. Update database schema documentation
  - Update `backend/DATABASE_SCHEMA.md` with new summary fields
  - Add example document showing summary fields
  - Document the summarization workflow
  - _Requirements: 3.1, 3.2, 3.3_

- [x] 7. Add environment variable configuration
  - Update `backend/env.template` with Ollama configuration
  - Add comments explaining each Ollama setting
  - Document default values
  - _Requirements: 2.1, 2.2_

- [x] 8. Create test script for Ollama integration
  - Create `news_crawler/test_ollama.py` to test Ollama connection
  - Test summarization with sample article
  - Test error handling (timeout, connection failure)
  - Display configuration and connection status
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 4.1, 4.2_

- [x] 9. Update crawler statistics and logging
  - Add summarization statistics to final report in `crawl_all_feeds()`
  - Track total articles summarized vs failed
  - Log average summarization time
  - Display progress indicators during summarization
  - _Requirements: 5.4, 6.1, 6.2, 6.3, 6.4, 6.5_

- [x] 10. Create documentation for AI summarization
  - Create `news_crawler/AI_SUMMARIZATION.md` explaining the feature
  - Document configuration options
  - Provide troubleshooting guide
  - Add examples of usage
  - _Requirements: 2.1, 2.2, 2.3, 2.4, 6.1, 6.2, 6.3_

- [x] 11. Update main README with AI summarization info
  - Add section about AI summarization feature
  - Document Ollama setup requirements
  - Add configuration examples
  - Update API endpoint documentation
  - _Requirements: 2.1, 2.2, 7.1, 7.2_

- [x] 12. Test end-to-end workflow
  - Run crawler with Ollama enabled
  - Verify articles are summarized correctly
  - Check database contains all expected fields
  - Test API endpoints return summaries
  - Verify error handling when Ollama is disabled/unavailable
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5, 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3, 8.4, 8.5_

209
ARCHITECTURE.md
Normal file
@@ -0,0 +1,209 @@
# Munich News Daily - Architecture

## System Overview

```
┌─────────────────────────────────────────────────────────────┐
│                       Users / Browsers                       │
└────────────────────────┬────────────────────────────────────┘
                         │
                         ▼
┌─────────────────────────────────────────────────────────────┐
│                    Frontend (Port 3000)                      │
│                Node.js + Express + Vanilla JS                │
│  - Subscription form                                         │
│  - News display                                              │
│  - RSS feed management UI (future)                           │
└────────────────────────┬────────────────────────────────────┘
                         │ HTTP/REST
                         ▼
┌─────────────────────────────────────────────────────────────┐
│                   Backend API (Port 5001)                    │
│                       Flask + Python                         │
│                                                              │
│  ┌──────────────────────────────────────────────────────┐   │
│  │  Routes (Blueprints)                                  │   │
│  │  - subscription_routes.py (subscribe/unsubscribe)     │   │
│  │  - news_routes.py (get news, stats)                   │   │
│  │  - rss_routes.py (manage RSS feeds)                   │   │
│  │  - ollama_routes.py (AI features)                     │   │
│  └──────────────────────────────────────────────────────┘   │
│                                                              │
│  ┌──────────────────────────────────────────────────────┐   │
│  │  Services (Business Logic)                            │   │
│  │  - news_service.py (fetch & save articles)            │   │
│  │  - email_service.py (send newsletters)                │   │
│  │  - ollama_service.py (AI integration)                 │   │
│  └──────────────────────────────────────────────────────┘   │
│                                                              │
│  ┌──────────────────────────────────────────────────────┐   │
│  │  Core                                                 │   │
│  │  - config.py (configuration)                          │   │
│  │  - database.py (DB connection)                        │   │
│  └──────────────────────────────────────────────────────┘   │
└────────────────────────┬────────────────────────────────────┘
                         │
                         ▼
┌─────────────────────────────────────────────────────────────┐
│                     MongoDB (Port 27017)                     │
│                                                              │
│  Collections:                                                │
│  - articles (news articles with full content)                │
│  - subscribers (email subscribers)                           │
│  - rss_feeds (RSS feed sources)                              │
└─────────────────────────┬───────────────────────────────────┘
                          │
                          │ Read/Write
                          │
┌─────────────────────────┴───────────────────────────────────┐
│                  News Crawler Microservice                   │
│                       (Standalone)                           │
│                                                              │
│  - Fetches RSS feeds from MongoDB                            │
│  - Crawls full article content                               │
│  - Extracts text, metadata, word count                       │
│  - Stores back to MongoDB                                    │
│  - Can run independently or scheduled                        │
└──────────────────────────────────────────────────────────────┘

                          │
                          │ (Optional)
                          ▼
┌─────────────────────────────────────────────────────────────┐
│                 Ollama AI Server (Port 11434)                │
│                    (Optional, External)                      │
│                                                              │
│  - Article summarization                                     │
│  - Content analysis                                          │
│  - AI-powered features                                       │
└──────────────────────────────────────────────────────────────┘
```

## Component Details

### Frontend (Port 3000)
- **Technology**: Node.js, Express, Vanilla JavaScript
- **Responsibilities**:
  - User interface
  - Subscription management
  - News display
  - API proxy to backend
- **Communication**: HTTP REST to Backend

### Backend API (Port 5001)
- **Technology**: Python, Flask
- **Architecture**: Modular with Blueprints
- **Responsibilities**:
  - REST API endpoints
  - Business logic
  - Database operations
  - Email sending
  - AI integration
- **Communication**:
  - HTTP REST from Frontend
  - MongoDB driver to Database
  - HTTP to Ollama (optional)

### MongoDB (Port 27017)
- **Technology**: MongoDB 7.0
- **Responsibilities**:
  - Persistent data storage
  - Articles, subscribers, RSS feeds
- **Communication**: MongoDB protocol

### News Crawler (Standalone)
- **Technology**: Python, BeautifulSoup
- **Architecture**: Microservice (can run independently)
- **Responsibilities**:
  - Fetch RSS feeds
  - Crawl article content
  - Extract and clean text
  - Store in database
- **Communication**: MongoDB driver to Database
- **Execution**:
  - Manual: `python crawler_service.py`
  - Scheduled: Cron, systemd, Docker
  - On-demand: Via backend API (future)

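For the scheduled option, a cron job or systemd timer is the usual choice; purely as an illustration, the crawler could also schedule itself with a plain loop around `crawl_all_feeds()` (the entry point referenced in the crawler docs) — whether that function is importable this way is an assumption.

```python
import time

from crawler_service import crawl_all_feeds

INTERVAL_HOURS = 6  # assumed schedule; adjust as needed

if __name__ == '__main__':
    while True:
        crawl_all_feeds()
        time.sleep(INTERVAL_HOURS * 3600)
```
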
### Ollama AI Server (Optional, External)
- **Technology**: Ollama
- **Responsibilities**:
  - AI model inference
  - Text summarization
  - Content analysis
- **Communication**: HTTP REST API

## Data Flow

### 1. News Aggregation Flow
```
RSS Feeds → Backend (news_service) → MongoDB (articles)
```

### 2. Content Crawling Flow
```
MongoDB (rss_feeds) → Crawler → Article URLs →
    Web Scraping → MongoDB (articles with full_content)
```

### 3. Subscription Flow
```
User → Frontend → Backend (subscription_routes) →
    MongoDB (subscribers)
```

### 4. Newsletter Flow (Future)
```
Scheduler → Backend (email_service) →
    MongoDB (articles + subscribers) → SMTP → Users
```

### 5. AI Processing Flow (Optional)
```
MongoDB (articles) → Backend (ollama_service) →
    Ollama Server → AI Summary → MongoDB (articles)
```

## Deployment Options

### Development
- All services run locally
- MongoDB via Docker Compose
- Manual crawler execution

### Production
- Backend: Cloud VM, Container, or PaaS
- Frontend: Static hosting or same server
- MongoDB: MongoDB Atlas or self-hosted
- Crawler: Scheduled job (cron, systemd timer)
- Ollama: Separate GPU server (optional)

## Scalability Considerations

### Current Architecture
- Monolithic backend (single Flask instance)
- Standalone crawler (can run multiple instances)
- Shared MongoDB

### Future Improvements
- Load balancer for backend
- Message queue for crawler jobs (Celery + Redis)
- Caching layer (Redis)
- CDN for frontend
- Read replicas for MongoDB

## Security

- CORS enabled for frontend-backend communication
- MongoDB authentication (production)
- Environment variables for secrets
- Input validation on all endpoints
- Rate limiting (future)

## Monitoring (Future)

- Application logs
- MongoDB metrics
- Crawler success/failure tracking
- API response times
- Error tracking (Sentry)

136
CHANGELOG.md
Normal file
@@ -0,0 +1,136 @@
# Changelog

## [Unreleased] - 2024-11-10

### Added - Major Refactoring

#### Backend Modularization
- ✅ Restructured backend into modular architecture
- ✅ Created separate route blueprints:
  - `subscription_routes.py` - User subscriptions
  - `news_routes.py` - News fetching and stats
  - `rss_routes.py` - RSS feed management (CRUD)
  - `ollama_routes.py` - AI integration
- ✅ Created service layer:
  - `news_service.py` - News fetching logic
  - `email_service.py` - Newsletter sending
  - `ollama_service.py` - AI communication
- ✅ Centralized configuration in `config.py`
- ✅ Separated database logic in `database.py`
- ✅ Reduced main `app.py` from 700+ lines to 27 lines

#### RSS Feed Management
- ✅ Dynamic RSS feed management via API
- ✅ Add/remove/list/toggle RSS feeds without code changes
- ✅ Unique index on RSS feed URLs (prevents duplicates)
- ✅ Default feeds auto-initialized on first run
- ✅ Created `fix_duplicates.py` utility script

#### News Crawler Microservice
- ✅ Created standalone `news_crawler/` microservice
- ✅ Web scraping with BeautifulSoup
- ✅ Smart content extraction using multiple selectors
- ✅ Full article content storage in MongoDB
- ✅ Word count calculation
- ✅ Duplicate prevention (skips already-crawled articles)
- ✅ Rate limiting (1 second between requests)
- ✅ Can run independently or scheduled
- ✅ Docker support for crawler
- ✅ Comprehensive documentation

#### API Endpoints
New endpoints added:
- `GET /api/rss-feeds` - List all RSS feeds
- `POST /api/rss-feeds` - Add new RSS feed
- `DELETE /api/rss-feeds/<id>` - Remove RSS feed
- `PATCH /api/rss-feeds/<id>/toggle` - Toggle feed active status

#### Documentation
- ✅ Created `ARCHITECTURE.md` - System architecture overview
- ✅ Created `backend/STRUCTURE.md` - Backend structure guide
- ✅ Created `news_crawler/README.md` - Crawler documentation
- ✅ Created `news_crawler/QUICKSTART.md` - Quick start guide
- ✅ Created `news_crawler/test_crawler.py` - Test suite
- ✅ Updated main `README.md` with new features
- ✅ Updated `DATABASE_SCHEMA.md` with new fields

#### Configuration
- ✅ Added `FLASK_PORT` environment variable
- ✅ Fixed `OLLAMA_MODEL` typo in `.env`
- ✅ Port 5001 default to avoid macOS AirPlay conflict

### Changed
- Backend structure: Monolithic → Modular
- RSS feeds: Hardcoded → Database-driven
- Article storage: Summary only → Full content support
- Configuration: Scattered → Centralized

### Technical Improvements
- Separation of concerns (routes vs services)
- Better testability
- Easier maintenance
- Scalable architecture
- Independent microservices
- Proper error handling
- Comprehensive logging

### Database Schema Updates
Articles collection now includes:
- `full_content` - Full article text
- `word_count` - Number of words
- `crawled_at` - When content was crawled

RSS Feeds collection added:
- `name` - Feed name
- `url` - Feed URL (unique)
- `active` - Active status
- `created_at` - Creation timestamp

### Files Added
```
backend/
├── config.py
├── database.py
├── fix_duplicates.py
├── STRUCTURE.md
├── routes/
│   ├── __init__.py
│   ├── subscription_routes.py
│   ├── news_routes.py
│   ├── rss_routes.py
│   └── ollama_routes.py
└── services/
    ├── __init__.py
    ├── news_service.py
    ├── email_service.py
    └── ollama_service.py

news_crawler/
├── crawler_service.py
├── test_crawler.py
├── requirements.txt
├── .gitignore
├── Dockerfile
├── docker-compose.yml
├── README.md
└── QUICKSTART.md

Root:
├── ARCHITECTURE.md
└── CHANGELOG.md
```

### Files Removed
- Old monolithic `backend/app.py` (replaced with modular version)

### Next Steps (Future Enhancements)
- [ ] Frontend UI for RSS feed management
- [ ] Automatic article summarization with Ollama
- [ ] Scheduled newsletter sending
- [ ] Article categorization and tagging
- [ ] Search functionality
- [ ] User preferences (categories, frequency)
- [ ] Analytics dashboard
- [ ] API rate limiting
- [ ] Caching layer (Redis)
- [ ] Message queue for crawler (Celery)

206
QUICK_REFERENCE.md
Normal file
@@ -0,0 +1,206 @@
# Quick Reference Guide

## Starting the Application

### 1. Start MongoDB
```bash
docker-compose up -d
```

### 2. Start Backend (Port 5001)
```bash
cd backend
source venv/bin/activate  # or: venv\Scripts\activate on Windows
python app.py
```

### 3. Start Frontend (Port 3000)
```bash
cd frontend
npm start
```

### 4. Run Crawler (Optional)
```bash
cd news_crawler
pip install -r requirements.txt
python crawler_service.py 10
```

## Common Commands

### RSS Feed Management

**List all feeds:**
```bash
curl http://localhost:5001/api/rss-feeds
```

**Add a feed:**
```bash
curl -X POST http://localhost:5001/api/rss-feeds \
  -H "Content-Type: application/json" \
  -d '{"name": "Feed Name", "url": "https://example.com/rss"}'
```

**Remove a feed:**
```bash
curl -X DELETE http://localhost:5001/api/rss-feeds/<feed_id>
```

**Toggle feed status:**
```bash
curl -X PATCH http://localhost:5001/api/rss-feeds/<feed_id>/toggle
```

### News & Subscriptions

**Get latest news:**
```bash
curl http://localhost:5001/api/news
```

**Subscribe:**
```bash
curl -X POST http://localhost:5001/api/subscribe \
  -H "Content-Type: application/json" \
  -d '{"email": "user@example.com"}'
```

**Get stats:**
```bash
curl http://localhost:5001/api/stats
```

### Ollama (AI)

**Test connection:**
```bash
curl http://localhost:5001/api/ollama/ping
```

**List models:**
```bash
curl http://localhost:5001/api/ollama/models
```

### Database

**Connect to MongoDB:**
```bash
mongosh
use munich_news
```

**Check articles:**
```javascript
db.articles.find().limit(5)
db.articles.countDocuments()
db.articles.countDocuments({full_content: {$exists: true}})
```

**Check subscribers:**
```javascript
db.subscribers.find()
db.subscribers.countDocuments({status: "active"})
```

**Check RSS feeds:**
```javascript
db.rss_feeds.find()
```

## File Locations

### Configuration
- Backend: `backend/.env`
- Frontend: `frontend/package.json`
- Crawler: Uses backend's `.env` or own `.env`

### Logs
- Backend: Terminal output
- Frontend: Terminal output
- Crawler: Terminal output

### Database
- MongoDB data: Docker volume `mongodb_data`
- Database name: `munich_news`

## Ports

| Service | Port | URL |
|---------|------|-----|
| Frontend | 3000 | http://localhost:3000 |
| Backend | 5001 | http://localhost:5001 |
| MongoDB | 27017 | mongodb://localhost:27017 |
| Ollama | 11434 | http://localhost:11434 |

## Troubleshooting

### Backend won't start
- Check if port 5001 is available
- Verify MongoDB is running
- Check `.env` file exists

### Frontend can't connect
- Verify backend is running on port 5001
- Check CORS settings
- Check API_URL in frontend

### Crawler fails
- Install dependencies: `pip install -r requirements.txt`
- Check MongoDB connection
- Verify RSS feeds exist in database

### MongoDB connection error
- Start MongoDB: `docker-compose up -d`
- Check connection string in `.env`
- Verify port 27017 is not blocked

### Port 5000 conflict (macOS)
- AirPlay uses port 5000
- Use port 5001 instead (set in `.env`)
- Or disable AirPlay Receiver in System Preferences

## Project Structure

```
munich-news/
├── backend/            # Main API (Flask)
├── frontend/           # Web UI (Express + JS)
├── news_crawler/       # Crawler microservice
├── .env                # Environment variables
└── docker-compose.yml  # MongoDB setup
```

## Environment Variables

### Backend (.env)
```env
MONGODB_URI=mongodb://localhost:27017/
FLASK_PORT=5001
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=587
EMAIL_USER=your-email@gmail.com
EMAIL_PASSWORD=your-app-password
OLLAMA_BASE_URL=http://127.0.0.1:11434
OLLAMA_MODEL=phi3:latest
OLLAMA_ENABLED=true
```

## Development Workflow

1. **Add RSS Feed** → Backend API
2. **Run Crawler** → Fetches full content
3. **View News** → Frontend displays articles
4. **Users Subscribe** → Via frontend form
5. **Send Newsletter** → Manual or scheduled

## Useful Links

- Frontend: http://localhost:3000
- Backend API: http://localhost:5001
- MongoDB: mongodb://localhost:27017
- Architecture: See `ARCHITECTURE.md`
- Backend Structure: See `backend/STRUCTURE.md`
- Crawler Guide: See `news_crawler/README.md`

327
README.md
Normal file
@@ -0,0 +1,327 @@
# Munich News Daily 📰

A TLDR/Morning Brew-style news email platform specifically for Munich. Get the latest Munich news delivered to your inbox every morning.

## Features

- 📧 Email newsletter subscription system
- 📰 Aggregated news from multiple Munich news sources
- 🎨 Beautiful, modern web interface
- 📊 Subscription statistics
- 🔄 Real-time news updates

## Tech Stack

- **Backend**: Python (Flask) - Modular architecture with blueprints
- **Frontend**: Node.js (Express + Vanilla JavaScript)
- **Database**: MongoDB
- **News Crawler**: Standalone Python microservice
- **News Sources**: RSS feeds from major Munich news outlets

## Setup Instructions

### Prerequisites

- Python 3.8+
- Node.js 14+
- npm or yarn
- Docker and Docker Compose (recommended for MongoDB) OR MongoDB (local installation or MongoDB Atlas account)

### Backend Setup

1. Navigate to the backend directory:
```bash
cd backend
```

2. Create a virtual environment (recommended):
```bash
python3 -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

3. Install dependencies:
```bash
pip install -r requirements.txt
```

4. Set up MongoDB using Docker Compose (recommended):
```bash
# From the project root directory
docker-compose up -d
```

This will start MongoDB in a Docker container. The database will be available at `mongodb://localhost:27017/`

**Useful Docker commands:**
```bash
# Start MongoDB
docker-compose up -d

# Stop MongoDB
docker-compose down

# View MongoDB logs
docker-compose logs -f mongodb

# Restart MongoDB
docker-compose restart mongodb

# Remove MongoDB and all data (WARNING: deletes all data)
docker-compose down -v
```

**Alternative options:**
- **Local MongoDB**: Install MongoDB locally and make sure it's running
- **MongoDB Atlas** (Cloud): Create a free account at [mongodb.com/cloud/atlas](https://www.mongodb.com/cloud/atlas) and get your connection string

5. Create a `.env` file in the backend directory:
```bash
# Copy the template file
cp env.template .env
```

Then edit `.env` with your configuration:
```env
# MongoDB connection (default: mongodb://localhost:27017/)
# For Docker Compose (no authentication):
MONGODB_URI=mongodb://localhost:27017/
# For Docker Compose with authentication (if you modify docker-compose.yml):
# MONGODB_URI=mongodb://admin:password@localhost:27017/
# Or for MongoDB Atlas:
# MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/

# Email configuration (optional for testing)
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=587
EMAIL_USER=your-email@gmail.com
EMAIL_PASSWORD=your-app-password

# Ollama Configuration (for AI-powered features)
# Remote Ollama server URL
OLLAMA_BASE_URL=http://your-remote-server-ip:11434
# Optional: API key if your Ollama server requires authentication
# OLLAMA_API_KEY=your-api-key-here
# Model name to use (e.g., llama2, mistral, codellama, llama3)
OLLAMA_MODEL=llama2
# Enable/disable Ollama features (true/false)
OLLAMA_ENABLED=false
```

**Notes:**
- For Gmail, you'll need to use an [App Password](https://support.google.com/accounts/answer/185833) instead of your regular password.
- For Ollama, replace `your-remote-server-ip` with your actual server IP or domain. Set `OLLAMA_ENABLED=true` to enable AI features.

6. Run the backend server:
```bash
python app.py
```

The backend will run on `http://localhost:5001` (port 5001 to avoid conflict with AirPlay on macOS)

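Once the backend is up, a quick way to confirm both the API and the Ollama configuration is the ping endpoint documented under API Endpoints below; a small sketch using Python's `requests` (a plain `curl http://localhost:5001/api/ollama/ping` works just as well).

```python
import requests

# Hits the backend's Ollama ping endpoint and prints its reported status
resp = requests.get('http://localhost:5001/api/ollama/ping', timeout=10)
print(resp.status_code, resp.json().get('status'))  # expect "success" or "disabled"
```
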
### Frontend Setup
|
||||
|
||||
1. Navigate to the frontend directory:
|
||||
```bash
|
||||
cd frontend
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
3. Run the frontend server:
|
||||
```bash
|
||||
npm start
|
||||
```
|
||||
|
||||
The frontend will run on `http://localhost:3000`
|
||||
|
||||
## Usage
|
||||
|
||||
1. Open your browser and go to `http://localhost:3000`
|
||||
2. Enter your email address to subscribe to the newsletter
|
||||
3. View the latest Munich news on the homepage
|
||||
4. The backend will aggregate news from multiple Munich news sources
|
||||
|
||||
## Sending Newsletters
|
||||
|
||||
To send newsletters to all subscribers, you can add a scheduled task or manually trigger the `send_newsletter()` function in `app.py`. For production, consider using:
|
||||
|
||||
- **Cron jobs** (Linux/Mac)
|
||||
- **Task Scheduler** (Windows)
|
||||
- **Celery** with Redis/RabbitMQ for more advanced scheduling
|
||||
- **Cloud functions** (AWS Lambda, Google Cloud Functions)
|
||||
|
||||
Example cron job to send daily at 8 AM:
|
||||
```
|
||||
0 8 * * * cd /path/to/munich-news/backend && python -c "from app import send_newsletter; send_newsletter()"
|
||||
```
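
If you prefer to stay in Python rather than cron, a small scheduler loop works as well. This is a sketch using the third-party `schedule` package (not part of the project's requirements, so install it separately) and it assumes it is run from the `backend/` directory so that `services.email_service` is importable:

```python
# Sketch: daily newsletter trigger without cron.
# Assumes `pip install schedule` and that this script runs from backend/.
import time

import schedule

from services.email_service import send_newsletter

# Send every day at 08:00 local time
schedule.every().day.at("08:00").do(send_newsletter)

if __name__ == "__main__":
    while True:
        schedule.run_pending()
        time.sleep(60)  # check once a minute
```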
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
munich-news/
|
||||
├── backend/ # Main API server
|
||||
│ ├── app.py # Flask application entry point
|
||||
│ ├── config.py # Configuration management
|
||||
│ ├── database.py # Database connection
|
||||
│ ├── routes/ # API endpoints (blueprints)
|
||||
│ ├── services/ # Business logic
|
||||
│ ├── templates/ # Email templates
|
||||
│ └── requirements.txt # Python dependencies
|
||||
├── news_crawler/ # Crawler microservice
|
||||
│ ├── crawler_service.py # Standalone crawler
|
||||
│ ├── ollama_client.py # AI summarization client
|
||||
│ ├── requirements.txt # Crawler dependencies
|
||||
│ └── README.md # Crawler documentation
|
||||
├── news_sender/ # Newsletter sender microservice
|
||||
│ ├── sender_service.py # Standalone email sender
|
||||
│ ├── newsletter_template.html # Email template
|
||||
│ ├── requirements.txt # Sender dependencies
|
||||
│ └── README.md # Sender documentation
|
||||
├── frontend/ # Web interface
|
||||
│ ├── server.js # Express server
|
||||
│ ├── package.json # Node.js dependencies
|
||||
│ └── public/
|
||||
│ ├── index.html # Main page
|
||||
│ ├── styles.css # Styling
|
||||
│ └── app.js # Frontend JavaScript
|
||||
├── docker-compose.yml # Docker Compose for MongoDB (development)
|
||||
├── docker-compose.prod.yml # Docker Compose with authentication (production)
|
||||
└── README.md
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### `POST /api/subscribe`
|
||||
Subscribe to the newsletter
|
||||
- Body: `{ "email": "user@example.com" }`
|
||||
|
||||
### `POST /api/unsubscribe`
|
||||
Unsubscribe from the newsletter
|
||||
- Body: `{ "email": "user@example.com" }`
|
||||
|
||||
### `GET /api/news`
|
||||
Get latest Munich news articles
|
||||
|
||||
### `GET /api/stats`
|
||||
Get subscription statistics
|
||||
- Returns: `{ "subscribers": number, "articles": number, "crawled_articles": number }`
|
||||
|
||||
### `GET /api/news/<article_url>`
|
||||
Get full article content by URL
|
||||
- Returns: Full article with content, author, word count, etc.
|
||||
|
||||
### `GET /api/ollama/ping`
|
||||
Test connection to Ollama server
|
||||
- Returns: Connection status and Ollama configuration
|
||||
- Response examples:
|
||||
- Success: `{ "status": "success", "message": "...", "response": "...", "ollama_config": {...} }`
|
||||
- Disabled: `{ "status": "disabled", "message": "...", "ollama_config": {...} }`
|
||||
- Error: `{ "status": "error", "message": "...", "error_details": "...", "troubleshooting": {...}, "ollama_config": {...} }`
|
||||
|
||||
### `GET /api/ollama/models`
|
||||
List available models on Ollama server
|
||||
- Returns: List of available models and current configuration
|
||||
- Response: `{ "status": "success", "models": [...], "current_model": "...", "ollama_config": {...} }`
|
||||
|
||||
### `GET /api/rss-feeds`
|
||||
Get all RSS feeds
|
||||
- Returns: `{ "feeds": [...] }`
|
||||
|
||||
### `POST /api/rss-feeds`
|
||||
Add a new RSS feed
|
||||
- Body: `{ "name": "Feed Name", "url": "https://example.com/rss" }`
|
||||
- Returns: `{ "message": "...", "id": "..." }`
|
||||
|
||||
### `DELETE /api/rss-feeds/<feed_id>`
|
||||
Remove an RSS feed
|
||||
- Returns: `{ "message": "..." }`
|
||||
|
||||
### `PATCH /api/rss-feeds/<feed_id>/toggle`
|
||||
Toggle RSS feed active status
|
||||
- Returns: `{ "message": "...", "active": boolean }`
|
||||
|
||||
## Database Schema
|
||||
|
||||
### Articles Collection
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId,
|
||||
title: String,
|
||||
link: String (unique),
|
||||
summary: String,
|
||||
source: String,
|
||||
published_at: String,
|
||||
created_at: DateTime
|
||||
}
|
||||
```
|
||||
|
||||
### Subscribers Collection
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId,
|
||||
email: String (unique, lowercase),
|
||||
subscribed_at: DateTime,
|
||||
status: String ('active' | 'inactive')
|
||||
}
|
||||
```
|
||||
|
||||
**Indexes:**
|
||||
- `articles.link` - Unique index to prevent duplicate articles
|
||||
- `articles.created_at` - For efficient sorting
|
||||
- `subscribers.email` - Unique index for email lookups
|
||||
- `subscribers.subscribed_at` - For analytics
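
These indexes are created automatically by `init_db()` in `backend/database.py`. For reference, the equivalent PyMongo calls look like this (sketch, assuming the default local connection):

```python
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017/")["munich_news"]

# Unique index to prevent duplicate articles
db["articles"].create_index("link", unique=True)
# For efficient sorting of the news feed
db["articles"].create_index("created_at")
# Unique index for email lookups
db["subscribers"].create_index("email", unique=True)
# For analytics
db["subscribers"].create_index("subscribed_at")
```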
|
||||
|
||||
## News Crawler Microservice
|
||||
|
||||
The project includes a standalone crawler microservice that fetches full article content from RSS feeds.
|
||||
|
||||
### Running the Crawler
|
||||
|
||||
```bash
|
||||
cd news_crawler
|
||||
|
||||
# Install dependencies
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Run crawler
|
||||
python crawler_service.py 10
|
||||
```
|
||||
|
||||
See `news_crawler/README.md` for detailed documentation.
|
||||
|
||||
### What It Does
|
||||
|
||||
- Crawls full article content from RSS feed links
|
||||
- Extracts text, word count, and metadata
|
||||
- Stores in MongoDB for AI processing
|
||||
- Skips already-crawled articles
|
||||
- Rate-limited (1 second between requests)
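
In outline, the crawl loop combines the last three points (skip, store, rate-limit). This is a simplified sketch of the idea, not the crawler's actual code; `extract_article_content` is a hypothetical stand-in for its extraction logic:

```python
import time
from datetime import datetime

from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017/")["munich_news"]
articles = db["articles"]


def crawl_pending(limit=10):
    # Only articles with no full content yet (skip already-crawled ones)
    pending = articles.find({"content": {"$exists": False}}).limit(limit)
    for doc in pending:
        content = extract_article_content(doc["link"])  # hypothetical helper
        articles.update_one(
            {"_id": doc["_id"]},
            {"$set": {
                "content": content,
                "word_count": len(content.split()),
                "crawled_at": datetime.utcnow(),
            }},
        )
        time.sleep(1)  # rate limit: 1 second between requests
```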
|
||||
|
||||
## Customization
|
||||
|
||||
### Adding News Sources
|
||||
|
||||
Use the API to add RSS feeds dynamically:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:5001/api/rss-feeds \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name": "Your Source Name", "url": "https://example.com/rss"}'
|
||||
```
|
||||
|
||||
### Styling
|
||||
|
||||
Modify `frontend/public/styles.css` to customize the appearance.
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
|
||||
## Contributing
|
||||
|
||||
Feel free to submit issues and enhancement requests!
|
||||
|
||||
132
TEST_INSTRUCTIONS.md
Normal file
@@ -0,0 +1,132 @@
|
||||
# Testing RSS Feed URL Extraction
|
||||
|
||||
## Quick Test (Recommended)
|
||||
|
||||
Run this from the project root with backend virtual environment activated:
|
||||
|
||||
```bash
|
||||
# 1. Activate backend virtual environment
|
||||
cd backend
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
|
||||
# 2. Go back to project root
|
||||
cd ..
|
||||
|
||||
# 3. Run the test
|
||||
python test_feeds_quick.py
|
||||
```
|
||||
|
||||
This will:
|
||||
- ✓ Check what RSS feeds are in your database
|
||||
- ✓ Fetch each feed
|
||||
- ✓ Test URL extraction on first 3 articles
|
||||
- ✓ Show what fields are available
|
||||
- ✓ Verify summary and date extraction
|
||||
|
||||
## Expected Output
|
||||
|
||||
```
|
||||
================================================================================
|
||||
RSS Feed Test - Checking Database Feeds
|
||||
================================================================================
|
||||
|
||||
✓ Found 3 feed(s) in database
|
||||
|
||||
================================================================================
|
||||
Feed: Süddeutsche Zeitung München
|
||||
URL: https://www.sueddeutsche.de/muenchen/rss
|
||||
Active: True
|
||||
================================================================================
|
||||
Fetching RSS feed...
|
||||
✓ Found 20 entries
|
||||
|
||||
--- Entry 1 ---
|
||||
Title: New U-Bahn Line Opens in Munich
|
||||
✓ URL extracted: https://www.sueddeutsche.de/muenchen/article-123
|
||||
✓ Summary: The new U-Bahn line connecting the city center...
|
||||
✓ Date: Mon, 10 Nov 2024 10:00:00 +0100
|
||||
|
||||
--- Entry 2 ---
|
||||
Title: Munich Weather Update
|
||||
✓ URL extracted: https://www.sueddeutsche.de/muenchen/article-124
|
||||
✓ Summary: Weather forecast for the week...
|
||||
✓ Date: Mon, 10 Nov 2024 09:30:00 +0100
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
## If No Feeds Found
|
||||
|
||||
Add a feed first:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:5001/api/rss-feeds \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name": "Süddeutsche Politik", "url": "https://rss.sueddeutsche.de/rss/Politik"}'
|
||||
```
|
||||
|
||||
## Testing News Crawler
|
||||
|
||||
Once feeds are verified, test the crawler:
|
||||
|
||||
```bash
|
||||
# 1. Install crawler dependencies
|
||||
cd news_crawler
|
||||
pip install -r requirements.txt
|
||||
|
||||
# 2. Run the test
|
||||
python test_rss_feeds.py
|
||||
|
||||
# 3. Or run the actual crawler
|
||||
python crawler_service.py 5
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "No module named 'pymongo'"
|
||||
- Activate the backend virtual environment first
|
||||
- Or install dependencies: `pip install -r backend/requirements.txt`
|
||||
|
||||
### "No RSS feeds in database"
|
||||
- Make sure backend is running
|
||||
- Add feeds via API (see above)
|
||||
- Or check if MongoDB is running: `docker-compose ps`
|
||||
|
||||
### "Could not extract URL"
|
||||
- The test will show available fields
|
||||
- Check if the feed uses `guid`, `id`, or `links` instead of `link`
|
||||
- Our utility should handle most cases automatically
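
The fallback idea is roughly the following (a sketch of the approach, not the exact code in the project's RSS utilities):

```python
def extract_article_url(entry):
    """Try the common feedparser fields in order of preference."""
    for field in ("link", "id", "guid"):
        value = entry.get(field)
        if isinstance(value, str) and value.startswith(("http://", "https://")):
            return value
    # Some feeds expose a list of link objects instead of a single link
    for link in entry.get("links", []):
        href = link.get("href", "")
        if href.startswith(("http://", "https://")):
            return href
    return None
```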
|
||||
|
||||
### "No entries found"
|
||||
- The RSS feed URL might be invalid
|
||||
- Try opening the URL in a browser
|
||||
- Check if it returns valid XML
|
||||
|
||||
## Manual Database Check
|
||||
|
||||
Using mongosh:
|
||||
|
||||
```bash
|
||||
mongosh
|
||||
use munich_news
|
||||
db.rss_feeds.find()
|
||||
db.articles.find().limit(3)
|
||||
```
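
The same check can be done from Python if the backend virtual environment is active (sketch, assuming the default local MongoDB):

```python
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017/")["munich_news"]

print("RSS feeds:")
for feed in db["rss_feeds"].find():
    print(" -", feed["name"], feed["url"])

print("Sample articles:")
for article in db["articles"].find().limit(3):
    print(" -", article.get("title"))
```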
|
||||
|
||||
## What to Look For
|
||||
|
||||
✅ **Good signs:**
|
||||
- URLs are extracted successfully
|
||||
- URLs start with `http://` or `https://`
|
||||
- Summaries are present
|
||||
- Dates are extracted
|
||||
|
||||
⚠️ **Warning signs:**
|
||||
- "Could not extract URL" messages
|
||||
- Empty summaries (not critical)
|
||||
- Missing dates (not critical)
|
||||
|
||||
❌ **Problems:**
|
||||
- No entries found in feed
|
||||
- All URL extractions fail
|
||||
- Feed parsing errors
|
||||
143
backend/DATABASE_SCHEMA.md
Normal file
@@ -0,0 +1,143 @@
|
||||
# MongoDB Database Schema
|
||||
|
||||
This document describes the MongoDB collections and their structure for Munich News Daily.
|
||||
|
||||
## Collections
|
||||
|
||||
### 1. Articles Collection (`articles`)
|
||||
|
||||
Stores all news articles aggregated from Munich news sources.
|
||||
|
||||
**Document Structure:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId, // Auto-generated MongoDB ID
|
||||
title: String, // Article title (required)
|
||||
author: String, // Article author (optional, extracted during crawl)
|
||||
link: String, // Article URL (required, unique)
|
||||
content: String, // Full article content (no length limit)
|
||||
summary: String, // AI-generated English summary (≤150 words)
|
||||
word_count: Number, // Word count of full content
|
||||
summary_word_count: Number, // Word count of AI summary
|
||||
source: String, // News source name (e.g., "Süddeutsche Zeitung München")
|
||||
published_at: String, // Original publication date from RSS feed or crawled
|
||||
crawled_at: DateTime, // When article content was crawled (UTC)
|
||||
summarized_at: DateTime, // When AI summary was generated (UTC)
|
||||
created_at: DateTime // When article was added to database (UTC)
|
||||
}
|
||||
```
|
||||
|
||||
**Indexes:**
|
||||
- `link` - Unique index to prevent duplicate articles
|
||||
- `created_at` - Index for efficient sorting by date
|
||||
|
||||
**Example Document:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId("507f1f77bcf86cd799439011"),
|
||||
title: "New U-Bahn Line Opens in Munich",
|
||||
author: "Max Mustermann",
|
||||
link: "https://www.sueddeutsche.de/muenchen/ubahn-1.123456",
|
||||
content: "The new U-Bahn line connecting the city center with the airport opened today. Mayor Dieter Reiter attended the opening ceremony... [full article text continues]",
|
||||
summary: "Munich's new U-Bahn line connecting the city center to the airport opened today with Mayor Dieter Reiter in attendance. The line features 10 stations and runs every 10 minutes during peak hours, significantly reducing travel time. Construction took five years and cost approximately 2 billion euros.",
|
||||
word_count: 1250,
|
||||
summary_word_count: 48,
|
||||
source: "Süddeutsche Zeitung München",
|
||||
published_at: "Mon, 15 Jan 2024 10:00:00 +0100",
|
||||
crawled_at: ISODate("2024-01-15T09:30:00.000Z"),
|
||||
summarized_at: ISODate("2024-01-15T09:30:15.000Z"),
|
||||
created_at: ISODate("2024-01-15T09:00:00.000Z")
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Subscribers Collection (`subscribers`)
|
||||
|
||||
Stores all newsletter subscribers.
|
||||
|
||||
**Document Structure:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId, // Auto-generated MongoDB ID
|
||||
email: String, // Subscriber email (required, unique, lowercase)
|
||||
subscribed_at: DateTime, // When user subscribed (UTC)
|
||||
status: String // Subscription status: 'active' or 'inactive'
|
||||
}
|
||||
```
|
||||
|
||||
**Indexes:**
|
||||
- `email` - Unique index for email lookups and preventing duplicates
|
||||
- `subscribed_at` - Index for analytics and sorting
|
||||
|
||||
**Example Document:**
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId("507f1f77bcf86cd799439012"),
|
||||
email: "user@example.com",
|
||||
subscribed_at: ISODate("2024-01-15T08:30:00.000Z"),
|
||||
status: "active"
|
||||
}
|
||||
```
|
||||
|
||||
## Design Decisions
|
||||
|
||||
### Why MongoDB?
|
||||
|
||||
1. **Flexibility**: Easy to add new fields without schema migrations
|
||||
2. **Scalability**: Handles large volumes of articles and subscribers efficiently
|
||||
3. **Performance**: Indexes on frequently queried fields (link, email, created_at)
|
||||
4. **Document Model**: Natural fit for news articles and subscriber data
|
||||
|
||||
### Schema Choices
|
||||
|
||||
1. **Unique Link Index**: Prevents duplicate articles from being stored, even if fetched multiple times
|
||||
2. **Status Field**: Soft delete for subscribers (set to 'inactive' instead of deleting) - allows for analytics and easy re-subscription
|
||||
3. **UTC Timestamps**: All dates stored in UTC for consistency across timezones
|
||||
4. **Lowercase Emails**: Emails stored in lowercase to prevent case-sensitivity issues
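
Points 2 and 4 translate into small PyMongo operations. A sketch of how unsubscribe and re-subscribe behave under this model (mirroring the logic in `routes/subscription_routes.py`):

```python
from datetime import datetime

from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017/")["munich_news"]

email = "User@Example.com".strip().lower()  # choice 4: store emails lowercase

# Unsubscribe = soft delete (choice 2): keep the document, just flip the status
db["subscribers"].update_one({"email": email}, {"$set": {"status": "inactive"}})

# Re-subscribing later reactivates the same document
db["subscribers"].update_one(
    {"email": email},
    {"$set": {"status": "active", "subscribed_at": datetime.utcnow()}},
)
```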
|
||||
|
||||
### Future Enhancements
|
||||
|
||||
Potential fields to add in the future:
|
||||
|
||||
**Articles:**
|
||||
- `category`: String (e.g., "politics", "sports", "culture")
|
||||
- `tags`: Array of Strings
|
||||
- `image_url`: String
|
||||
- `sent_in_newsletter`: Boolean (track if article was sent)
|
||||
- `sent_at`: DateTime (when article was included in newsletter)
|
||||
|
||||
**Subscribers:**
|
||||
- `preferences`: Object (newsletter frequency, categories, etc.)
|
||||
- `last_sent_at`: DateTime (last newsletter sent date)
|
||||
- `unsubscribed_at`: DateTime (when user unsubscribed)
|
||||
- `verification_token`: String (for email verification)
|
||||
|
||||
|
||||
|
||||
## AI Summarization Workflow
|
||||
|
||||
When the crawler processes an article:
|
||||
|
||||
1. **Extract Content**: Full article text is extracted from the webpage
|
||||
2. **Summarize with Ollama**: If `OLLAMA_ENABLED=true`, the content is sent to Ollama for summarization
|
||||
3. **Store Both**: Both the original `content` and AI-generated `summary` are stored
|
||||
4. **Fallback**: If Ollama is unavailable or fails, only the original content is stored
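
Condensed into code, the workflow looks roughly like this. It is a sketch of the control flow only: the crawler has its own client in `news_crawler/ollama_client.py`, and the exact prompt wording may differ; here the backend's `call_ollama` is reused for brevity.

```python
from datetime import datetime

from services.ollama_service import call_ollama


def summarize_and_store(articles_collection, doc, content):
    """Store crawled content, plus an AI summary when Ollama is available."""
    update = {
        "content": content,
        "word_count": len(content.split()),
        "crawled_at": datetime.utcnow(),
    }

    # Ask for an English summary of at most 150 words (see "Summary Field Details")
    prompt = f"Summarize the following article in English in 150 words or fewer:\n\n{content}"
    summary, error = call_ollama(prompt)

    if summary:  # steps 2-3: store both the content and the summary
        update["summary"] = summary
        update["summary_word_count"] = len(summary.split())
        update["summarized_at"] = datetime.utcnow()
    # step 4: if Ollama is disabled or fails, only the original content is stored

    articles_collection.update_one({"_id": doc["_id"]}, {"$set": update})
```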
|
||||
|
||||
### Summary Field Details
|
||||
|
||||
- **Language**: Always in English, regardless of source article language
|
||||
- **Length**: Maximum 150 words
|
||||
- **Format**: Plain text, concise and clear
|
||||
- **Purpose**: Quick preview for newsletters and frontend display
|
||||
|
||||
### Querying Articles
|
||||
|
||||
```javascript
|
||||
// Get articles with AI summaries
|
||||
db.articles.find({ summary: { $exists: true, $ne: null } })
|
||||
|
||||
// Get articles without summaries
|
||||
db.articles.find({ summary: { $exists: false } })
|
||||
|
||||
// Count summarized articles
|
||||
db.articles.countDocuments({ summary: { $exists: true, $ne: null } })
|
||||
```
|
||||
98
backend/STRUCTURE.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# Backend Structure
|
||||
|
||||
The backend has been modularized for better maintainability and scalability.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
backend/
|
||||
├── app.py # Main Flask application entry point
|
||||
├── config.py # Configuration management
|
||||
├── database.py # Database connection and initialization
|
||||
├── requirements.txt # Python dependencies
|
||||
├── .env # Environment variables
|
||||
│
|
||||
├── routes/ # API route handlers (blueprints)
|
||||
│ ├── __init__.py
|
||||
│ ├── subscription_routes.py # /api/subscribe, /api/unsubscribe
|
||||
│ ├── news_routes.py # /api/news, /api/stats
|
||||
│ ├── rss_routes.py # /api/rss-feeds (CRUD operations)
|
||||
│ └── ollama_routes.py # /api/ollama/* (AI features)
|
||||
│
|
||||
└── services/ # Business logic layer
|
||||
├── __init__.py
|
||||
├── news_service.py # News fetching and storage logic
|
||||
├── email_service.py # Newsletter email sending
|
||||
└── ollama_service.py # Ollama AI integration
|
||||
```
|
||||
|
||||
## Key Components
|
||||
|
||||
### app.py
|
||||
- Main Flask application
|
||||
- Registers all blueprints
|
||||
- Minimal code, just wiring things together
|
||||
|
||||
### config.py
|
||||
- Centralized configuration
|
||||
- Loads environment variables
|
||||
- Single source of truth for all settings
|
||||
|
||||
### database.py
|
||||
- MongoDB connection setup
|
||||
- Collection definitions
|
||||
- Database initialization with indexes
|
||||
|
||||
### routes/
|
||||
Each route file is a Flask Blueprint handling specific API endpoints:
|
||||
- **subscription_routes.py**: User subscription management
|
||||
- **news_routes.py**: News fetching and statistics
|
||||
- **rss_routes.py**: RSS feed management (add/remove/list/toggle)
|
||||
- **ollama_routes.py**: AI/Ollama integration endpoints
|
||||
|
||||
### services/
|
||||
Business logic separated from route handlers:
|
||||
- **news_service.py**: Fetches news from RSS feeds, saves to database
|
||||
- **email_service.py**: Sends newsletter emails to subscribers
|
||||
- **ollama_service.py**: Communicates with Ollama AI server
|
||||
|
||||
## Benefits of This Structure
|
||||
|
||||
1. **Separation of Concerns**: Routes handle HTTP, services handle business logic
|
||||
2. **Testability**: Each module can be tested independently
|
||||
3. **Maintainability**: Easy to find and modify specific functionality
|
||||
4. **Scalability**: Easy to add new routes or services
|
||||
5. **Reusability**: Services can be used by multiple routes
|
||||
|
||||
## Adding New Features
|
||||
|
||||
### To add a new API endpoint:
|
||||
1. Create a new route file in `routes/` or add to existing one
|
||||
2. Create a Blueprint and define routes
|
||||
3. Register the blueprint in `app.py`
|
||||
|
||||
### To add new business logic:
|
||||
1. Create a new service file in `services/`
|
||||
2. Import and use in your route handlers
|
||||
|
||||
### Example:
|
||||
```python
|
||||
# services/my_service.py
|
||||
def my_business_logic():
|
||||
return "Hello"
|
||||
|
||||
# routes/my_routes.py
|
||||
from flask import Blueprint
|
||||
from services.my_service import my_business_logic
|
||||
|
||||
my_bp = Blueprint('my', __name__)
|
||||
|
||||
@my_bp.route('/api/my-endpoint')
|
||||
def my_endpoint():
|
||||
result = my_business_logic()
|
||||
return {'message': result}
|
||||
|
||||
# app.py
|
||||
from routes.my_routes import my_bp
|
||||
app.register_blueprint(my_bp)
|
||||
```
|
||||
29
backend/app.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from flask import Flask
|
||||
from flask_cors import CORS
|
||||
from config import Config
|
||||
from database import init_db
|
||||
from routes.subscription_routes import subscription_bp
|
||||
from routes.news_routes import news_bp
|
||||
from routes.rss_routes import rss_bp
|
||||
from routes.ollama_routes import ollama_bp
|
||||
from routes.newsletter_routes import newsletter_bp
|
||||
|
||||
# Initialize Flask app
|
||||
app = Flask(__name__)
|
||||
CORS(app)
|
||||
|
||||
# Initialize database
|
||||
init_db()
|
||||
|
||||
# Register blueprints
|
||||
app.register_blueprint(subscription_bp)
|
||||
app.register_blueprint(news_bp)
|
||||
app.register_blueprint(rss_bp)
|
||||
app.register_blueprint(ollama_bp)
|
||||
app.register_blueprint(newsletter_bp)
|
||||
|
||||
# Print configuration
|
||||
Config.print_config()
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True, port=Config.FLASK_PORT, host='127.0.0.1')
|
||||
52
backend/config.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from pathlib import Path
|
||||
|
||||
# Get the directory where this script is located
|
||||
backend_dir = Path(__file__).parent
|
||||
env_path = backend_dir / '.env'
|
||||
|
||||
# Load .env file
|
||||
load_dotenv(dotenv_path=env_path)
|
||||
|
||||
# Debug: Print if .env file exists (for troubleshooting)
|
||||
if env_path.exists():
|
||||
print(f"✓ Loading .env file from: {env_path}")
|
||||
else:
|
||||
print(f"⚠ Warning: .env file not found at {env_path}")
|
||||
print(f" Current working directory: {os.getcwd()}")
|
||||
print(f" Looking for .env in: {env_path}")
|
||||
|
||||
|
||||
class Config:
|
||||
"""Application configuration"""
|
||||
|
||||
# MongoDB
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
# Email
|
||||
SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
|
||||
SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
|
||||
EMAIL_USER = os.getenv('EMAIL_USER', '')
|
||||
EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')
|
||||
|
||||
# Ollama
|
||||
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
||||
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
|
||||
OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
|
||||
OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
|
||||
|
||||
# Flask
|
||||
FLASK_PORT = int(os.getenv('FLASK_PORT', '5000'))
|
||||
|
||||
@classmethod
|
||||
def print_config(cls):
|
||||
"""Print configuration (without sensitive data)"""
|
||||
print("\nApplication Configuration:")
|
||||
print(f" MongoDB URI: {cls.MONGODB_URI}")
|
||||
print(f" Database: {cls.DB_NAME}")
|
||||
print(f" Flask Port: {cls.FLASK_PORT}")
|
||||
print(f" Ollama Base URL: {cls.OLLAMA_BASE_URL}")
|
||||
print(f" Ollama Model: {cls.OLLAMA_MODEL}")
|
||||
print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}")
|
||||
53
backend/database.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
from config import Config
|
||||
|
||||
# MongoDB setup
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
# Collections
|
||||
articles_collection = db['articles']
|
||||
subscribers_collection = db['subscribers']
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
|
||||
def init_db():
|
||||
"""Initialize database with indexes"""
|
||||
# Create unique index on article links to prevent duplicates
|
||||
articles_collection.create_index('link', unique=True)
|
||||
# Create index on created_at for faster sorting
|
||||
articles_collection.create_index('created_at')
|
||||
# Create unique index on subscriber emails
|
||||
subscribers_collection.create_index('email', unique=True)
|
||||
# Create index on subscribed_at
|
||||
subscribers_collection.create_index('subscribed_at')
|
||||
# Create unique index on RSS feed URLs
|
||||
rss_feeds_collection.create_index('url', unique=True)
|
||||
|
||||
# Initialize default RSS feeds if collection is empty
|
||||
if rss_feeds_collection.count_documents({}) == 0:
|
||||
default_feeds = [
|
||||
{
|
||||
'name': 'Süddeutsche Zeitung München',
|
||||
'url': 'https://www.sueddeutsche.de/muenchen/rss',
|
||||
'active': True,
|
||||
'created_at': datetime.utcnow()
|
||||
},
|
||||
{
|
||||
'name': 'Münchner Merkur',
|
||||
'url': 'https://www.merkur.de/muenchen/rss',
|
||||
'active': True,
|
||||
'created_at': datetime.utcnow()
|
||||
},
|
||||
{
|
||||
'name': 'Abendzeitung München',
|
||||
'url': 'https://www.abendzeitung-muenchen.de/rss',
|
||||
'active': True,
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
]
|
||||
rss_feeds_collection.insert_many(default_feeds)
|
||||
print(f"Initialized {len(default_feeds)} default RSS feeds")
|
||||
|
||||
print("Database initialized with indexes")
|
||||
32
backend/env.template
Normal file
@@ -0,0 +1,32 @@
|
||||
# MongoDB Configuration
|
||||
# For Docker Compose (no authentication):
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
# For Docker Compose with authentication:
|
||||
# MONGODB_URI=mongodb://admin:password@localhost:27017/
|
||||
# For MongoDB Atlas (cloud):
|
||||
# MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/
|
||||
|
||||
# Email Configuration (for sending newsletters)
|
||||
SMTP_SERVER=smtp.gmail.com
|
||||
SMTP_PORT=587
|
||||
EMAIL_USER=your-email@gmail.com
|
||||
EMAIL_PASSWORD=your-app-password
|
||||
# Note: For Gmail, use an App Password: https://support.google.com/accounts/answer/185833
|
||||
|
||||
# Ollama Configuration (for AI-powered features)
|
||||
# Remote Ollama server URL (e.g., http://your-server-ip:11434 or https://your-domain.com)
|
||||
OLLAMA_BASE_URL=http://localhost:11434
|
||||
# Optional: API key if your Ollama server requires authentication
|
||||
# OLLAMA_API_KEY=your-api-key-here
|
||||
# Model name to use (e.g., llama2, mistral, codellama, llama3, phi3:latest)
|
||||
OLLAMA_MODEL=phi3:latest
|
||||
# Enable/disable Ollama features (true/false)
|
||||
# When enabled, the crawler will automatically summarize articles in English (≤150 words)
|
||||
OLLAMA_ENABLED=true
|
||||
# Timeout for Ollama requests in seconds (default: 30)
|
||||
OLLAMA_TIMEOUT=30
|
||||
|
||||
# Flask Server Configuration
|
||||
# Port for Flask server (default: 5001 to avoid AirPlay conflict on macOS)
|
||||
FLASK_PORT=5001
|
||||
|
||||
61
backend/fix_duplicates.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Script to fix duplicate RSS feeds and create unique index
|
||||
Run this once: python fix_duplicates.py
|
||||
"""
|
||||
from pymongo import MongoClient
|
||||
from config import Config
|
||||
|
||||
# Connect to MongoDB
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
print("Fixing duplicate RSS feeds...")
|
||||
|
||||
# Get all feeds
|
||||
all_feeds = list(rss_feeds_collection.find())
|
||||
print(f"Total feeds found: {len(all_feeds)}")
|
||||
|
||||
# Find duplicates by URL
|
||||
seen_urls = {}
|
||||
duplicates_to_remove = []
|
||||
|
||||
for feed in all_feeds:
|
||||
url = feed.get('url')
|
||||
if url in seen_urls:
|
||||
# This is a duplicate, mark for removal
|
||||
duplicates_to_remove.append(feed['_id'])
|
||||
print(f" Duplicate found: {feed['name']} - {url}")
|
||||
else:
|
||||
# First occurrence, keep it
|
||||
seen_urls[url] = feed['_id']
|
||||
|
||||
# Remove duplicates
|
||||
if duplicates_to_remove:
|
||||
result = rss_feeds_collection.delete_many({'_id': {'$in': duplicates_to_remove}})
|
||||
print(f"Removed {result.deleted_count} duplicate feeds")
|
||||
else:
|
||||
print("No duplicates found")
|
||||
|
||||
# Drop existing indexes (if any)
|
||||
print("\nDropping existing indexes...")
|
||||
try:
|
||||
rss_feeds_collection.drop_indexes()
|
||||
print("Indexes dropped")
|
||||
except Exception as e:
|
||||
print(f"Note: {e}")
|
||||
|
||||
# Create unique index on URL
|
||||
print("\nCreating unique index on 'url' field...")
|
||||
rss_feeds_collection.create_index('url', unique=True)
|
||||
print("✓ Unique index created successfully")
|
||||
|
||||
# Verify
|
||||
remaining_feeds = list(rss_feeds_collection.find())
|
||||
print(f"\nFinal feed count: {len(remaining_feeds)}")
|
||||
print("\nRemaining feeds:")
|
||||
for feed in remaining_feeds:
|
||||
print(f" - {feed['name']}: {feed['url']}")
|
||||
|
||||
print("\n✓ Done! Duplicates removed and unique index created.")
|
||||
print("You can now restart your Flask app.")
|
||||
8
backend/requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
Flask==3.0.0
|
||||
flask-cors==4.0.0
|
||||
feedparser==6.0.10
|
||||
python-dotenv==1.0.0
|
||||
pymongo==4.6.1
|
||||
requests==2.31.0
|
||||
Jinja2==3.1.2
|
||||
|
||||
1
backend/routes/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Routes package
|
||||
123
backend/routes/news_routes.py
Normal file
@@ -0,0 +1,123 @@
|
||||
from flask import Blueprint, jsonify
|
||||
from database import articles_collection
|
||||
from services.news_service import fetch_munich_news, save_articles_to_db
|
||||
|
||||
news_bp = Blueprint('news', __name__)
|
||||
|
||||
|
||||
@news_bp.route('/api/news', methods=['GET'])
|
||||
def get_news():
|
||||
"""Get latest Munich news"""
|
||||
try:
|
||||
# Fetch fresh news and save to database
|
||||
articles = fetch_munich_news()
|
||||
save_articles_to_db(articles)
|
||||
|
||||
# Get articles from MongoDB, sorted by created_at (newest first)
|
||||
cursor = articles_collection.find().sort('created_at', -1).limit(20)
|
||||
|
||||
db_articles = []
|
||||
for doc in cursor:
|
||||
article = {
|
||||
'title': doc.get('title', ''),
|
||||
'author': doc.get('author'),
|
||||
'link': doc.get('link', ''),
|
||||
'source': doc.get('source', ''),
|
||||
'published': doc.get('published_at', ''),
|
||||
'word_count': doc.get('word_count'),
|
||||
'has_full_content': bool(doc.get('content')),
|
||||
'has_summary': bool(doc.get('summary'))
|
||||
}
|
||||
|
||||
# Include AI summary if available
|
||||
if doc.get('summary'):
|
||||
article['summary'] = doc.get('summary', '')
|
||||
article['summary_word_count'] = doc.get('summary_word_count')
|
||||
article['summarized_at'] = doc.get('summarized_at', '').isoformat() if doc.get('summarized_at') else None
|
||||
# Fallback: Include preview of content if no summary (first 200 chars)
|
||||
elif doc.get('content'):
|
||||
article['preview'] = doc.get('content', '')[:200] + '...'
|
||||
|
||||
db_articles.append(article)
|
||||
|
||||
# Combine fresh articles with database articles and deduplicate
|
||||
seen_links = set()
|
||||
combined = []
|
||||
|
||||
# Add fresh articles first (they're more recent)
|
||||
for article in articles:
|
||||
link = article.get('link', '')
|
||||
if link and link not in seen_links:
|
||||
seen_links.add(link)
|
||||
combined.append(article)
|
||||
|
||||
# Add database articles
|
||||
for article in db_articles:
|
||||
link = article.get('link', '')
|
||||
if link and link not in seen_links:
|
||||
seen_links.add(link)
|
||||
combined.append(article)
|
||||
|
||||
return jsonify({'articles': combined[:20]}), 200
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
|
||||
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
|
||||
def get_article_by_url(article_url):
|
||||
"""Get full article content by URL"""
|
||||
try:
|
||||
# Decode URL
|
||||
from urllib.parse import unquote
|
||||
decoded_url = unquote(article_url)
|
||||
|
||||
# Find article by link
|
||||
article = articles_collection.find_one({'link': decoded_url})
|
||||
|
||||
if not article:
|
||||
return jsonify({'error': 'Article not found'}), 404
|
||||
|
||||
return jsonify({
|
||||
'title': article.get('title', ''),
|
||||
'author': article.get('author'),
|
||||
'link': article.get('link', ''),
|
||||
'content': article.get('content', ''),
|
||||
'summary': article.get('summary'),
|
||||
'word_count': article.get('word_count', 0),
|
||||
'summary_word_count': article.get('summary_word_count'),
|
||||
'source': article.get('source', ''),
|
||||
'published_at': article.get('published_at', ''),
|
||||
'crawled_at': article.get('crawled_at', '').isoformat() if article.get('crawled_at') else None,
|
||||
'summarized_at': article.get('summarized_at', '').isoformat() if article.get('summarized_at') else None,
|
||||
'created_at': article.get('created_at', '').isoformat() if article.get('created_at') else None
|
||||
}), 200
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
|
||||
@news_bp.route('/api/stats', methods=['GET'])
|
||||
def get_stats():
|
||||
"""Get subscription statistics"""
|
||||
try:
|
||||
from database import subscribers_collection
|
||||
|
||||
# Count only active subscribers
|
||||
subscriber_count = subscribers_collection.count_documents({'status': 'active'})
|
||||
|
||||
# Also get total article count
|
||||
article_count = articles_collection.count_documents({})
|
||||
|
||||
# Count crawled articles
|
||||
crawled_count = articles_collection.count_documents({'content': {'$exists': True, '$ne': ''}})
|
||||
|
||||
# Count summarized articles
|
||||
summarized_count = articles_collection.count_documents({'summary': {'$exists': True, '$ne': ''}})
|
||||
|
||||
return jsonify({
|
||||
'subscribers': subscriber_count,
|
||||
'articles': article_count,
|
||||
'crawled_articles': crawled_count,
|
||||
'summarized_articles': summarized_count
|
||||
}), 200
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
62
backend/routes/newsletter_routes.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from flask import Blueprint, Response
|
||||
from pathlib import Path
|
||||
from jinja2 import Template
|
||||
from datetime import datetime
|
||||
from database import articles_collection
|
||||
|
||||
newsletter_bp = Blueprint('newsletter', __name__)
|
||||
|
||||
|
||||
@newsletter_bp.route('/api/newsletter/preview', methods=['GET'])
|
||||
def preview_newsletter():
|
||||
"""Preview the newsletter HTML (for testing)"""
|
||||
try:
|
||||
# Get latest articles with AI summaries
|
||||
cursor = articles_collection.find(
|
||||
{'summary': {'$exists': True, '$ne': None}}
|
||||
).sort('created_at', -1).limit(10)
|
||||
|
||||
articles = []
|
||||
for doc in cursor:
|
||||
articles.append({
|
||||
'title': doc.get('title', ''),
|
||||
'author': doc.get('author'),
|
||||
'link': doc.get('link', ''),
|
||||
'summary': doc.get('summary', ''),
|
||||
'source': doc.get('source', ''),
|
||||
'published_at': doc.get('published_at', '')
|
||||
})
|
||||
|
||||
if not articles:
|
||||
return Response(
|
||||
"<h1>No articles with summaries found</h1><p>Run the crawler with Ollama enabled first.</p>",
|
||||
mimetype='text/html'
|
||||
)
|
||||
|
||||
# Load template
|
||||
template_path = Path(__file__).parent.parent / 'templates' / 'newsletter_template.html'
|
||||
with open(template_path, 'r', encoding='utf-8') as f:
|
||||
template_content = f.read()
|
||||
|
||||
template = Template(template_content)
|
||||
|
||||
# Prepare data
|
||||
now = datetime.now()
|
||||
template_data = {
|
||||
'date': now.strftime('%A, %B %d, %Y'),
|
||||
'year': now.year,
|
||||
'article_count': len(articles),
|
||||
'articles': articles,
|
||||
'unsubscribe_link': 'http://localhost:3000/unsubscribe',
|
||||
'website_link': 'http://localhost:3000'
|
||||
}
|
||||
|
||||
# Render and return HTML
|
||||
html_content = template.render(**template_data)
|
||||
return Response(html_content, mimetype='text/html')
|
||||
|
||||
except Exception as e:
|
||||
return Response(
|
||||
f"<h1>Error</h1><p>{str(e)}</p>",
|
||||
mimetype='text/html'
|
||||
), 500
|
||||
158
backend/routes/ollama_routes.py
Normal file
@@ -0,0 +1,158 @@
|
||||
from flask import Blueprint, jsonify
|
||||
from config import Config
|
||||
from services.ollama_service import call_ollama, list_ollama_models
|
||||
import os
|
||||
|
||||
ollama_bp = Blueprint('ollama', __name__)
|
||||
|
||||
|
||||
@ollama_bp.route('/api/ollama/ping', methods=['GET', 'POST'])
|
||||
def ping_ollama():
|
||||
"""Test connection to Ollama server"""
|
||||
try:
|
||||
# Check if Ollama is enabled
|
||||
if not Config.OLLAMA_ENABLED:
|
||||
return jsonify({
|
||||
'status': 'disabled',
|
||||
'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': False
|
||||
}
|
||||
}), 200
|
||||
|
||||
# Send a simple test prompt
|
||||
test_prompt = "Say 'Hello! I am connected and working.' in one sentence."
|
||||
system_prompt = "You are a helpful assistant. Respond briefly and concisely."
|
||||
|
||||
response_text, error_message = call_ollama(test_prompt, system_prompt)
|
||||
|
||||
if response_text:
|
||||
return jsonify({
|
||||
'status': 'success',
|
||||
'message': 'Successfully connected to Ollama',
|
||||
'response': response_text,
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': True
|
||||
}
|
||||
}), 200
|
||||
else:
|
||||
# Try to get available models for better error message
|
||||
available_models, _ = list_ollama_models()
|
||||
|
||||
troubleshooting = {
|
||||
'check_server': f'Verify Ollama is running at {Config.OLLAMA_BASE_URL}',
|
||||
'check_model': f'Verify model "{Config.OLLAMA_MODEL}" is available (run: ollama list)',
|
||||
'test_connection': f'Test manually: curl {Config.OLLAMA_BASE_URL}/api/generate -d \'{{"model":"{Config.OLLAMA_MODEL}","prompt":"test"}}\''
|
||||
}
|
||||
|
||||
if available_models:
|
||||
troubleshooting['available_models'] = available_models
|
||||
troubleshooting['suggestion'] = f'Try setting OLLAMA_MODEL to one of: {", ".join(available_models[:5])}'
|
||||
|
||||
return jsonify({
|
||||
'status': 'error',
|
||||
'message': error_message or 'Failed to get response from Ollama',
|
||||
'error_details': error_message,
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': True
|
||||
},
|
||||
'troubleshooting': troubleshooting
|
||||
}), 500
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'status': 'error',
|
||||
'message': f'Error connecting to Ollama: {str(e)}',
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': Config.OLLAMA_ENABLED
|
||||
}
|
||||
}), 500
|
||||
|
||||
|
||||
@ollama_bp.route('/api/ollama/config', methods=['GET'])
|
||||
def get_ollama_config():
|
||||
"""Get current Ollama configuration (for debugging)"""
|
||||
try:
|
||||
from pathlib import Path
|
||||
backend_dir = Path(__file__).parent.parent
|
||||
env_path = backend_dir / '.env'
|
||||
|
||||
return jsonify({
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': Config.OLLAMA_ENABLED,
|
||||
'has_api_key': bool(Config.OLLAMA_API_KEY)
|
||||
},
|
||||
'env_file_path': str(env_path),
|
||||
'env_file_exists': env_path.exists(),
|
||||
'current_working_directory': os.getcwd()
|
||||
}), 200
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'error': str(e),
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': Config.OLLAMA_ENABLED
|
||||
}
|
||||
}), 500
|
||||
|
||||
|
||||
@ollama_bp.route('/api/ollama/models', methods=['GET'])
|
||||
def get_ollama_models():
|
||||
"""List available models on Ollama server"""
|
||||
try:
|
||||
if not Config.OLLAMA_ENABLED:
|
||||
return jsonify({
|
||||
'status': 'disabled',
|
||||
'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': False
|
||||
}
|
||||
}), 200
|
||||
|
||||
models, error_message = list_ollama_models()
|
||||
|
||||
if models is not None:
|
||||
return jsonify({
|
||||
'status': 'success',
|
||||
'models': models,
|
||||
'current_model': Config.OLLAMA_MODEL,
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': True
|
||||
}
|
||||
}), 200
|
||||
else:
|
||||
return jsonify({
|
||||
'status': 'error',
|
||||
'message': error_message or 'Failed to list models',
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': True
|
||||
}
|
||||
}), 500
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'status': 'error',
|
||||
'message': f'Error listing models: {str(e)}',
|
||||
'ollama_config': {
|
||||
'base_url': Config.OLLAMA_BASE_URL,
|
||||
'model': Config.OLLAMA_MODEL,
|
||||
'enabled': Config.OLLAMA_ENABLED
|
||||
}
|
||||
}), 500
|
||||
124
backend/routes/rss_routes.py
Normal file
@@ -0,0 +1,124 @@
|
||||
from flask import Blueprint, request, jsonify
|
||||
from datetime import datetime
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
from bson.objectid import ObjectId
|
||||
import feedparser
|
||||
from database import rss_feeds_collection
|
||||
|
||||
rss_bp = Blueprint('rss', __name__)
|
||||
|
||||
|
||||
@rss_bp.route('/api/rss-feeds', methods=['GET'])
|
||||
def get_rss_feeds():
|
||||
"""Get all RSS feeds"""
|
||||
try:
|
||||
cursor = rss_feeds_collection.find().sort('created_at', -1)
|
||||
feeds = []
|
||||
for feed in cursor:
|
||||
feeds.append({
|
||||
'id': str(feed['_id']),
|
||||
'name': feed.get('name', ''),
|
||||
'url': feed.get('url', ''),
|
||||
'active': feed.get('active', True),
|
||||
'created_at': feed.get('created_at', '').isoformat() if feed.get('created_at') else ''
|
||||
})
|
||||
return jsonify({'feeds': feeds}), 200
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
|
||||
@rss_bp.route('/api/rss-feeds', methods=['POST'])
|
||||
def add_rss_feed():
|
||||
"""Add a new RSS feed"""
|
||||
data = request.json
|
||||
name = data.get('name', '').strip()
|
||||
url = data.get('url', '').strip()
|
||||
|
||||
if not name or not url:
|
||||
return jsonify({'error': 'Name and URL are required'}), 400
|
||||
|
||||
if not url.startswith('http://') and not url.startswith('https://'):
|
||||
return jsonify({'error': 'URL must start with http:// or https://'}), 400
|
||||
|
||||
try:
|
||||
# Test if the RSS feed is valid
|
||||
try:
|
||||
feed = feedparser.parse(url)
|
||||
if not feed.entries:
|
||||
return jsonify({'error': 'Invalid RSS feed or no entries found'}), 400
|
||||
except Exception as e:
|
||||
return jsonify({'error': f'Failed to parse RSS feed: {str(e)}'}), 400
|
||||
|
||||
feed_doc = {
|
||||
'name': name,
|
||||
'url': url,
|
||||
'active': True,
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
try:
|
||||
result = rss_feeds_collection.insert_one(feed_doc)
|
||||
return jsonify({
|
||||
'message': 'RSS feed added successfully',
|
||||
'id': str(result.inserted_id)
|
||||
}), 201
|
||||
except DuplicateKeyError:
|
||||
return jsonify({'error': 'RSS feed URL already exists'}), 409
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
|
||||
@rss_bp.route('/api/rss-feeds/<feed_id>', methods=['DELETE'])
|
||||
def remove_rss_feed(feed_id):
|
||||
"""Remove an RSS feed"""
|
||||
try:
|
||||
# Validate ObjectId
|
||||
try:
|
||||
obj_id = ObjectId(feed_id)
|
||||
except Exception:
|
||||
return jsonify({'error': 'Invalid feed ID'}), 400
|
||||
|
||||
result = rss_feeds_collection.delete_one({'_id': obj_id})
|
||||
|
||||
if result.deleted_count > 0:
|
||||
return jsonify({'message': 'RSS feed removed successfully'}), 200
|
||||
else:
|
||||
return jsonify({'error': 'RSS feed not found'}), 404
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
|
||||
@rss_bp.route('/api/rss-feeds/<feed_id>/toggle', methods=['PATCH'])
|
||||
def toggle_rss_feed(feed_id):
|
||||
"""Toggle RSS feed active status"""
|
||||
try:
|
||||
# Validate ObjectId
|
||||
try:
|
||||
obj_id = ObjectId(feed_id)
|
||||
except Exception:
|
||||
return jsonify({'error': 'Invalid feed ID'}), 400
|
||||
|
||||
# Get current status
|
||||
feed = rss_feeds_collection.find_one({'_id': obj_id})
|
||||
if not feed:
|
||||
return jsonify({'error': 'RSS feed not found'}), 404
|
||||
|
||||
# Toggle status
|
||||
new_status = not feed.get('active', True)
|
||||
result = rss_feeds_collection.update_one(
|
||||
{'_id': obj_id},
|
||||
{'$set': {'active': new_status}}
|
||||
)
|
||||
|
||||
if result.modified_count > 0:
|
||||
return jsonify({
|
||||
'message': f'RSS feed {"activated" if new_status else "deactivated"} successfully',
|
||||
'active': new_status
|
||||
}), 200
|
||||
else:
|
||||
return jsonify({'error': 'Failed to update RSS feed'}), 500
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
63
backend/routes/subscription_routes.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from flask import Blueprint, request, jsonify
|
||||
from datetime import datetime
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
from database import subscribers_collection
|
||||
|
||||
subscription_bp = Blueprint('subscription', __name__)
|
||||
|
||||
|
||||
@subscription_bp.route('/api/subscribe', methods=['POST'])
|
||||
def subscribe():
|
||||
"""Subscribe a user to the newsletter"""
|
||||
data = request.json
|
||||
email = data.get('email', '').strip().lower()
|
||||
|
||||
if not email or '@' not in email:
|
||||
return jsonify({'error': 'Invalid email address'}), 400
|
||||
|
||||
try:
|
||||
subscriber_doc = {
|
||||
'email': email,
|
||||
'subscribed_at': datetime.utcnow(),
|
||||
'status': 'active'
|
||||
}
|
||||
|
||||
# Try to insert, if duplicate key error, subscriber already exists
|
||||
try:
|
||||
subscribers_collection.insert_one(subscriber_doc)
|
||||
return jsonify({'message': 'Successfully subscribed!'}), 201
|
||||
except DuplicateKeyError:
|
||||
# Check if subscriber is active
|
||||
existing = subscribers_collection.find_one({'email': email})
|
||||
if existing and existing.get('status') == 'active':
|
||||
return jsonify({'message': 'Email already subscribed'}), 200
|
||||
else:
|
||||
# Reactivate if previously unsubscribed
|
||||
subscribers_collection.update_one(
|
||||
{'email': email},
|
||||
{'$set': {'status': 'active', 'subscribed_at': datetime.utcnow()}}
|
||||
)
|
||||
return jsonify({'message': 'Successfully re-subscribed!'}), 200
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
|
||||
@subscription_bp.route('/api/unsubscribe', methods=['POST'])
|
||||
def unsubscribe():
|
||||
"""Unsubscribe a user from the newsletter"""
|
||||
data = request.json
|
||||
email = data.get('email', '').strip().lower()
|
||||
|
||||
try:
|
||||
result = subscribers_collection.update_one(
|
||||
{'email': email},
|
||||
{'$set': {'status': 'inactive'}}
|
||||
)
|
||||
|
||||
if result.matched_count > 0:
|
||||
return jsonify({'message': 'Successfully unsubscribed'}), 200
|
||||
else:
|
||||
return jsonify({'error': 'Email not found in subscribers'}), 404
|
||||
except Exception as e:
|
||||
return jsonify({'error': str(e)}), 500
|
||||
1
backend/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Services package
|
||||
88
backend/services/email_service.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import smtplib
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from jinja2 import Template
|
||||
from config import Config
|
||||
from database import subscribers_collection, articles_collection
|
||||
|
||||
|
||||
def send_newsletter(max_articles=10):
|
||||
"""Send newsletter to all subscribers with AI-summarized articles"""
|
||||
if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
|
||||
print("Email credentials not configured")
|
||||
return
|
||||
|
||||
# Get latest articles with AI summaries from database
|
||||
cursor = articles_collection.find(
|
||||
{'summary': {'$exists': True, '$ne': None}}
|
||||
).sort('created_at', -1).limit(max_articles)
|
||||
|
||||
articles = []
|
||||
for doc in cursor:
|
||||
articles.append({
|
||||
'title': doc.get('title', ''),
|
||||
'author': doc.get('author'),
|
||||
'link': doc.get('link', ''),
|
||||
'summary': doc.get('summary', ''),
|
||||
'source': doc.get('source', ''),
|
||||
'published_at': doc.get('published_at', '')
|
||||
})
|
||||
|
||||
if not articles:
|
||||
print("No articles with summaries to send")
|
||||
return
|
||||
|
||||
# Load email template
|
||||
template_path = Path(__file__).parent.parent / 'templates' / 'newsletter_template.html'
|
||||
with open(template_path, 'r', encoding='utf-8') as f:
|
||||
template_content = f.read()
|
||||
|
||||
template = Template(template_content)
|
||||
|
||||
# Prepare template data
|
||||
now = datetime.now()
|
||||
template_data = {
|
||||
'date': now.strftime('%A, %B %d, %Y'),
|
||||
'year': now.year,
|
||||
'article_count': len(articles),
|
||||
'articles': articles,
|
||||
'unsubscribe_link': 'http://localhost:3000', # Update with actual unsubscribe link
|
||||
'website_link': 'http://localhost:3000'
|
||||
}
|
||||
|
||||
# Render HTML
|
||||
html_content = template.render(**template_data)
|
||||
|
||||
# Get all active subscribers
|
||||
subscribers_cursor = subscribers_collection.find({'status': 'active'})
|
||||
subscribers = [doc['email'] for doc in subscribers_cursor]
|
||||
|
||||
# Send emails
|
||||
for subscriber in subscribers:
|
||||
try:
|
||||
msg = MIMEMultipart('alternative')
|
||||
msg['Subject'] = f'Munich News Daily - {datetime.now().strftime("%B %d, %Y")}'
|
||||
msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
|
||||
msg['To'] = subscriber
|
||||
msg['Date'] = datetime.now().strftime('%a, %d %b %Y %H:%M:%S %z')
|
||||
msg['Message-ID'] = f'<{datetime.now().timestamp()}.{subscriber}@dongho.kim>'
|
||||
msg['X-Mailer'] = 'Munich News Daily'
|
||||
|
||||
# Add plain text version as fallback
|
||||
plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
|
||||
msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
|
||||
|
||||
# Add HTML version
|
||||
msg.attach(MIMEText(html_content, 'html', 'utf-8'))
|
||||
|
||||
server = smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT)
|
||||
server.starttls()
|
||||
server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
|
||||
server.send_message(msg)
|
||||
server.quit()
|
||||
|
||||
print(f"Newsletter sent to {subscriber}")
|
||||
except Exception as e:
|
||||
print(f"Error sending to {subscriber}: {e}")
|
||||
90
backend/services/news_service.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import feedparser
|
||||
from datetime import datetime
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
from database import articles_collection, rss_feeds_collection
|
||||
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
|
||||
|
||||
def get_active_rss_feeds():
|
||||
"""Get all active RSS feeds from database"""
|
||||
feeds = []
|
||||
cursor = rss_feeds_collection.find({'active': True})
|
||||
for feed in cursor:
|
||||
feeds.append({
|
||||
'name': feed.get('name', ''),
|
||||
'url': feed.get('url', '')
|
||||
})
|
||||
return feeds
|
||||
|
||||
|
||||
def fetch_munich_news():
|
||||
"""Fetch news from Munich news sources"""
|
||||
articles = []
|
||||
|
||||
# Get RSS feeds from database instead of hardcoded list
|
||||
sources = get_active_rss_feeds()
|
||||
|
||||
for source in sources:
|
||||
try:
|
||||
feed = feedparser.parse(source['url'])
|
||||
for entry in feed.entries[:5]: # Get top 5 from each source
|
||||
# Extract article URL using utility function
|
||||
article_url = extract_article_url(entry)
|
||||
|
||||
if not article_url:
|
||||
print(f" ⚠ No valid URL for: {entry.get('title', 'Unknown')[:50]}")
|
||||
continue # Skip entries without valid URL
|
||||
|
||||
# Extract summary
|
||||
summary = extract_article_summary(entry)
|
||||
if summary:
|
||||
summary = summary[:200] + '...' if len(summary) > 200 else summary
|
||||
|
||||
articles.append({
|
||||
'title': entry.get('title', ''),
|
||||
'link': article_url,
|
||||
'summary': summary,
|
||||
'source': source['name'],
|
||||
'published': extract_published_date(entry)
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error fetching from {source['name']}: {e}")
|
||||
|
||||
return articles
|
||||
|
||||
|
||||
def save_articles_to_db(articles):
|
||||
"""Save articles to MongoDB, avoiding duplicates"""
|
||||
saved_count = 0
|
||||
|
||||
for article in articles:
|
||||
try:
|
||||
# Prepare article document
|
||||
article_doc = {
|
||||
'title': article.get('title', ''),
|
||||
'link': article.get('link', ''),
|
||||
'summary': article.get('summary', ''),
|
||||
'source': article.get('source', ''),
|
||||
'published_at': article.get('published', ''),
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
# Use update_one with upsert to handle duplicates
|
||||
# This will insert if link doesn't exist, or update if it does
|
||||
result = articles_collection.update_one(
|
||||
{'link': article_doc['link']},
|
||||
{'$setOnInsert': article_doc}, # Only set on insert, don't update existing
|
||||
upsert=True
|
||||
)
|
||||
|
||||
if result.upserted_id:
|
||||
saved_count += 1
|
||||
|
||||
except DuplicateKeyError:
|
||||
# Link already exists, skip
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"Error saving article {article.get('link', 'unknown')}: {e}")
|
||||
|
||||
if saved_count > 0:
|
||||
print(f"Saved {saved_count} new articles to database")
|
||||
96
backend/services/ollama_service.py
Normal file
@@ -0,0 +1,96 @@
|
||||
import requests
|
||||
from config import Config
|
||||
|
||||
|
||||
def list_ollama_models():
|
||||
"""List available models on Ollama server"""
|
||||
if not Config.OLLAMA_ENABLED:
|
||||
return None, "Ollama is not enabled"
|
||||
|
||||
try:
|
||||
url = f"{Config.OLLAMA_BASE_URL}/api/tags"
|
||||
headers = {}
|
||||
if Config.OLLAMA_API_KEY:
|
||||
headers["Authorization"] = f"Bearer {Config.OLLAMA_API_KEY}"
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
models = result.get('models', [])
|
||||
model_names = [model.get('name', '') for model in models]
|
||||
|
||||
return model_names, None
|
||||
except requests.exceptions.RequestException as e:
|
||||
return None, f"Error listing models: {str(e)}"
|
||||
except Exception as e:
|
||||
return None, f"Unexpected error: {str(e)}"
|
||||
|
||||
|
||||
def call_ollama(prompt, system_prompt=None):
|
||||
"""Call Ollama API to generate text"""
|
||||
if not Config.OLLAMA_ENABLED:
|
||||
return None, "Ollama is not enabled"
|
||||
|
||||
try:
|
||||
url = f"{Config.OLLAMA_BASE_URL}/api/generate"
|
||||
payload = {
|
||||
"model": Config.OLLAMA_MODEL,
|
||||
"prompt": prompt,
|
||||
"stream": False
|
||||
}
|
||||
|
||||
if system_prompt:
|
||||
payload["system"] = system_prompt
|
||||
|
||||
headers = {}
|
||||
if Config.OLLAMA_API_KEY:
|
||||
headers["Authorization"] = f"Bearer {Config.OLLAMA_API_KEY}"
|
||||
|
||||
print(f"Calling Ollama at {url} with model {Config.OLLAMA_MODEL}")
|
||||
response = requests.post(url, json=payload, headers=headers, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
response_text = result.get('response', '').strip()
|
||||
|
||||
if not response_text:
|
||||
return None, "Ollama returned empty response"
|
||||
|
||||
return response_text, None
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
error_msg = f"Cannot connect to Ollama server at {Config.OLLAMA_BASE_URL}. Is Ollama running?"
|
||||
print(f"Connection error: {error_msg}")
|
||||
return None, error_msg
|
||||
except requests.exceptions.Timeout:
|
||||
error_msg = "Request to Ollama timed out after 30 seconds"
|
||||
print(f"Timeout error: {error_msg}")
|
||||
return None, error_msg
|
||||
except requests.exceptions.HTTPError as e:
|
||||
# Check if it's a model not found error
|
||||
if e.response.status_code == 404:
|
||||
try:
|
||||
error_data = e.response.json()
|
||||
if 'model' in error_data.get('error', '').lower() and 'not found' in error_data.get('error', '').lower():
|
||||
# Try to get available models
|
||||
available_models, _ = list_ollama_models()
|
||||
if available_models:
|
||||
error_msg = f"Model '{Config.OLLAMA_MODEL}' not found. Available models: {', '.join(available_models)}"
|
||||
else:
|
||||
error_msg = f"Model '{Config.OLLAMA_MODEL}' not found. Use 'ollama list' on the server to see available models."
|
||||
else:
|
||||
error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
|
||||
except (ValueError, KeyError):
|
||||
error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
|
||||
else:
|
||||
error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
|
||||
print(f"HTTP error: {error_msg}")
|
||||
return None, error_msg
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Request error: {str(e)}"
|
||||
print(f"Request error: {error_msg}")
|
||||
return None, error_msg
|
||||
except Exception as e:
|
||||
error_msg = f"Unexpected error: {str(e)}"
|
||||
print(f"Unexpected error: {error_msg}")
|
||||
return None, error_msg
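

# Illustrative usage sketch (assumes the configured Ollama server is reachable):
# both helpers return a (result, error) tuple, so callers branch on the error value.
if __name__ == "__main__":
    models, err = list_ollama_models()
    print(f"Could not list models: {err}" if err else f"Available models: {models}")

    text, err = call_ollama(
        "Summarize today's Munich news in one sentence.",
        system_prompt="You are a concise news editor.",
    )
    print(err if err else text)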
|
||||
162
backend/templates/newsletter_template.html
Normal file
@@ -0,0 +1,162 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<title>Munich News Daily</title>
|
||||
<!--[if mso]>
|
||||
<style type="text/css">
|
||||
body, table, td {font-family: Arial, Helvetica, sans-serif !important;}
|
||||
</style>
|
||||
<![endif]-->
|
||||
</head>
|
||||
<body style="margin: 0; padding: 0; background-color: #f4f4f4; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;">
|
||||
<!-- Wrapper Table -->
|
||||
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f4f4f4;">
|
||||
<tr>
|
||||
<td align="center" style="padding: 20px 0;">
|
||||
<!-- Main Container -->
|
||||
<table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="background-color: #ffffff; max-width: 600px;">
|
||||
|
||||
<!-- Header -->
|
||||
<tr>
|
||||
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
|
||||
<h1 style="margin: 0 0 8px 0; font-size: 28px; font-weight: 700; color: #ffffff; letter-spacing: -0.5px;">
|
||||
Munich News Daily
|
||||
</h1>
|
||||
<p style="margin: 0; font-size: 14px; color: #999999; letter-spacing: 0.5px;">
|
||||
{{ date }}
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Greeting -->
|
||||
<tr>
|
||||
<td style="padding: 30px 40px 20px 40px;">
|
||||
<p style="margin: 0; font-size: 16px; line-height: 1.5; color: #333333;">
|
||||
Good morning ☀️
|
||||
</p>
|
||||
<p style="margin: 15px 0 0 0; font-size: 15px; line-height: 1.6; color: #666666;">
|
||||
Here's what's happening in Munich today. We've summarized {{ article_count }} stories using AI so you can stay informed in under 5 minutes.
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Divider -->
|
||||
<tr>
|
||||
<td style="padding: 0 40px;">
|
||||
<div style="height: 1px; background-color: #e0e0e0;"></div>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Articles -->
|
||||
{% for article in articles %}
|
||||
<tr>
|
||||
<td style="padding: 25px 40px;">
|
||||
<!-- Article Number Badge -->
|
||||
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
|
||||
<tr>
|
||||
<td>
|
||||
<span style="display: inline-block; background-color: #000000; color: #ffffff; width: 24px; height: 24px; line-height: 24px; text-align: center; border-radius: 50%; font-size: 12px; font-weight: 600;">
|
||||
{{ loop.index }}
|
||||
</span>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<!-- Article Title -->
|
||||
<h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
|
||||
{{ article.title }}
|
||||
</h2>
|
||||
|
||||
<!-- Article Meta -->
|
||||
<p style="margin: 0 0 12px 0; font-size: 13px; color: #999999;">
|
||||
<span style="color: #000000; font-weight: 600;">{{ article.source }}</span>
|
||||
{% if article.author %}
|
||||
<span> • {{ article.author }}</span>
|
||||
{% endif %}
|
||||
</p>
|
||||
|
||||
<!-- Article Summary -->
|
||||
<p style="margin: 0 0 15px 0; font-size: 15px; line-height: 1.6; color: #333333;">
|
||||
{{ article.summary }}
|
||||
</p>
|
||||
|
||||
<!-- Read More Link -->
|
||||
<a href="{{ article.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 14px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px;">
|
||||
Read more →
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Article Divider -->
|
||||
{% if not loop.last %}
|
||||
<tr>
|
||||
<td style="padding: 0 40px;">
|
||||
<div style="height: 1px; background-color: #f0f0f0;"></div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
<!-- Bottom Divider -->
|
||||
<tr>
|
||||
<td style="padding: 25px 40px 0 40px;">
|
||||
<div style="height: 1px; background-color: #e0e0e0;"></div>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Summary Box -->
|
||||
<tr>
|
||||
<td style="padding: 30px 40px;">
|
||||
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f8f8; border-radius: 8px;">
|
||||
<tr>
|
||||
<td style="padding: 25px; text-align: center;">
|
||||
<p style="margin: 0 0 8px 0; font-size: 13px; color: #666666; text-transform: uppercase; letter-spacing: 1px; font-weight: 600;">
|
||||
Today's Digest
|
||||
</p>
|
||||
<p style="margin: 0; font-size: 36px; font-weight: 700; color: #000000;">
|
||||
{{ article_count }}
|
||||
</p>
|
||||
<p style="margin: 8px 0 0 0; font-size: 14px; color: #666666;">
|
||||
stories • AI-summarized • 5 min read
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Footer -->
|
||||
<tr>
|
||||
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
|
||||
<p style="margin: 0 0 15px 0; font-size: 14px; color: #ffffff; font-weight: 600;">
|
||||
Munich News Daily
|
||||
</p>
|
||||
<p style="margin: 0 0 20px 0; font-size: 13px; color: #999999; line-height: 1.5;">
|
||||
AI-powered news summaries for busy people.<br>
|
||||
Delivered daily to your inbox.
|
||||
</p>
|
||||
|
||||
<!-- Footer Links -->
|
||||
<p style="margin: 0; font-size: 12px; color: #666666;">
|
||||
<a href="{{ website_link }}" style="color: #999999; text-decoration: none;">Visit Website</a>
|
||||
<span style="color: #444444;"> • </span>
|
||||
<a href="{{ unsubscribe_link }}" style="color: #999999; text-decoration: none;">Unsubscribe</a>
|
||||
</p>
|
||||
|
||||
<p style="margin: 20px 0 0 0; font-size: 11px; color: #666666;">
|
||||
© {{ year }} Munich News Daily. All rights reserved.
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
<!-- End Main Container -->
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<!-- End Wrapper Table -->
|
||||
</body>
|
||||
</html>
|
||||
128
backend/test_rss_extraction.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test RSS feed URL extraction
|
||||
Run from backend directory with venv activated:
|
||||
cd backend
|
||||
source venv/bin/activate # or venv\Scripts\activate on Windows
|
||||
python test_rss_extraction.py
|
||||
"""
|
||||
from pymongo import MongoClient
|
||||
from config import Config
|
||||
import feedparser
|
||||
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("RSS Feed URL Extraction Test")
|
||||
print("="*80)
|
||||
|
||||
# Connect to database
|
||||
print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
# Get RSS feeds
|
||||
print("Fetching RSS feeds from database...")
|
||||
feeds = list(db['rss_feeds'].find())
|
||||
|
||||
if not feeds:
|
||||
print("\n❌ No RSS feeds in database!")
|
||||
print("\nAdd a feed first:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
|
||||
exit(1)
|
||||
|
||||
print(f"✓ Found {len(feeds)} feed(s)\n")
|
||||
|
||||
# Test each feed
|
||||
total_success = 0
|
||||
total_fail = 0
|
||||
|
||||
for feed_doc in feeds:
|
||||
name = feed_doc.get('name', 'Unknown')
|
||||
url = feed_doc.get('url', '')
|
||||
active = feed_doc.get('active', True)
|
||||
|
||||
print("\n" + "="*80)
|
||||
print(f"Feed: {name}")
|
||||
print(f"URL: {url}")
|
||||
print(f"Active: {'Yes' if active else 'No'}")
|
||||
print("="*80)
|
||||
|
||||
if not active:
|
||||
print("⏭ Skipping (inactive)")
|
||||
continue
|
||||
|
||||
try:
|
||||
# Parse RSS
|
||||
print("\nFetching RSS feed...")
|
||||
feed = feedparser.parse(url)
|
||||
|
||||
if not feed.entries:
|
||||
print("❌ No entries found in feed")
|
||||
continue
|
||||
|
||||
print(f"✓ Found {len(feed.entries)} entries")
|
||||
|
||||
# Test first 3 entries
|
||||
print(f"\nTesting first 3 entries:")
|
||||
print("-" * 80)
|
||||
|
||||
for i, entry in enumerate(feed.entries[:3], 1):
|
||||
print(f"\n📰 Entry {i}:")
|
||||
|
||||
# Title
|
||||
title = entry.get('title', 'No title')
|
||||
print(f" Title: {title[:65]}")
|
||||
|
||||
# Test URL extraction
|
||||
article_url = extract_article_url(entry)
|
||||
if article_url:
|
||||
print(f" ✓ URL: {article_url}")
|
||||
total_success += 1
|
||||
else:
|
||||
print(f" ❌ Could not extract URL")
|
||||
print(f" Available fields: {list(entry.keys())[:10]}")
|
||||
print(f" link: {entry.get('link', 'N/A')}")
|
||||
print(f" guid: {entry.get('guid', 'N/A')}")
|
||||
print(f" id: {entry.get('id', 'N/A')}")
|
||||
total_fail += 1
|
||||
|
||||
# Test summary
|
||||
summary = extract_article_summary(entry)
|
||||
if summary:
|
||||
print(f" ✓ Summary: {summary[:70]}...")
|
||||
else:
|
||||
print(f" ⚠ No summary")
|
||||
|
||||
# Test date
|
||||
pub_date = extract_published_date(entry)
|
||||
if pub_date:
|
||||
print(f" ✓ Date: {pub_date}")
|
||||
else:
|
||||
print(f" ⚠ No date")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*80)
|
||||
print("SUMMARY")
|
||||
print("="*80)
|
||||
print(f"Total URLs tested: {total_success + total_fail}")
|
||||
print(f"✓ Successfully extracted: {total_success}")
|
||||
print(f"❌ Failed to extract: {total_fail}")
|
||||
|
||||
if total_fail == 0:
|
||||
print("\n🎉 All URLs extracted successfully!")
|
||||
print("\nYou can now run the crawler:")
|
||||
print(" cd ../news_crawler")
|
||||
print(" pip install -r requirements.txt")
|
||||
print(" python crawler_service.py 5")
|
||||
else:
|
||||
print(f"\n⚠ {total_fail} URL(s) could not be extracted")
|
||||
print("Check the output above for details")
|
||||
|
||||
print("="*80 + "\n")
|
||||
1
backend/utils/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Utils package
|
||||
98
backend/utils/rss_utils.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
Utility functions for RSS feed processing
|
||||
"""
|
||||
|
||||
|
||||
def extract_article_url(entry):
|
||||
"""
|
||||
Extract article URL from RSS entry.
|
||||
Different RSS feeds use different fields for the article URL.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Article URL or None if not found
|
||||
|
||||
Examples:
|
||||
- Most feeds use 'link'
|
||||
- Some use 'guid' as the URL
|
||||
- Some use 'id' as the URL
|
||||
- Some have guid as a dict with 'href'
|
||||
"""
|
||||
# Try 'link' first (most common)
|
||||
if entry.get('link') and entry.get('link', '').startswith('http'):
|
||||
return entry.get('link')
|
||||
|
||||
# Try 'guid' if it's a valid URL
|
||||
if entry.get('guid'):
|
||||
guid = entry.get('guid')
|
||||
# guid can be a string
|
||||
if isinstance(guid, str) and guid.startswith('http'):
|
||||
return guid
|
||||
# or a dict with 'href'
|
||||
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
|
||||
return guid.get('href')
|
||||
|
||||
# Try 'id' if it's a valid URL
|
||||
if entry.get('id') and entry.get('id', '').startswith('http'):
|
||||
return entry.get('id')
|
||||
|
||||
# Try 'links' array (some feeds have multiple links)
|
||||
if entry.get('links'):
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
# Prefer 'alternate' type, but accept any http link
|
||||
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
|
||||
return link.get('href')
|
||||
# If no alternate found, return first http link
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
return link.get('href')
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_article_summary(entry):
|
||||
"""
|
||||
Extract article summary/description from RSS entry.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Article summary or empty string
|
||||
"""
|
||||
# Try different fields
|
||||
if entry.get('summary'):
|
||||
return entry.get('summary', '')
|
||||
elif entry.get('description'):
|
||||
return entry.get('description', '')
|
||||
elif entry.get('content'):
|
||||
# content is usually a list of dicts
|
||||
content = entry.get('content', [])
|
||||
if content and isinstance(content, list) and len(content) > 0:
|
||||
return content[0].get('value', '')
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
def extract_published_date(entry):
|
||||
"""
|
||||
Extract published date from RSS entry.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Published date or empty string
|
||||
"""
|
||||
# Try different fields
|
||||
if entry.get('published'):
|
||||
return entry.get('published', '')
|
||||
elif entry.get('updated'):
|
||||
return entry.get('updated', '')
|
||||
elif entry.get('created'):
|
||||
return entry.get('created', '')
|
||||
|
||||
return ''
|
||||
33
docker-compose.prod.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
version: '3.8'
|
||||
|
||||
# Production version with authentication enabled
|
||||
# Usage: docker-compose -f docker-compose.prod.yml up -d
|
||||
|
||||
services:
|
||||
mongodb:
|
||||
image: mongo:7.0
|
||||
container_name: munich-news-mongodb
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "27017:27017"
|
||||
environment:
|
||||
MONGO_INITDB_ROOT_USERNAME: admin
|
||||
MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme}
|
||||
MONGO_INITDB_DATABASE: munich_news
|
||||
volumes:
|
||||
- mongodb_data:/data/db
|
||||
- mongodb_config:/data/configdb
|
||||
networks:
|
||||
- munich-news-network
|
||||
command: mongod --bind_ip_all --auth
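# NOTE: with --auth enabled, clients must authenticate. The backend's MONGODB_URI
# would then look roughly like this (database name assumed to match the dev setup):
# mongodb://admin:<MONGO_PASSWORD>@localhost:27017/munich_news?authSource=admin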
|
||||
|
||||
volumes:
|
||||
mongodb_data:
|
||||
driver: local
|
||||
mongodb_config:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
munich-news-network:
|
||||
driver: bridge
|
||||
|
||||
32
docker-compose.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
mongodb:
|
||||
image: mongo:7.0
|
||||
container_name: munich-news-mongodb
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "27017:27017"
|
||||
# For development: MongoDB runs without authentication
|
||||
# For production: Uncomment the environment variables below and update MONGODB_URI
|
||||
# environment:
|
||||
# MONGO_INITDB_ROOT_USERNAME: admin
|
||||
# MONGO_INITDB_ROOT_PASSWORD: password
|
||||
# MONGO_INITDB_DATABASE: munich_news
|
||||
volumes:
|
||||
- mongodb_data:/data/db
|
||||
- mongodb_config:/data/configdb
|
||||
networks:
|
||||
- munich-news-network
|
||||
command: mongod --bind_ip_all
|
||||
|
||||
volumes:
|
||||
mongodb_data:
|
||||
driver: local
|
||||
mongodb_config:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
munich-news-network:
|
||||
driver: bridge
|
||||
|
||||
1320
frontend/package-lock.json
generated
Normal file
File diff suppressed because it is too large
21
frontend/package.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"name": "munich-news-frontend",
|
||||
"version": "1.0.0",
|
||||
"description": "Munich News Email Platform Frontend",
|
||||
"main": "server.js",
|
||||
"scripts": {
|
||||
"start": "node server.js",
|
||||
"dev": "nodemon server.js"
|
||||
},
|
||||
"keywords": ["news", "munich", "email"],
|
||||
"author": "",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"express": "^4.18.2",
|
||||
"axios": "^1.6.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"nodemon": "^3.0.2"
|
||||
}
|
||||
}
|
||||
|
||||
170
frontend/public/app.js
Normal file
@@ -0,0 +1,170 @@
|
||||
// Load news on page load
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
loadNews();
|
||||
loadStats();
|
||||
});
|
||||
|
||||
async function loadNews() {
|
||||
const newsGrid = document.getElementById('newsGrid');
|
||||
newsGrid.innerHTML = '<div class="loading">Loading news...</div>';
|
||||
|
||||
try {
|
||||
const response = await fetch('/api/news');
|
||||
const data = await response.json();
|
||||
|
||||
if (data.articles && data.articles.length > 0) {
|
||||
displayNews(data.articles);
|
||||
} else {
|
||||
newsGrid.innerHTML = '<div class="loading">No news available at the moment. Check back later!</div>';
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error loading news:', error);
|
||||
newsGrid.innerHTML = '<div class="loading">Failed to load news. Please try again later.</div>';
|
||||
}
|
||||
}
|
||||
|
||||
function displayNews(articles) {
|
||||
const newsGrid = document.getElementById('newsGrid');
|
||||
newsGrid.innerHTML = '';
|
||||
|
||||
articles.forEach(article => {
|
||||
const card = document.createElement('div');
|
||||
card.className = 'news-card';
|
||||
card.onclick = () => window.open(article.link, '_blank');
|
||||
|
||||
card.innerHTML = `
|
||||
<div class="source">${article.source || 'Munich News'}</div>
|
||||
<h3>${article.title}</h3>
|
||||
<p>${article.summary || 'No summary available.'}</p>
|
||||
<a href="${article.link}" target="_blank" class="read-more" onclick="event.stopPropagation()">Read more →</a>
|
||||
`;
|
||||
|
||||
newsGrid.appendChild(card);
|
||||
});
|
||||
}
|
||||
|
||||
async function loadStats() {
|
||||
try {
|
||||
const response = await fetch('/api/stats');
|
||||
const data = await response.json();
|
||||
|
||||
if (data.subscribers !== undefined) {
|
||||
document.getElementById('subscriberCount').textContent = data.subscribers.toLocaleString();
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error loading stats:', error);
|
||||
}
|
||||
}
|
||||
|
||||
async function subscribe() {
|
||||
const emailInput = document.getElementById('emailInput');
|
||||
const subscribeBtn = document.getElementById('subscribeBtn');
|
||||
const formMessage = document.getElementById('formMessage');
|
||||
|
||||
const email = emailInput.value.trim();
|
||||
|
||||
if (!email || !email.includes('@')) {
|
||||
formMessage.textContent = 'Please enter a valid email address';
|
||||
formMessage.className = 'form-message error';
|
||||
return;
|
||||
}
|
||||
|
||||
subscribeBtn.disabled = true;
|
||||
subscribeBtn.textContent = 'Subscribing...';
|
||||
formMessage.textContent = '';
|
||||
|
||||
try {
|
||||
const response = await fetch('/api/subscribe', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({ email: email })
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (response.ok) {
|
||||
formMessage.textContent = data.message || 'Successfully subscribed! Check your email for confirmation.';
|
||||
formMessage.className = 'form-message success';
|
||||
emailInput.value = '';
|
||||
loadStats(); // Refresh stats
|
||||
} else {
|
||||
formMessage.textContent = data.error || 'Failed to subscribe. Please try again.';
|
||||
formMessage.className = 'form-message error';
|
||||
}
|
||||
} catch (error) {
|
||||
formMessage.textContent = 'Network error. Please try again later.';
|
||||
formMessage.className = 'form-message error';
|
||||
} finally {
|
||||
subscribeBtn.disabled = false;
|
||||
subscribeBtn.textContent = 'Subscribe Free';
|
||||
}
|
||||
}
|
||||
|
||||
// Allow Enter key to submit
|
||||
document.getElementById('emailInput').addEventListener('keypress', (e) => {
|
||||
if (e.key === 'Enter') {
|
||||
subscribe();
|
||||
}
|
||||
});
|
||||
|
||||
function showUnsubscribe() {
|
||||
document.getElementById('unsubscribeModal').style.display = 'block';
|
||||
}
|
||||
|
||||
function closeUnsubscribe() {
|
||||
document.getElementById('unsubscribeModal').style.display = 'none';
|
||||
document.getElementById('unsubscribeEmail').value = '';
|
||||
document.getElementById('unsubscribeMessage').textContent = '';
|
||||
}
|
||||
|
||||
async function unsubscribe() {
|
||||
const emailInput = document.getElementById('unsubscribeEmail');
|
||||
const unsubscribeMessage = document.getElementById('unsubscribeMessage');
|
||||
|
||||
const email = emailInput.value.trim();
|
||||
|
||||
if (!email || !email.includes('@')) {
|
||||
unsubscribeMessage.textContent = 'Please enter a valid email address';
|
||||
unsubscribeMessage.className = 'form-message error';
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch('/api/unsubscribe', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({ email: email })
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (response.ok) {
|
||||
unsubscribeMessage.textContent = data.message || 'Successfully unsubscribed.';
|
||||
unsubscribeMessage.className = 'form-message success';
|
||||
emailInput.value = '';
|
||||
setTimeout(() => {
|
||||
closeUnsubscribe();
|
||||
loadStats();
|
||||
}, 2000);
|
||||
} else {
|
||||
unsubscribeMessage.textContent = data.error || 'Failed to unsubscribe. Please try again.';
|
||||
unsubscribeMessage.className = 'form-message error';
|
||||
}
|
||||
} catch (error) {
|
||||
unsubscribeMessage.textContent = 'Network error. Please try again later.';
|
||||
unsubscribeMessage.className = 'form-message error';
|
||||
}
|
||||
}
|
||||
|
||||
// Close modal when clicking outside
|
||||
window.onclick = function(event) {
|
||||
const modal = document.getElementById('unsubscribeModal');
|
||||
if (event.target === modal) {
|
||||
closeUnsubscribe();
|
||||
}
|
||||
}
|
||||
|
||||
65
frontend/public/index.html
Normal file
@@ -0,0 +1,65 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Munich News Daily - Your Daily Dose of Munich News</title>
|
||||
<link rel="stylesheet" href="styles.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<header class="hero">
|
||||
<div class="hero-content">
|
||||
<h1>📰 Munich News Daily</h1>
|
||||
<p class="tagline">Get the latest Munich news delivered to your inbox every morning</p>
|
||||
<p class="description">Stay informed about what's happening in Munich with our curated daily newsletter. No fluff, just the news that matters.</p>
|
||||
|
||||
<div class="subscription-form" id="subscriptionForm">
|
||||
<input
|
||||
type="email"
|
||||
id="emailInput"
|
||||
placeholder="Enter your email address"
|
||||
required
|
||||
>
|
||||
<button id="subscribeBtn" onclick="subscribe()">Subscribe Free</button>
|
||||
<p class="form-message" id="formMessage"></p>
|
||||
</div>
|
||||
|
||||
<div class="stats">
|
||||
<div class="stat-item">
|
||||
<span class="stat-number" id="subscriberCount">-</span>
|
||||
<span class="stat-label">Subscribers</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<section class="news-section">
|
||||
<h2>Latest Munich News</h2>
|
||||
<div class="news-grid" id="newsGrid">
|
||||
<div class="loading">Loading news...</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer>
|
||||
<p>© 2024 Munich News Daily. Made with ❤️ for Munich.</p>
|
||||
<p><a href="#" onclick="showUnsubscribe()">Unsubscribe</a></p>
|
||||
</footer>
|
||||
</div>
|
||||
|
||||
<!-- Unsubscribe Modal -->
|
||||
<div class="modal" id="unsubscribeModal">
|
||||
<div class="modal-content">
|
||||
<span class="close" onclick="closeUnsubscribe()">×</span>
|
||||
<h2>Unsubscribe</h2>
|
||||
<p>Enter your email to unsubscribe from Munich News Daily:</p>
|
||||
<input type="email" id="unsubscribeEmail" placeholder="Enter your email">
|
||||
<button onclick="unsubscribe()">Unsubscribe</button>
|
||||
<p class="form-message" id="unsubscribeMessage"></p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script src="app.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
306
frontend/public/styles.css
Normal file
@@ -0,0 +1,306 @@
|
||||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
min-height: 100vh;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.hero {
|
||||
text-align: center;
|
||||
padding: 60px 20px;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.hero-content {
|
||||
max-width: 700px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
.hero h1 {
|
||||
font-size: 3.5rem;
|
||||
margin-bottom: 20px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.tagline {
|
||||
font-size: 1.5rem;
|
||||
margin-bottom: 15px;
|
||||
font-weight: 300;
|
||||
}
|
||||
|
||||
.description {
|
||||
font-size: 1.1rem;
|
||||
margin-bottom: 40px;
|
||||
opacity: 0.9;
|
||||
}
|
||||
|
||||
.subscription-form {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
max-width: 500px;
|
||||
margin: 0 auto 40px;
|
||||
}
|
||||
|
||||
.subscription-form input {
|
||||
padding: 15px 20px;
|
||||
font-size: 1rem;
|
||||
border: none;
|
||||
border-radius: 8px;
|
||||
outline: none;
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.subscription-form button {
|
||||
padding: 15px 30px;
|
||||
font-size: 1.1rem;
|
||||
font-weight: 600;
|
||||
background: #ff6b6b;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 8px;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.subscription-form button:hover {
|
||||
background: #ff5252;
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
|
||||
}
|
||||
|
||||
.subscription-form button:active {
|
||||
transform: translateY(0);
|
||||
}
|
||||
|
||||
.form-message {
|
||||
margin-top: 10px;
|
||||
font-size: 0.9rem;
|
||||
min-height: 20px;
|
||||
}
|
||||
|
||||
.form-message.success {
|
||||
color: #4caf50;
|
||||
}
|
||||
|
||||
.form-message.error {
|
||||
color: #f44336;
|
||||
}
|
||||
|
||||
.stats {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
gap: 40px;
|
||||
margin-top: 40px;
|
||||
}
|
||||
|
||||
.stat-item {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.stat-number {
|
||||
display: block;
|
||||
font-size: 2.5rem;
|
||||
font-weight: 700;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 0.9rem;
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
.news-section {
|
||||
background: white;
|
||||
border-radius: 20px;
|
||||
padding: 40px;
|
||||
margin: 40px 0;
|
||||
box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
|
||||
}
|
||||
|
||||
.news-section h2 {
|
||||
font-size: 2rem;
|
||||
margin-bottom: 30px;
|
||||
color: #333;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.news-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
|
||||
gap: 25px;
|
||||
}
|
||||
|
||||
.news-card {
|
||||
background: #f8f9fa;
|
||||
border-radius: 12px;
|
||||
padding: 20px;
|
||||
transition: all 0.3s ease;
|
||||
border-left: 4px solid #667eea;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.news-card:hover {
|
||||
transform: translateY(-5px);
|
||||
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.1);
|
||||
background: white;
|
||||
}
|
||||
|
||||
.news-card h3 {
|
||||
font-size: 1.2rem;
|
||||
margin-bottom: 10px;
|
||||
color: #333;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.news-card p {
|
||||
color: #666;
|
||||
font-size: 0.95rem;
|
||||
margin-bottom: 15px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.news-card .source {
|
||||
font-size: 0.85rem;
|
||||
color: #667eea;
|
||||
font-weight: 600;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.news-card .read-more {
|
||||
color: #667eea;
|
||||
text-decoration: none;
|
||||
font-weight: 600;
|
||||
font-size: 0.9rem;
|
||||
display: inline-block;
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.news-card .read-more:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
color: #666;
|
||||
font-size: 1.1rem;
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
footer {
|
||||
text-align: center;
|
||||
padding: 40px 20px;
|
||||
color: white;
|
||||
}
|
||||
|
||||
footer a {
|
||||
color: white;
|
||||
text-decoration: underline;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
footer a:hover {
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
/* Modal Styles */
|
||||
.modal {
|
||||
display: none;
|
||||
position: fixed;
|
||||
z-index: 1000;
|
||||
left: 0;
|
||||
top: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: rgba(0, 0, 0, 0.5);
|
||||
backdrop-filter: blur(5px);
|
||||
}
|
||||
|
||||
.modal-content {
|
||||
background-color: white;
|
||||
margin: 15% auto;
|
||||
padding: 30px;
|
||||
border-radius: 12px;
|
||||
width: 90%;
|
||||
max-width: 500px;
|
||||
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
|
||||
}
|
||||
|
||||
.close {
|
||||
color: #aaa;
|
||||
float: right;
|
||||
font-size: 28px;
|
||||
font-weight: bold;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.close:hover {
|
||||
color: #000;
|
||||
}
|
||||
|
||||
.modal-content h2 {
|
||||
margin-bottom: 20px;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.modal-content input {
|
||||
width: 100%;
|
||||
padding: 12px;
|
||||
margin: 15px 0;
|
||||
border: 2px solid #ddd;
|
||||
border-radius: 8px;
|
||||
font-size: 1rem;
|
||||
}
|
||||
|
||||
.modal-content button {
|
||||
width: 100%;
|
||||
padding: 12px;
|
||||
background: #ff6b6b;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 8px;
|
||||
font-size: 1rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.modal-content button:hover {
|
||||
background: #ff5252;
|
||||
}
|
||||
|
||||
/* Responsive Design */
|
||||
@media (max-width: 768px) {
|
||||
.hero h1 {
|
||||
font-size: 2.5rem;
|
||||
}
|
||||
|
||||
.tagline {
|
||||
font-size: 1.2rem;
|
||||
}
|
||||
|
||||
.news-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.stats {
|
||||
flex-direction: column;
|
||||
gap: 20px;
|
||||
}
|
||||
}
|
||||
|
||||
57
frontend/server.js
Normal file
@@ -0,0 +1,57 @@
|
||||
const express = require('express');
|
||||
const path = require('path');
|
||||
const axios = require('axios');
|
||||
|
||||
const app = express();
|
||||
const PORT = process.env.PORT || 3000;
|
||||
const API_URL = process.env.API_URL || 'http://localhost:5001';
|
||||
|
||||
// Serve static files
|
||||
app.use(express.static('public'));
|
||||
app.use(express.json());
|
||||
|
||||
// API proxy
|
||||
app.get('/api/news', async (req, res) => {
|
||||
try {
|
||||
const response = await axios.get(`${API_URL}/api/news`);
|
||||
res.json(response.data);
|
||||
} catch (error) {
|
||||
res.status(500).json({ error: 'Failed to fetch news' });
|
||||
}
|
||||
});
|
||||
|
||||
app.get('/api/stats', async (req, res) => {
|
||||
try {
|
||||
const response = await axios.get(`${API_URL}/api/stats`);
|
||||
res.json(response.data);
|
||||
} catch (error) {
|
||||
res.status(500).json({ error: 'Failed to fetch stats' });
|
||||
}
|
||||
});
|
||||
|
||||
app.post('/api/subscribe', async (req, res) => {
|
||||
try {
|
||||
const response = await axios.post(`${API_URL}/api/subscribe`, req.body);
|
||||
res.json(response.data);
|
||||
} catch (error) {
|
||||
res.status(error.response?.status || 500).json(
|
||||
error.response?.data || { error: 'Failed to subscribe' }
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
app.post('/api/unsubscribe', async (req, res) => {
|
||||
try {
|
||||
const response = await axios.post(`${API_URL}/api/unsubscribe`, req.body);
|
||||
res.json(response.data);
|
||||
} catch (error) {
|
||||
res.status(error.response?.status || 500).json(
|
||||
error.response?.data || { error: 'Failed to unsubscribe' }
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
app.listen(PORT, () => {
|
||||
console.log(`Frontend server running on http://localhost:${PORT}`);
|
||||
});
|
||||
|
||||
25
news_crawler/.gitignore
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
.venv
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
191
news_crawler/CHANGES.md
Normal file
@@ -0,0 +1,191 @@
|
||||
# Recent Changes - Full Content Storage
|
||||
|
||||
## ✅ What Changed
|
||||
|
||||
### 1. Removed Content Length Limit
|
||||
**Before:**
|
||||
```python
|
||||
'content': content_text[:10000] # Limited to 10k chars
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
'content': content_text # Full content, no limit
|
||||
```
|
||||
|
||||
### 2. Simplified Database Schema
|
||||
**Before:**
|
||||
```javascript
|
||||
{
|
||||
summary: String, // Short summary
|
||||
full_content: String // Limited content
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```javascript
|
||||
{
|
||||
content: String // Full article content, no limit
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Enhanced API Response
|
||||
**Before:**
|
||||
```javascript
|
||||
{
|
||||
title: "...",
|
||||
link: "...",
|
||||
summary: "..."
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```javascript
|
||||
{
|
||||
title: "...",
|
||||
author: "...", // NEW!
|
||||
link: "...",
|
||||
preview: "...", // First 200 chars
|
||||
word_count: 1250, // NEW!
|
||||
has_full_content: true // NEW!
|
||||
}
|
||||
```
|
||||
|
||||
## 📊 Database Structure
|
||||
|
||||
### Articles Collection
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId,
|
||||
title: String, // Article title
|
||||
author: String, // Article author (extracted)
|
||||
link: String, // Article URL (unique)
|
||||
content: String, // FULL article content (no limit)
|
||||
word_count: Number, // Word count
|
||||
source: String, // RSS feed name
|
||||
published_at: String, // Publication date
|
||||
crawled_at: DateTime, // When crawled
|
||||
created_at: DateTime // When added
|
||||
}
|
||||
```
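
The `link` field is treated as unique: the backend upserts on it and catches `DuplicateKeyError` when saving crawled articles. A minimal PyMongo sketch for creating that index, assuming the default local connection and the `munich_news` database used elsewhere in this project:

```python
from pymongo import MongoClient, ASCENDING

client = MongoClient("mongodb://localhost:27017/")
db = client["munich_news"]

# Unique index on the article URL so repeated crawls upsert instead of duplicating.
db.articles.create_index([("link", ASCENDING)], unique=True)
```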
|
||||
|
||||
## 🆕 New API Endpoint
|
||||
|
||||
### GET /api/news/<article_url>
|
||||
Get full article content by URL.
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
# URL encode the article URL
|
||||
curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"title": "New U-Bahn Line Opens in Munich",
|
||||
"author": "Max Mustermann",
|
||||
"link": "https://example.com/article",
|
||||
"content": "The full article text here... (complete, no truncation)",
|
||||
"word_count": 1250,
|
||||
"source": "Süddeutsche Zeitung München",
|
||||
"published_at": "2024-11-10T10:00:00Z",
|
||||
"crawled_at": "2024-11-10T16:30:00Z",
|
||||
"created_at": "2024-11-10T16:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
## 📈 Enhanced Stats
|
||||
|
||||
### GET /api/stats
|
||||
Now includes crawled article count:
|
||||
|
||||
```json
|
||||
{
|
||||
"subscribers": 150,
|
||||
"articles": 500,
|
||||
"crawled_articles": 350 // NEW!
|
||||
}
|
||||
```
|
||||
|
||||
## 🎯 Benefits
|
||||
|
||||
1. **Complete Content** - No truncation, full articles stored
|
||||
2. **Better for AI** - Full context for summarization/analysis
|
||||
3. **Cleaner Schema** - Single `content` field instead of `summary` + `full_content`
|
||||
4. **More Metadata** - Author, word count, crawl timestamp
|
||||
5. **Better API** - Preview in list, full content on demand
|
||||
|
||||
## 🔄 Migration
|
||||
|
||||
If you have existing articles with a `full_content` field, they will continue to work; new articles will use the `content` field.
|
||||
|
||||
To migrate old articles:
|
||||
```javascript
|
||||
// MongoDB shell
|
||||
db.articles.updateMany(
|
||||
{ full_content: { $exists: true } },
|
||||
[
|
||||
{
|
||||
$set: {
|
||||
content: "$full_content"
|
||||
}
|
||||
},
|
||||
{
|
||||
$unset: ["full_content", "summary"]
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## 🚀 Usage
|
||||
|
||||
### Crawl Articles
|
||||
```bash
|
||||
cd news_crawler
|
||||
python crawler_service.py 10
|
||||
```
|
||||
|
||||
### Get Article List (with previews)
|
||||
```bash
|
||||
curl http://localhost:5001/api/news
|
||||
```
|
||||
|
||||
### Get Full Article Content
|
||||
```bash
|
||||
# Get the article URL from the list, then:
|
||||
curl "http://localhost:5001/api/news/<encoded_url>"
|
||||
```
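
If you prefer Python for the encoding step, here is a small sketch (the example article URL is a placeholder):

```python
import requests
from urllib.parse import quote

article_url = "https://example.com/article"

# Encode the whole URL, including "/" and ":", so it survives as a single path segment.
encoded = quote(article_url, safe="")
response = requests.get(f"http://localhost:5001/api/news/{encoded}", timeout=10)
print(response.json()["title"])
```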
|
||||
|
||||
### Check Stats
|
||||
```bash
|
||||
curl http://localhost:5001/api/stats
|
||||
```
|
||||
|
||||
## 📝 Example Workflow
|
||||
|
||||
1. **Add RSS Feed**
|
||||
```bash
|
||||
curl -X POST http://localhost:5001/api/rss-feeds \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name": "News Source", "url": "https://example.com/rss"}'
|
||||
```
|
||||
|
||||
2. **Crawl Articles**
|
||||
```bash
|
||||
cd news_crawler
|
||||
python crawler_service.py 10
|
||||
```
|
||||
|
||||
3. **View Articles**
|
||||
```bash
|
||||
curl http://localhost:5001/api/news
|
||||
```
|
||||
|
||||
4. **Get Full Content**
|
||||
```bash
|
||||
# Copy article link from above, URL encode it
|
||||
curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
|
||||
```
|
||||
|
||||
Now you have complete article content ready for AI processing! 🎉
|
||||
13
news_crawler/Dockerfile
Normal file
@@ -0,0 +1,13 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy crawler service
|
||||
COPY crawler_service.py .
|
||||
|
||||
# Run crawler
|
||||
CMD ["python", "crawler_service.py"]
|
||||
353
news_crawler/EXTRACTION_STRATEGIES.md
Normal file
@@ -0,0 +1,353 @@
|
||||
# Content Extraction Strategies
|
||||
|
||||
The crawler uses multiple strategies to dynamically extract article metadata from any website.
|
||||
|
||||
## 🎯 What Gets Extracted
|
||||
|
||||
1. **Title** - Article headline
|
||||
2. **Author** - Article writer/journalist
|
||||
3. **Published Date** - When article was published
|
||||
4. **Content** - Main article text
|
||||
5. **Description** - Meta description/summary
|
||||
|
||||
## 📋 Extraction Strategies
|
||||
|
||||
### 1. Title Extraction
|
||||
|
||||
Tries multiple methods in order of reliability:
|
||||
|
||||
#### Strategy 1: H1 Tag
|
||||
```html
|
||||
<h1>Article Title Here</h1>
|
||||
```
|
||||
✅ Most reliable - usually the main headline
|
||||
|
||||
#### Strategy 2: Open Graph Meta Tag
|
||||
```html
|
||||
<meta property="og:title" content="Article Title Here" />
|
||||
```
|
||||
✅ Used by Facebook, very reliable
|
||||
|
||||
#### Strategy 3: Twitter Card Meta Tag
|
||||
```html
|
||||
<meta name="twitter:title" content="Article Title Here" />
|
||||
```
|
||||
✅ Used by Twitter, reliable
|
||||
|
||||
#### Strategy 4: Title Tag (Fallback)
|
||||
```html
|
||||
<title>Article Title | Site Name</title>
|
||||
```
|
||||
⚠️ Often includes site name, needs cleaning
|
||||
|
||||
**Cleaning:**
|
||||
- Removes " | Site Name"
|
||||
- Removes " - Site Name"
|
||||
|
||||
---
|
||||
|
||||
### 2. Author Extraction
|
||||
|
||||
Tries multiple methods:
|
||||
|
||||
#### Strategy 1: Meta Author Tag
|
||||
```html
|
||||
<meta name="author" content="John Doe" />
|
||||
```
|
||||
✅ Standard HTML meta tag
|
||||
|
||||
#### Strategy 2: Rel="author" Link
|
||||
```html
|
||||
<a rel="author" href="/author/john-doe">John Doe</a>
|
||||
```
|
||||
✅ Semantic HTML
|
||||
|
||||
#### Strategy 3: Common Class Names
|
||||
```html
|
||||
<div class="author-name">John Doe</div>
|
||||
<span class="byline">By John Doe</span>
|
||||
<p class="writer">John Doe</p>
|
||||
```
|
||||
✅ Searches for: author-name, author, byline, writer
|
||||
|
||||
#### Strategy 4: Schema.org Markup
|
||||
```html
|
||||
<span itemprop="author">John Doe</span>
|
||||
```
|
||||
✅ Structured data
|
||||
|
||||
#### Strategy 5: JSON-LD Structured Data
|
||||
```html
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@type": "NewsArticle",
|
||||
"author": {
|
||||
"@type": "Person",
|
||||
"name": "John Doe"
|
||||
}
|
||||
}
|
||||
</script>
|
||||
```
|
||||
✅ Most structured, very reliable
|
||||
|
||||
**Cleaning:**
|
||||
- Removes "By " prefix
|
||||
- Validates length (< 100 chars)
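
A compact sketch chaining most of the author strategies above (BeautifulSoup plus the standard `json` module; selectors follow the examples in this section):

```python
import json
from bs4 import BeautifulSoup

def extract_author(soup: BeautifulSoup):
    # 1. <meta name="author" content="...">
    meta = soup.find("meta", attrs={"name": "author"})
    if meta and meta.get("content"):
        return meta["content"].replace("By ", "").strip()

    # 2. rel="author" link or common byline class names
    elem = soup.find(rel="author") or soup.select_one(
        '[class*="author-name"], [class*="byline"], [class*="author"]'
    )
    if elem and 0 < len(elem.get_text(strip=True)) < 100:
        return elem.get_text(strip=True).replace("By ", "").strip()

    # 3. JSON-LD: {"author": {"name": "..."}} or {"author": "..."}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except ValueError:
            continue
        author = data.get("author") if isinstance(data, dict) else None
        if isinstance(author, dict):
            author = author.get("name")
        if isinstance(author, str) and author.strip():
            return author.replace("By ", "").strip()

    return None
```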
|
||||
|
||||
---
|
||||
|
||||
### 3. Date Extraction
|
||||
|
||||
Tries multiple methods:
|
||||
|
||||
#### Strategy 1: Time Tag with Datetime
|
||||
```html
|
||||
<time datetime="2024-11-10T10:00:00Z">November 10, 2024</time>
|
||||
```
|
||||
✅ Most reliable - ISO format
|
||||
|
||||
#### Strategy 2: Article Published Time Meta
|
||||
```html
|
||||
<meta property="article:published_time" content="2024-11-10T10:00:00Z" />
|
||||
```
|
||||
✅ Open Graph standard
|
||||
|
||||
#### Strategy 3: OG Published Time
|
||||
```html
|
||||
<meta property="og:published_time" content="2024-11-10T10:00:00Z" />
|
||||
```
|
||||
✅ Facebook standard
|
||||
|
||||
#### Strategy 4: Common Class Names
|
||||
```html
|
||||
<span class="publish-date">November 10, 2024</span>
|
||||
<time class="published">2024-11-10</time>
|
||||
<div class="timestamp">10:00 AM, Nov 10</div>
|
||||
```
|
||||
✅ Searches for: publish-date, published, date, timestamp
|
||||
|
||||
#### Strategy 5: Schema.org Markup
|
||||
```html
|
||||
<meta itemprop="datePublished" content="2024-11-10T10:00:00Z" />
|
||||
```
|
||||
✅ Structured data
|
||||
|
||||
#### Strategy 6: JSON-LD Structured Data
|
||||
```html
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@type": "NewsArticle",
|
||||
"datePublished": "2024-11-10T10:00:00Z"
|
||||
}
|
||||
</script>
|
||||
```
|
||||
✅ Most structured
|
||||
|
||||
---
|
||||
|
||||
### 4. Content Extraction
|
||||
|
||||
Tries multiple methods:
|
||||
|
||||
#### Strategy 1: Semantic HTML Tags
|
||||
```html
|
||||
<article>
|
||||
<p>Article content here...</p>
|
||||
</article>
|
||||
```
|
||||
✅ Best practice HTML5
|
||||
|
||||
#### Strategy 2: Common Class Names
|
||||
```html
|
||||
<div class="article-content">...</div>
|
||||
<div class="article-body">...</div>
|
||||
<div class="post-content">...</div>
|
||||
<div class="entry-content">...</div>
|
||||
<div class="story-body">...</div>
|
||||
```
|
||||
✅ Searches for common patterns
|
||||
|
||||
#### Strategy 3: Schema.org Markup
|
||||
```html
|
||||
<div itemprop="articleBody">
|
||||
<p>Content here...</p>
|
||||
</div>
|
||||
```
|
||||
✅ Structured data
|
||||
|
||||
#### Strategy 4: Main Tag
|
||||
```html
|
||||
<main>
|
||||
<p>Content here...</p>
|
||||
</main>
|
||||
```
|
||||
✅ Semantic HTML5
|
||||
|
||||
#### Strategy 5: Body Tag (Fallback)
|
||||
```html
|
||||
<body>
|
||||
<p>Content here...</p>
|
||||
</body>
|
||||
```
|
||||
⚠️ Last resort, may include navigation
|
||||
|
||||
**Content Filtering:**
|
||||
- Removes `<script>`, `<style>`, `<nav>`, `<footer>`, `<header>`, `<aside>`
|
||||
- Filters out short paragraphs (< 50 chars) - likely ads/navigation
|
||||
- Keeps only substantial paragraphs
|
||||
- **No length limit** - stores full article content
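
The filtering itself is small; a sketch of the cleanup described above (container lookup reduced to the `<article>`/`<main>`/`<body>` fallbacks):

```python
from bs4 import BeautifulSoup

def extract_main_content(soup: BeautifulSoup) -> str:
    # Strip page chrome that would otherwise pollute the article text.
    for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
        tag.decompose()

    container = soup.find("article") or soup.find("main") or soup.body or soup

    # Keep only substantial paragraphs; short ones are usually ads or navigation.
    paragraphs = [
        p.get_text(" ", strip=True)
        for p in container.find_all("p")
        if len(p.get_text(strip=True)) >= 50
    ]
    return "\n\n".join(paragraphs)  # full content, no length limit
```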
|
||||
|
||||
---
|
||||
|
||||
## 🔍 How It Works
|
||||
|
||||
### Example: Crawling a News Article
|
||||
|
||||
```python
|
||||
# 1. Fetch HTML
|
||||
response = requests.get(article_url)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# 2. Extract title (tries 4 strategies)
|
||||
title = extract_title(soup)
|
||||
# Result: "New U-Bahn Line Opens in Munich"
|
||||
|
||||
# 3. Extract author (tries 5 strategies)
|
||||
author = extract_author(soup)
|
||||
# Result: "Max Mustermann"
|
||||
|
||||
# 4. Extract date (tries 6 strategies)
|
||||
published_date = extract_date(soup)
|
||||
# Result: "2024-11-10T10:00:00Z"
|
||||
|
||||
# 5. Extract content (tries 5 strategies)
|
||||
content = extract_main_content(soup)
|
||||
# Result: "The new U-Bahn line connecting..."
|
||||
|
||||
# 6. Save to database
|
||||
article_doc = {
|
||||
'title': title,
|
||||
'author': author,
|
||||
'published_at': published_date,
|
||||
'full_content': content,
|
||||
'word_count': len(content.split())
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Success Rates by Strategy
|
||||
|
||||
Based on common news sites:
|
||||
|
||||
| Strategy | Success Rate | Notes |
|
||||
|----------|-------------|-------|
|
||||
| H1 for title | 95% | Almost universal |
|
||||
| OG meta tags | 90% | Most modern sites |
|
||||
| Time tag for date | 85% | HTML5 sites |
|
||||
| JSON-LD | 70% | Growing adoption |
|
||||
| Class name patterns | 60% | Varies by site |
|
||||
| Schema.org | 50% | Not widely adopted |
|
||||
|
||||
---
|
||||
|
||||
## 🎨 Real-World Examples
|
||||
|
||||
### Example 1: Süddeutsche Zeitung
|
||||
```html
|
||||
<article>
|
||||
<h1>New U-Bahn Line Opens</h1>
|
||||
<span class="author">Max Mustermann</span>
|
||||
<time datetime="2024-11-10T10:00:00Z">10. November 2024</time>
|
||||
<div class="article-body">
|
||||
<p>The new U-Bahn line...</p>
|
||||
</div>
|
||||
</article>
|
||||
```
|
||||
✅ Extracts: Title (H1), Author (class), Date (time), Content (article-body)
|
||||
|
||||
### Example 2: Medium Blog
|
||||
```html
|
||||
<article>
|
||||
<h1>How to Build a News Crawler</h1>
|
||||
<meta property="og:title" content="How to Build a News Crawler" />
|
||||
<meta property="article:published_time" content="2024-11-10T10:00:00Z" />
|
||||
<a rel="author" href="/author">Jane Smith</a>
|
||||
<section>
|
||||
<p>In this article...</p>
|
||||
</section>
|
||||
</article>
|
||||
```
|
||||
✅ Extracts: Title (H1, with OG meta as backup), Author (rel=author), Date (article meta), Content (section)
|
||||
|
||||
### Example 3: WordPress Blog
|
||||
```html
|
||||
<div class="post">
|
||||
<h1 class="entry-title">My Blog Post</h1>
|
||||
<span class="byline">By John Doe</span>
|
||||
<time class="published">November 10, 2024</time>
|
||||
<div class="entry-content">
|
||||
<p>Blog content here...</p>
|
||||
</div>
|
||||
</div>
|
||||
```
|
||||
✅ Extracts: Title (H1), Author (byline), Date (published), Content (entry-content)
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Edge Cases Handled
|
||||
|
||||
1. **Missing Fields**: Returns `None` instead of crashing
|
||||
2. **Multiple Authors**: Takes first one found
|
||||
3. **Relative Dates**: Stores as-is ("2 hours ago")
|
||||
4. **Paywalls**: Extracts what's available
|
||||
5. **JavaScript-rendered**: Only gets server-side HTML
|
||||
6. **Ads/Navigation**: Filtered out by paragraph length
|
||||
7. **Site Name in Title**: Cleaned automatically
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Future Improvements
|
||||
|
||||
Potential enhancements:
|
||||
|
||||
- [ ] JavaScript rendering (Selenium/Playwright)
|
||||
- [ ] Paywall bypass (where legal)
|
||||
- [ ] Image extraction
|
||||
- [ ] Video detection
|
||||
- [ ] Related articles
|
||||
- [ ] Tags/categories
|
||||
- [ ] Reading time estimation
|
||||
- [ ] Language detection
|
||||
- [ ] Sentiment analysis
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Testing
|
||||
|
||||
Test the extraction on a specific URL:
|
||||
|
||||
```python
|
||||
from crawler_service import extract_article_content
|
||||
|
||||
url = "https://www.sueddeutsche.de/muenchen/article-123"
|
||||
data = extract_article_content(url)
|
||||
|
||||
print(f"Title: {data['title']}")
|
||||
print(f"Author: {data['author']}")
|
||||
print(f"Date: {data['published_date']}")
|
||||
print(f"Content length: {len(data['content'])} chars")
|
||||
print(f"Word count: {data['word_count']}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📚 Standards Supported
|
||||
|
||||
- ✅ HTML5 semantic tags
|
||||
- ✅ Open Graph Protocol
|
||||
- ✅ Twitter Cards
|
||||
- ✅ Schema.org microdata
|
||||
- ✅ JSON-LD structured data
|
||||
- ✅ Dublin Core metadata
|
||||
- ✅ Common CSS class patterns
|
||||
306
news_crawler/HOW_IT_WORKS.md
Normal file
@@ -0,0 +1,306 @@
|
||||
# How the News Crawler Works
|
||||
|
||||
## 🎯 Overview
|
||||
|
||||
The crawler dynamically extracts article metadata from any website using multiple fallback strategies.
|
||||
|
||||
## 📊 Flow Diagram
|
||||
|
||||
```
|
||||
RSS Feed URL
|
||||
↓
|
||||
Parse RSS Feed
|
||||
↓
|
||||
For each article link:
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 1. Fetch HTML Page │
|
||||
│ GET https://example.com/article │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 2. Parse with BeautifulSoup │
|
||||
│ soup = BeautifulSoup(html) │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 3. Clean HTML │
|
||||
│ Remove: scripts, styles, nav, │
|
||||
│ footer, header, ads │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 4. Extract Title │
|
||||
│ Try: H1 → OG meta → Twitter → │
|
||||
│ Title tag │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 5. Extract Author │
|
||||
│ Try: Meta author → rel=author → │
|
||||
│ Class names → JSON-LD │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 6. Extract Date │
|
||||
│ Try: <time> → Meta tags → │
|
||||
│ Class names → JSON-LD │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 7. Extract Content │
|
||||
│ Try: <article> → Class names → │
|
||||
│ <main> → <body> │
|
||||
│ Filter short paragraphs │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 8. Save to MongoDB │
|
||||
│ { │
|
||||
│ title, author, date, │
|
||||
│ content, word_count │
|
||||
│ } │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
Wait 1 second (rate limiting)
|
||||
↓
|
||||
Next article
|
||||
```
|
||||
|
||||
## 🔍 Detailed Example
|
||||
|
||||
### Input: RSS Feed Entry
|
||||
```xml
|
||||
<item>
|
||||
<title>New U-Bahn Line Opens</title>
|
||||
<link>https://www.sueddeutsche.de/muenchen/article-123</link>
|
||||
<pubDate>Mon, 10 Nov 2024 10:00:00 +0100</pubDate>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Step 1: Fetch HTML
|
||||
```python
|
||||
url = "https://www.sueddeutsche.de/muenchen/article-123"
|
||||
response = requests.get(url)
|
||||
html = response.content
|
||||
```
|
||||
|
||||
### Step 2: Parse HTML
|
||||
```python
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
```
|
||||
|
||||
### Step 3: Extract Title
|
||||
```python
|
||||
# Try H1
|
||||
h1 = soup.find('h1')
|
||||
# Result: "New U-Bahn Line Opens in Munich"
|
||||
|
||||
# If no H1, try OG meta
|
||||
og_title = soup.find('meta', property='og:title')
|
||||
# Fallback chain continues...
|
||||
```
|
||||
|
||||
### Step 4: Extract Author
|
||||
```python
|
||||
# Try meta author
|
||||
meta_author = soup.find('meta', attrs={'name': 'author'})
|
||||
# Result: None
|
||||
|
||||
# Try class names
|
||||
author_elem = soup.select_one('[class*="author"]')
|
||||
# Result: "Max Mustermann"
|
||||
```
|
||||
|
||||
### Step 5: Extract Date
|
||||
```python
|
||||
# Try time tag
|
||||
time_tag = soup.find('time')
|
||||
# Result: "2024-11-10T10:00:00Z"
|
||||
```
|
||||
|
||||
### Step 6: Extract Content
|
||||
```python
|
||||
# Try article tag
|
||||
article = soup.find('article')
|
||||
paragraphs = article.find_all('p')
|
||||
|
||||
# Filter paragraphs
|
||||
content = []
|
||||
for p in paragraphs:
|
||||
text = p.get_text().strip()
|
||||
if len(text) >= 50: # Keep substantial paragraphs
|
||||
content.append(text)
|
||||
|
||||
full_content = '\n\n'.join(content)
|
||||
# Result: "The new U-Bahn line connecting the city center..."
|
||||
```
|
||||
|
||||
### Step 7: Save to Database
|
||||
```python
|
||||
article_doc = {
|
||||
'title': 'New U-Bahn Line Opens in Munich',
|
||||
'author': 'Max Mustermann',
|
||||
'link': 'https://www.sueddeutsche.de/muenchen/article-123',
|
||||
'summary': 'Short summary from RSS...',
|
||||
'full_content': 'The new U-Bahn line connecting...',
|
||||
'word_count': 1250,
|
||||
'source': 'Süddeutsche Zeitung München',
|
||||
'published_at': '2024-11-10T10:00:00Z',
|
||||
'crawled_at': datetime.utcnow(),
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
db.articles.update_one(
|
||||
{'link': article_url},
|
||||
{'$set': article_doc},
|
||||
upsert=True
|
||||
)
|
||||
```
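
Putting the steps together, the per-feed loop looks roughly like this (`extract_article_content` is the function shown in the Testing section of EXTRACTION_STRATEGIES.md; `save_article` is a stand-in name for the step-8 upsert; the 1-second pause is the rate limiting from the flow diagram):

```python
import time
import feedparser

def crawl_feed(feed_url: str, max_articles: int = 10) -> int:
    saved = 0
    feed = feedparser.parse(feed_url)
    for entry in feed.entries[:max_articles]:
        data = extract_article_content(entry.link)  # steps 1-7 above
        if data:
            save_article(data)  # step 8 upsert (helper name assumed)
            saved += 1
        time.sleep(1)  # rate limiting: one request per second
    return saved
```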
|
||||
|
||||
## 🎨 What Makes It "Dynamic"?
|
||||
|
||||
### Traditional Approach (Hardcoded)
|
||||
```python
|
||||
# Only works for one specific site
|
||||
title = soup.find('h1', class_='article-title').text
|
||||
author = soup.find('span', class_='author-name').text
|
||||
```
|
||||
❌ Breaks when site changes
|
||||
❌ Doesn't work on other sites
|
||||
|
||||
### Our Approach (Dynamic)
|
||||
```python
|
||||
# Works on ANY site
|
||||
title = extract_title(soup) # Tries 4 different methods
|
||||
author = extract_author(soup) # Tries 5 different methods
|
||||
```
|
||||
✅ Adapts to different HTML structures
|
||||
✅ Falls back to alternatives
|
||||
✅ Works across multiple sites
|
||||
|
||||
## 🛡️ Robustness Features
|
||||
|
||||
### 1. Multiple Strategies
|
||||
Each field has 4-6 extraction strategies
|
||||
```python
|
||||
def extract_title(soup):
|
||||
# Try strategy 1
|
||||
if h1 := soup.find('h1'):
|
||||
return h1.text
|
||||
|
||||
# Try strategy 2
|
||||
if og_title := soup.find('meta', property='og:title'):
|
||||
return og_title['content']
|
||||
|
||||
# Try strategy 3...
|
||||
# Try strategy 4...
|
||||
```
|
||||
|
||||
### 2. Validation
|
||||
```python
|
||||
# Title must be reasonable length
|
||||
if title and len(title) > 10:
|
||||
return title
|
||||
|
||||
# Author must be < 100 chars
|
||||
if author and len(author) < 100:
|
||||
return author
|
||||
```
|
||||
|
||||
### 3. Cleaning
|
||||
```python
|
||||
# Remove site name from title
|
||||
if ' | ' in title:
|
||||
title = title.split(' | ')[0]
|
||||
|
||||
# Remove "By" from author
|
||||
author = author.replace('By ', '').strip()
|
||||
```
|
||||
|
||||
### 4. Error Handling
|
||||
```python
|
||||
try:
|
||||
data = extract_article_content(url)
|
||||
except Timeout:
|
||||
print("Timeout - skip")
|
||||
except RequestException:
|
||||
print("Network error - skip")
|
||||
except Exception:
|
||||
print("Unknown error - skip")
|
||||
```
|
||||
|
||||
## 📈 Success Metrics
|
||||
|
||||
After crawling, you'll see:
|
||||
|
||||
```
|
||||
📰 Crawling feed: Süddeutsche Zeitung München
|
||||
🔍 Crawling: New U-Bahn Line Opens...
|
||||
✓ Saved (1250 words)
|
||||
|
||||
Title: ✓ Found
|
||||
Author: ✓ Found (Max Mustermann)
|
||||
Date: ✓ Found (2024-11-10T10:00:00Z)
|
||||
Content: ✓ Found (1250 words)
|
||||
```
|
||||
|
||||
## 🗄️ Database Result
|
||||
|
||||
**Before Crawling:**
|
||||
```javascript
|
||||
{
|
||||
title: "New U-Bahn Line Opens",
|
||||
link: "https://example.com/article",
|
||||
summary: "Short RSS summary...",
|
||||
source: "Süddeutsche Zeitung"
|
||||
}
|
||||
```
|
||||
|
||||
**After Crawling:**
|
||||
```javascript
|
||||
{
|
||||
title: "New U-Bahn Line Opens in Munich", // ← Enhanced
|
||||
author: "Max Mustermann", // ← NEW!
|
||||
link: "https://example.com/article",
|
||||
summary: "Short RSS summary...",
|
||||
full_content: "The new U-Bahn line...", // ← NEW! (1250 words)
|
||||
word_count: 1250, // ← NEW!
|
||||
source: "Süddeutsche Zeitung",
|
||||
published_at: "2024-11-10T10:00:00Z", // ← Enhanced
|
||||
crawled_at: ISODate("2024-11-10T16:30:00Z"), // ← NEW!
|
||||
created_at: ISODate("2024-11-10T16:00:00Z")
|
||||
}
|
||||
```
|
||||
|
||||
## 🚀 Running the Crawler
|
||||
|
||||
```bash
|
||||
cd news_crawler
|
||||
pip install -r requirements.txt
|
||||
python crawler_service.py 10
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
============================================================
|
||||
🚀 Starting RSS Feed Crawler
|
||||
============================================================
|
||||
Found 3 active feed(s)
|
||||
|
||||
📰 Crawling feed: Süddeutsche Zeitung München
|
||||
🔍 Crawling: New U-Bahn Line Opens...
|
||||
✓ Saved (1250 words)
|
||||
🔍 Crawling: Munich Weather Update...
|
||||
✓ Saved (450 words)
|
||||
✓ Crawled 2 articles
|
||||
|
||||
============================================================
|
||||
✓ Crawling Complete!
|
||||
Total feeds processed: 3
|
||||
Total articles crawled: 15
|
||||
Duration: 45.23 seconds
|
||||
============================================================
|
||||
```
|
||||
|
||||
Now you have rich, structured article data ready for AI processing! 🎉
|
||||
127
news_crawler/QUICKSTART.md
Normal file
@@ -0,0 +1,127 @@
|
||||
# News Crawler - Quick Start
|
||||
|
||||
## 1. Install Dependencies
|
||||
|
||||
```bash
|
||||
cd news_crawler
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 2. Configure Environment
|
||||
|
||||
Make sure MongoDB is running and accessible. The crawler will use the same database as the backend.
|
||||
|
||||
Default connection: `mongodb://localhost:27017/`
|
||||
|
||||
To use a different MongoDB URI, create a `.env` file:
|
||||
```env
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
```
|
||||
|
||||
## 3. Run the Crawler
|
||||
|
||||
```bash
|
||||
# Crawl up to 10 articles per feed
|
||||
python crawler_service.py
|
||||
|
||||
# Crawl up to 20 articles per feed
|
||||
python crawler_service.py 20
|
||||
```
|
||||
|
||||
## 4. Verify Results
|
||||
|
||||
Check your MongoDB database:
|
||||
|
||||
```bash
|
||||
# Using mongosh
|
||||
mongosh
|
||||
use munich_news
|
||||
db.articles.find({full_content: {$exists: true}}).count()
|
||||
db.articles.findOne({full_content: {$exists: true}})
|
||||
```
|
||||
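If you prefer Python over mongosh, the same checks can be run with pymongo (a short sketch assuming the default local connection string and the `munich_news` database used by the crawler):

```python
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['munich_news']

# Count articles that already have crawled content
crawled = db.articles.count_documents({'full_content': {'$exists': True}})
print(f"Articles with full content: {crawled}")

# Peek at one crawled article
sample = db.articles.find_one({'full_content': {'$exists': True}})
if sample:
    print(sample.get('title'), '-', sample.get('word_count'), 'words')
```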
|
||||
## 5. Schedule Regular Crawling
|
||||
|
||||
### Option A: Cron (Linux/Mac)
|
||||
|
||||
```bash
|
||||
# Edit crontab
|
||||
crontab -e
|
||||
|
||||
# Add this line to run every 6 hours
|
||||
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
|
||||
```
|
||||
|
||||
### Option B: Docker
|
||||
|
||||
```bash
|
||||
# Build and run
|
||||
docker-compose up
|
||||
|
||||
# Or run as a one-off
|
||||
docker-compose run --rm crawler
|
||||
```
|
||||
|
||||
### Option C: Manual
|
||||
|
||||
Just run the script whenever you want to fetch new articles:
|
||||
|
||||
```bash
|
||||
python crawler_service.py
|
||||
```
|
||||
|
||||
## What Gets Crawled?
|
||||
|
||||
The crawler:
|
||||
1. Fetches all active RSS feeds from the database
|
||||
2. For each feed, gets the latest articles
|
||||
3. Crawls the full content from each article URL
|
||||
4. Saves: title, full_content, word_count, crawled_at
|
||||
5. Skips articles that already have content
|
||||
|
||||
## Output Example
|
||||
|
||||
```
|
||||
============================================================
|
||||
🚀 Starting RSS Feed Crawler
|
||||
============================================================
|
||||
Found 3 active feed(s)
|
||||
|
||||
📰 Crawling feed: Süddeutsche Zeitung München
|
||||
URL: https://www.sueddeutsche.de/muenchen/rss
|
||||
🔍 Crawling: New U-Bahn Line Opens in Munich...
|
||||
✓ Saved (1250 words)
|
||||
🔍 Crawling: Munich Weather Update...
|
||||
✓ Saved (450 words)
|
||||
✓ Crawled 2 articles from Süddeutsche Zeitung München
|
||||
|
||||
============================================================
|
||||
✓ Crawling Complete!
|
||||
Total feeds processed: 3
|
||||
Total articles crawled: 15
|
||||
Duration: 45.23 seconds
|
||||
============================================================
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**No feeds found:**
|
||||
- Make sure you've added RSS feeds via the backend API
|
||||
- Check MongoDB connection
|
||||
|
||||
**Can't extract content:**
|
||||
- Some sites block scrapers
|
||||
- Some sites require JavaScript (not supported yet)
|
||||
- Check if the URL is accessible
|
||||
|
||||
**Timeout errors:**
|
||||
- Increase timeout in the code
|
||||
- Check your internet connection
|
||||
|
||||
## Next Steps
|
||||
|
||||
Once articles are crawled, you can:
|
||||
- View them in the frontend
|
||||
- Use Ollama to summarize them
|
||||
- Generate newsletters with full content
|
||||
- Perform text analysis
|
||||
225
news_crawler/README.md
Normal file
@@ -0,0 +1,225 @@
|
||||
# News Crawler Microservice
|
||||
|
||||
A standalone microservice that crawls full article content from RSS feeds and stores it in MongoDB.
|
||||
|
||||
## Features
|
||||
|
||||
- 🔍 Extracts full article content from RSS feed links
|
||||
- 📊 Calculates word count
|
||||
- 🔄 Avoids re-crawling already processed articles
|
||||
- ⏱️ Rate limiting (1 second delay between requests)
|
||||
- 🎯 Smart content extraction using multiple selectors
|
||||
- 🧹 Cleans up scripts, styles, and navigation elements
|
||||
|
||||
## Installation
|
||||
|
||||
1. Create a virtual environment:
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Configure environment variables:
|
||||
Create a `.env` file in the project root (or use the backend's `.env`):
|
||||
```env
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Standalone Execution
|
||||
|
||||
Run the crawler directly:
|
||||
|
||||
```bash
|
||||
# Crawl up to 10 articles per feed (default)
|
||||
python crawler_service.py
|
||||
|
||||
# Crawl up to 20 articles per feed
|
||||
python crawler_service.py 20
|
||||
```
|
||||
|
||||
### As a Module
|
||||
|
||||
```python
|
||||
from crawler_service import crawl_all_feeds, crawl_rss_feed
|
||||
|
||||
# Crawl all active feeds
|
||||
result = crawl_all_feeds(max_articles_per_feed=10)
|
||||
print(result)
|
||||
|
||||
# Crawl a specific feed
|
||||
crawl_rss_feed(
|
||||
feed_url='https://example.com/rss',
|
||||
feed_name='Example News',
|
||||
max_articles=10
|
||||
)
|
||||
```
|
||||
|
||||
### Via Backend API
|
||||
|
||||
The backend has integrated endpoints:
|
||||
|
||||
```bash
|
||||
# Start crawler
|
||||
curl -X POST http://localhost:5001/api/crawler/start
|
||||
|
||||
# Check status
|
||||
curl http://localhost:5001/api/crawler/status
|
||||
|
||||
# Crawl specific feed
|
||||
curl -X POST http://localhost:5001/api/crawler/feed/<feed_id>
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Fetch RSS Feeds**: Gets all active RSS feeds from MongoDB
|
||||
2. **Parse Feed**: Extracts article links from each feed
|
||||
3. **Crawl Content**: For each article:
|
||||
- Fetches HTML page
|
||||
- Removes scripts, styles, navigation
|
||||
- Extracts main content using smart selectors
|
||||
- Calculates word count
|
||||
4. **Store Data**: Saves to MongoDB with metadata
|
||||
5. **Skip Duplicates**: Avoids re-crawling articles with existing content
|
||||
|
||||
## Content Extraction Strategy
|
||||
|
||||
The crawler tries multiple selectors in order:
|
||||
|
||||
1. `<article>` tag
|
||||
2. Elements with class containing "article-content", "article-body"
|
||||
3. Elements with class containing "post-content", "entry-content"
|
||||
4. `<main>` tag
|
||||
5. Fallback to all `<p>` tags in body
|
||||
|
||||
## Database Schema
|
||||
|
||||
Articles are stored with these fields:
|
||||
|
||||
```javascript
|
||||
{
|
||||
title: String, // Article title
|
||||
link: String, // Article URL (unique)
|
||||
summary: String, // Short summary
|
||||
full_content: String, // Full article text (max 10,000 chars)
|
||||
word_count: Number, // Number of words
|
||||
source: String, // RSS feed name
|
||||
published_at: String, // Publication date
|
||||
crawled_at: DateTime, // When content was crawled
|
||||
created_at: DateTime // When added to database
|
||||
}
|
||||
```
|
||||
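The upsert-by-`link` logic (and the `DuplicateKeyError` handling in the crawler) assumes a unique index on `link`. If your database does not have one yet, it can be created once with pymongo; this is a sketch using the same default connection and `munich_news` database:

```python
from pymongo import MongoClient, ASCENDING

client = MongoClient('mongodb://localhost:27017/')
db = client['munich_news']

# Enforce one document per article URL
db.articles.create_index([('link', ASCENDING)], unique=True)
```

`create_index` is idempotent, so running it more than once is safe.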
|
||||
## Scheduling
|
||||
|
||||
### Using Cron (Linux/Mac)
|
||||
|
||||
```bash
|
||||
# Run every 6 hours
|
||||
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
|
||||
```
|
||||
|
||||
### Using systemd Timer (Linux)
|
||||
|
||||
Create `/etc/systemd/system/news-crawler.service`:
|
||||
```ini
|
||||
[Unit]
|
||||
Description=News Crawler Service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
WorkingDirectory=/path/to/news_crawler
|
||||
ExecStart=/path/to/venv/bin/python crawler_service.py
|
||||
User=your-user
|
||||
```
|
||||
|
||||
Create `/etc/systemd/system/news-crawler.timer`:
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Run News Crawler every 6 hours
|
||||
|
||||
[Timer]
|
||||
OnBootSec=5min
|
||||
OnUnitActiveSec=6h
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
```
|
||||
|
||||
Enable and start:
|
||||
```bash
|
||||
sudo systemctl enable news-crawler.timer
|
||||
sudo systemctl start news-crawler.timer
|
||||
```
|
||||
|
||||
### Using Docker
|
||||
|
||||
Create `Dockerfile`:
|
||||
```dockerfile
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY crawler_service.py rss_utils.py config.py ollama_client.py ./
|
||||
|
||||
CMD ["python", "crawler_service.py"]
|
||||
```
|
||||
|
||||
Build and run:
|
||||
```bash
|
||||
docker build -t news-crawler .
|
||||
docker run --env-file ../.env news-crawler
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Environment variables:
|
||||
|
||||
- `MONGODB_URI` - MongoDB connection string (default: `mongodb://localhost:27017/`)
|
||||
|
||||
## Rate Limiting
|
||||
|
||||
- 1 second delay between article requests (see the sketch below)
|
||||
- Respects server resources
|
||||
- User-Agent header included
|
||||
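In code, these two behaviors boil down to a fixed `time.sleep()` between requests and a browser-like `User-Agent` header, as in this condensed, self-contained sketch (the helper name `fetch_politely` is illustrative, not part of the service):

```python
import time

import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}


def fetch_politely(urls, delay=1.0, timeout=10):
    """Fetch each URL with a browser-like User-Agent and a fixed delay between requests."""
    pages = []
    for url in urls:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        pages.append(response.text)
        time.sleep(delay)  # rate limit: one request per `delay` seconds
    return pages
```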
|
||||
## Troubleshooting
|
||||
|
||||
**Issue: Can't extract content**
|
||||
- Some sites block scrapers
|
||||
- Try adjusting the User-Agent header
|
||||
- Some sites require JavaScript (consider Selenium)
|
||||
|
||||
**Issue: Timeout errors**
|
||||
- Increase timeout in `extract_article_content()`
|
||||
- Check network connectivity
|
||||
|
||||
**Issue: Memory usage**
|
||||
- Reduce `max_articles_per_feed`
|
||||
- Content limited to 10,000 characters per article
|
||||
|
||||
## Architecture
|
||||
|
||||
This is a standalone microservice that:
|
||||
- Can run independently of the main backend
|
||||
- Shares the same MongoDB database
|
||||
- Can be deployed separately
|
||||
- Can be scheduled independently
|
||||
|
||||
## Next Steps
|
||||
|
||||
Once articles are crawled, you can:
|
||||
- Use Ollama to summarize articles
|
||||
- Perform sentiment analysis
|
||||
- Extract keywords and topics
|
||||
- Generate newsletter content
|
||||
- Create article recommendations
|
||||
194
news_crawler/RSS_URL_EXTRACTION.md
Normal file
@@ -0,0 +1,194 @@
|
||||
# RSS URL Extraction - How It Works
|
||||
|
||||
## The Problem
|
||||
|
||||
Different RSS feed providers use different fields to store the article URL:
|
||||
|
||||
### Example 1: Standard RSS (uses `link`)
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<link>https://example.com/article/123</link>
|
||||
<guid>internal-id-456</guid>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Example 2: Some feeds (uses `guid` as URL)
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<guid>https://example.com/article/123</guid>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Example 3: Atom feeds (uses `id`)
|
||||
```xml
|
||||
<entry>
|
||||
<title>Article Title</title>
|
||||
<id>https://example.com/article/123</id>
|
||||
</entry>
|
||||
```
|
||||
|
||||
### Example 4: Complex feeds (guid as object)
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<guid isPermaLink="true">https://example.com/article/123</guid>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Example 5: Multiple links
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<link rel="alternate" type="text/html" href="https://example.com/article/123"/>
|
||||
<link rel="enclosure" type="image/jpeg" href="https://example.com/image.jpg"/>
|
||||
</item>
|
||||
```
|
||||
|
||||
## Our Solution
|
||||
|
||||
The `extract_article_url()` function tries multiple strategies in order:
|
||||
|
||||
### Strategy 1: Check `link` field (most common)
|
||||
```python
|
||||
if entry.get('link') and entry.get('link', '').startswith('http'):
|
||||
return entry.get('link')
|
||||
```
|
||||
✅ Works for: Most RSS 2.0 feeds
|
||||
|
||||
### Strategy 2: Check `guid` field
|
||||
```python
|
||||
if entry.get('guid'):
|
||||
guid = entry.get('guid')
|
||||
# guid can be a string
|
||||
if isinstance(guid, str) and guid.startswith('http'):
|
||||
return guid
|
||||
# or a dict with 'href'
|
||||
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
|
||||
return guid.get('href')
|
||||
```
|
||||
✅ Works for: Feeds that use GUID as permalink
|
||||
|
||||
### Strategy 3: Check `id` field
|
||||
```python
|
||||
if entry.get('id') and entry.get('id', '').startswith('http'):
|
||||
return entry.get('id')
|
||||
```
|
||||
✅ Works for: Atom feeds
|
||||
|
||||
### Strategy 4: Check `links` array
|
||||
```python
|
||||
if entry.get('links'):
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
# Prefer 'alternate' type
|
||||
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
|
||||
return link.get('href')
|
||||
```
|
||||
✅ Works for: Feeds with multiple links (prefers HTML content)
|
||||
|
||||
## Real-World Examples
|
||||
|
||||
### Süddeutsche Zeitung
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Munich News',
|
||||
'link': 'https://www.sueddeutsche.de/muenchen/article-123',
|
||||
'guid': 'sz-internal-123'
|
||||
}
|
||||
# Returns: 'https://www.sueddeutsche.de/muenchen/article-123'
|
||||
```
|
||||
|
||||
### Medium Blog
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Blog Post',
|
||||
'guid': 'https://medium.com/@user/post-abc123',
|
||||
'link': None
|
||||
}
|
||||
# Returns: 'https://medium.com/@user/post-abc123'
|
||||
```
|
||||
|
||||
### YouTube RSS
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Video Title',
|
||||
'id': 'https://www.youtube.com/watch?v=abc123',
|
||||
'link': None
|
||||
}
|
||||
# Returns: 'https://www.youtube.com/watch?v=abc123'
|
||||
```
|
||||
|
||||
### Complex Feed
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Article',
|
||||
'links': [
|
||||
{'rel': 'alternate', 'type': 'text/html', 'href': 'https://example.com/article'},
|
||||
{'rel': 'enclosure', 'type': 'image/jpeg', 'href': 'https://example.com/image.jpg'}
|
||||
]
|
||||
}
|
||||
# Returns: 'https://example.com/article' (prefers text/html)
|
||||
```
|
||||
|
||||
## Validation
|
||||
|
||||
All extracted URLs must:
|
||||
1. Start with `http://` or `https://`
|
||||
2. Be a valid string (not None or empty)
|
||||
|
||||
If no valid URL is found:
|
||||
```python
|
||||
return None
|
||||
# Crawler will skip this entry and log a warning
|
||||
```
|
||||
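Those rules fit in a tiny helper. The following is a hypothetical sketch for illustration (the checks live inline in `rss_utils.py`; no such function exists there):

```python
def is_valid_article_url(url):
    """Return True if url is a non-empty string starting with http:// or https://."""
    return isinstance(url, str) and url.startswith(('http://', 'https://'))


# is_valid_article_url('https://example.com/article')   -> True
# is_valid_article_url(None)                            -> False
# is_valid_article_url('mailto:editor@example.com')     -> False
```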
|
||||
## Testing Different Feeds
|
||||
|
||||
To test if a feed works with our extractor:
|
||||
|
||||
```python
|
||||
import feedparser
|
||||
from rss_utils import extract_article_url
|
||||
|
||||
# Parse feed
|
||||
feed = feedparser.parse('https://example.com/rss')
|
||||
|
||||
# Test each entry
|
||||
for entry in feed.entries[:5]:
|
||||
url = extract_article_url(entry)
|
||||
if url:
|
||||
print(f"✓ {entry.get('title', 'No title')[:50]}")
|
||||
print(f" URL: {url}")
|
||||
else:
|
||||
print(f"✗ {entry.get('title', 'No title')[:50]}")
|
||||
print(f" No valid URL found")
|
||||
print(f" Available fields: {list(entry.keys())}")
|
||||
```
|
||||
|
||||
## Supported Feed Types
|
||||
|
||||
✅ RSS 2.0
|
||||
✅ RSS 1.0
|
||||
✅ Atom
|
||||
✅ Custom RSS variants
|
||||
✅ Feeds with multiple links
|
||||
✅ Feeds with GUID as permalink
|
||||
|
||||
## Edge Cases Handled
|
||||
|
||||
1. **GUID is not a URL**: Checks if it starts with `http`
|
||||
2. **Multiple links**: Prefers `text/html` type
|
||||
3. **GUID as dict**: Extracts `href` field
|
||||
4. **Missing fields**: Returns None instead of crashing
|
||||
5. **Non-HTTP URLs**: Filters out `mailto:`, `ftp:`, etc.
|
||||
|
||||
## Future Improvements
|
||||
|
||||
Potential enhancements:
|
||||
- [ ] Support for `feedburner:origLink`
|
||||
- [ ] Support for `pheedo:origLink`
|
||||
- [ ] Resolve shortened URLs (bit.ly, etc.)
|
||||
- [ ] Handle relative URLs (convert to absolute, see the sketch below)
|
||||
- [ ] Cache URL extraction results
|
||||
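For the relative-URL item, the standard library already covers the conversion; a possible (not yet implemented) approach would use the feed URL as the base:

```python
from urllib.parse import urljoin


def resolve_url(candidate, feed_url):
    """Turn a relative article link into an absolute one, using the feed URL as base."""
    if not candidate:
        return None
    absolute = urljoin(feed_url, candidate)
    return absolute if absolute.startswith('http') else None


# resolve_url('/muenchen/article-123', 'https://www.sueddeutsche.de/muenchen/rss')
# -> 'https://www.sueddeutsche.de/muenchen/article-123'
```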
79
news_crawler/check_database.py
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Quick script to check what RSS feeds are in the database
|
||||
"""
|
||||
from pymongo import MongoClient
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add parent directory to path to import from backend
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'backend'))
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', 'backend', '.env'))
|
||||
except:
|
||||
pass
|
||||
|
||||
# MongoDB setup
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
print(f"Connecting to: {MONGODB_URI}")
|
||||
print(f"Database: {DB_NAME}\n")
|
||||
|
||||
try:
|
||||
client = MongoClient(MONGODB_URI, serverSelectionTimeoutMS=5000)
|
||||
# Test connection
|
||||
client.server_info()
|
||||
print("✓ Connected to MongoDB\n")
|
||||
|
||||
db = client[DB_NAME]
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
# Get all feeds
|
||||
feeds = list(rss_feeds_collection.find())
|
||||
|
||||
if not feeds:
|
||||
print("❌ No RSS feeds found in database\n")
|
||||
print("Add feeds using the API:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Found {len(feeds)} RSS feed(s):\n")
|
||||
print("="*80)
|
||||
|
||||
for i, feed in enumerate(feeds, 1):
|
||||
print(f"\n{i}. {feed.get('name', 'Unknown')}")
|
||||
print(f" URL: {feed.get('url', 'N/A')}")
|
||||
print(f" Active: {feed.get('active', True)}")
|
||||
print(f" Created: {feed.get('created_at', 'N/A')}")
|
||||
print(f" ID: {feed.get('_id', 'N/A')}")
|
||||
|
||||
print("\n" + "="*80)
|
||||
|
||||
# Check articles
|
||||
articles_collection = db['articles']
|
||||
total_articles = articles_collection.count_documents({})
|
||||
crawled_articles = articles_collection.count_documents({'full_content': {'$exists': True}})
|
||||
|
||||
print(f"\nArticles in database:")
|
||||
print(f" Total: {total_articles}")
|
||||
print(f" With full content: {crawled_articles}")
|
||||
print(f" Without full content: {total_articles - crawled_articles}")
|
||||
|
||||
if total_articles > 0:
|
||||
print("\nSample article:")
|
||||
sample = articles_collection.find_one()
|
||||
print(f" Title: {sample.get('title', 'N/A')[:60]}")
|
||||
print(f" Link: {sample.get('link', 'N/A')}")
|
||||
print(f" Has full_content: {bool(sample.get('full_content'))}")
|
||||
print(f" Word count: {sample.get('word_count', 'N/A')}")
|
||||
|
||||
print("\n✓ Database check complete!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
sys.exit(1)
|
||||
90
news_crawler/config.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Configuration management for news crawler
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from pathlib import Path
|
||||
|
||||
# Load environment variables from backend/.env
|
||||
backend_dir = Path(__file__).parent.parent / 'backend'
|
||||
env_path = backend_dir / '.env'
|
||||
|
||||
if env_path.exists():
|
||||
load_dotenv(dotenv_path=env_path)
|
||||
print(f"✓ Loaded configuration from: {env_path}")
|
||||
else:
|
||||
print(f"⚠ Warning: .env file not found at {env_path}")
|
||||
|
||||
|
||||
class Config:
|
||||
"""Centralized configuration for news crawler"""
|
||||
|
||||
# MongoDB Configuration
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
# Ollama Configuration
|
||||
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
||||
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
|
||||
OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
|
||||
OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
|
||||
OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))
|
||||
|
||||
# Crawler Configuration
|
||||
RATE_LIMIT_DELAY = 1 # seconds between requests
|
||||
MAX_CONTENT_LENGTH = 50000 # characters
|
||||
SUMMARY_MAX_WORDS = 150 # maximum words in AI summary
|
||||
|
||||
@classmethod
|
||||
def print_config(cls):
|
||||
"""Print current configuration (without sensitive data)"""
|
||||
print("\n" + "="*60)
|
||||
print("News Crawler Configuration")
|
||||
print("="*60)
|
||||
print(f"MongoDB URI: {cls.MONGODB_URI}")
|
||||
print(f"Database: {cls.DB_NAME}")
|
||||
print(f"\nOllama Configuration:")
|
||||
print(f" Base URL: {cls.OLLAMA_BASE_URL}")
|
||||
print(f" Model: {cls.OLLAMA_MODEL}")
|
||||
print(f" Enabled: {cls.OLLAMA_ENABLED}")
|
||||
print(f" Timeout: {cls.OLLAMA_TIMEOUT}s")
|
||||
print(f" Has API Key: {bool(cls.OLLAMA_API_KEY)}")
|
||||
print(f"\nCrawler Settings:")
|
||||
print(f" Rate Limit: {cls.RATE_LIMIT_DELAY}s between requests")
|
||||
print(f" Max Content: {cls.MAX_CONTENT_LENGTH} chars")
|
||||
print(f" Summary Length: {cls.SUMMARY_MAX_WORDS} words")
|
||||
print("="*60 + "\n")
|
||||
|
||||
@classmethod
|
||||
def validate(cls):
|
||||
"""Validate configuration and return list of issues"""
|
||||
issues = []
|
||||
|
||||
# Check MongoDB
|
||||
if not cls.MONGODB_URI:
|
||||
issues.append("MONGODB_URI is not set")
|
||||
|
||||
# Check Ollama if enabled
|
||||
if cls.OLLAMA_ENABLED:
|
||||
if not cls.OLLAMA_BASE_URL:
|
||||
issues.append("OLLAMA_BASE_URL is not set but Ollama is enabled")
|
||||
if not cls.OLLAMA_MODEL:
|
||||
issues.append("OLLAMA_MODEL is not set but Ollama is enabled")
|
||||
if cls.OLLAMA_TIMEOUT < 5:
|
||||
issues.append(f"OLLAMA_TIMEOUT ({cls.OLLAMA_TIMEOUT}s) is too low, recommend at least 5s")
|
||||
|
||||
return issues
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Test configuration
|
||||
Config.print_config()
|
||||
|
||||
# Validate
|
||||
issues = Config.validate()
|
||||
if issues:
|
||||
print("⚠ Configuration Issues:")
|
||||
for issue in issues:
|
||||
print(f" - {issue}")
|
||||
else:
|
||||
print("✓ Configuration is valid")
|
||||
489
news_crawler/crawler_service.py
Normal file
@@ -0,0 +1,489 @@
|
||||
"""
|
||||
Web crawler service to extract full article content from RSS feed links
|
||||
"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
import feedparser
|
||||
import time
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
from config import Config
|
||||
from ollama_client import OllamaClient
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
|
||||
# MongoDB setup
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
articles_collection = db['articles']
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
# Initialize Ollama client
|
||||
ollama_client = OllamaClient(
|
||||
base_url=Config.OLLAMA_BASE_URL,
|
||||
model=Config.OLLAMA_MODEL,
|
||||
api_key=Config.OLLAMA_API_KEY,
|
||||
enabled=Config.OLLAMA_ENABLED,
|
||||
timeout=Config.OLLAMA_TIMEOUT
|
||||
)
|
||||
|
||||
# Print configuration on startup
|
||||
if __name__ != '__main__':
|
||||
Config.print_config()
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print("🤖 Ollama AI summarization: ENABLED")
|
||||
if ollama_client.is_available():
|
||||
print("✓ Ollama server is reachable")
|
||||
else:
|
||||
print("⚠ Warning: Ollama server is not reachable")
|
||||
else:
|
||||
print("ℹ Ollama AI summarization: DISABLED")
|
||||
|
||||
|
||||
def get_active_rss_feeds():
|
||||
"""Get all active RSS feeds from database"""
|
||||
feeds = []
|
||||
cursor = rss_feeds_collection.find({'active': True})
|
||||
for feed in cursor:
|
||||
feeds.append({
|
||||
'id': str(feed['_id']),
|
||||
'name': feed.get('name', ''),
|
||||
'url': feed.get('url', '')
|
||||
})
|
||||
return feeds
|
||||
|
||||
|
||||
def extract_article_content(url, timeout=10):
|
||||
"""
|
||||
Extract main article content from a URL with smart detection
|
||||
Returns: dict with title, content, author, date, and metadata
|
||||
"""
|
||||
try:
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Remove script and style elements
|
||||
for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
|
||||
script.decompose()
|
||||
|
||||
# === EXTRACT TITLE ===
|
||||
title = extract_title(soup)
|
||||
|
||||
# === EXTRACT AUTHOR ===
|
||||
author = extract_author(soup)
|
||||
|
||||
# === EXTRACT PUBLISHED DATE ===
|
||||
published_date = extract_date(soup)
|
||||
|
||||
# === EXTRACT MAIN CONTENT ===
|
||||
content_text = extract_main_content(soup)
|
||||
|
||||
# === EXTRACT META DESCRIPTION ===
|
||||
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
||||
if not meta_desc:
|
||||
meta_desc = soup.find('meta', attrs={'property': 'og:description'})
|
||||
description = meta_desc.get('content', '') if meta_desc else ''
|
||||
|
||||
return {
|
||||
'title': title,
|
||||
'author': author,
|
||||
'content': content_text, # Full content, no limit
|
||||
'description': description,
|
||||
'published_date': published_date,
|
||||
'word_count': len(content_text.split()) if content_text else 0,
|
||||
'crawled_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
print(f"Timeout crawling {url}")
|
||||
return None
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Error crawling {url}: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Unexpected error crawling {url}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def extract_title(soup):
|
||||
"""
|
||||
Extract article title using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Look for h1 tag
|
||||
h1 = soup.find('h1')
|
||||
if h1:
|
||||
title = h1.get_text().strip()
|
||||
if title and len(title) > 10: # Reasonable title length
|
||||
return title
|
||||
|
||||
# Strategy 2: Look for meta og:title
|
||||
og_title = soup.find('meta', attrs={'property': 'og:title'})
|
||||
if og_title and og_title.get('content'):
|
||||
return og_title.get('content').strip()
|
||||
|
||||
# Strategy 3: Look for meta twitter:title
|
||||
twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
|
||||
if twitter_title and twitter_title.get('content'):
|
||||
return twitter_title.get('content').strip()
|
||||
|
||||
# Strategy 4: Look for title tag (fallback)
|
||||
title_tag = soup.find('title')
|
||||
if title_tag:
|
||||
title = title_tag.get_text().strip()
|
||||
# Clean up common patterns like "Site Name | Article Title"
|
||||
if ' | ' in title:
|
||||
title = title.split(' | ')[0]
|
||||
elif ' - ' in title:
|
||||
title = title.split(' - ')[0]
|
||||
return title
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_author(soup):
|
||||
"""
|
||||
Extract article author using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Look for meta author
|
||||
meta_author = soup.find('meta', attrs={'name': 'author'})
|
||||
if meta_author and meta_author.get('content'):
|
||||
return meta_author.get('content').strip()
|
||||
|
||||
# Strategy 2: Look for rel="author"
|
||||
rel_author = soup.find('a', attrs={'rel': 'author'})
|
||||
if rel_author:
|
||||
return rel_author.get_text().strip()
|
||||
|
||||
# Strategy 3: Look for common author class names
|
||||
author_selectors = [
|
||||
'[class*="author-name"]',
|
||||
'[class*="author"]',
|
||||
'[class*="byline"]',
|
||||
'[class*="writer"]',
|
||||
'[rel="author"]',
|
||||
'[itemprop="author"]'
|
||||
]
|
||||
|
||||
for selector in author_selectors:
|
||||
author_elem = soup.select_one(selector)
|
||||
if author_elem:
|
||||
author = author_elem.get_text().strip()
|
||||
# Clean up common patterns
|
||||
author = author.replace('By ', '').replace('by ', '').strip()
|
||||
if author and len(author) < 100: # Reasonable author name length
|
||||
return author
|
||||
|
||||
# Strategy 4: Look for JSON-LD structured data
|
||||
json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
|
||||
if json_ld:
|
||||
try:
|
||||
import json
|
||||
data = json.loads(json_ld.string)
|
||||
if isinstance(data, dict) and data.get('author'):
|
||||
author_data = data.get('author')
|
||||
if isinstance(author_data, dict):
|
||||
return author_data.get('name', '')
|
||||
elif isinstance(author_data, str):
|
||||
return author_data
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_date(soup):
|
||||
"""
|
||||
Extract published date using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Look for time tag with datetime attribute
|
||||
time_tag = soup.find('time')
|
||||
if time_tag and time_tag.get('datetime'):
|
||||
return time_tag.get('datetime')
|
||||
|
||||
# Strategy 2: Look for meta article:published_time
|
||||
meta_published = soup.find('meta', attrs={'property': 'article:published_time'})
|
||||
if meta_published and meta_published.get('content'):
|
||||
return meta_published.get('content')
|
||||
|
||||
# Strategy 3: Look for meta og:published_time
|
||||
og_published = soup.find('meta', attrs={'property': 'og:published_time'})
|
||||
if og_published and og_published.get('content'):
|
||||
return og_published.get('content')
|
||||
|
||||
# Strategy 4: Look for common date class names
|
||||
date_selectors = [
|
||||
'[class*="publish-date"]',
|
||||
'[class*="published"]',
|
||||
'[class*="date"]',
|
||||
'[class*="timestamp"]',
|
||||
'[itemprop="datePublished"]'
|
||||
]
|
||||
|
||||
for selector in date_selectors:
|
||||
date_elem = soup.select_one(selector)
|
||||
if date_elem:
|
||||
# Try datetime attribute first
|
||||
if date_elem.get('datetime'):
|
||||
return date_elem.get('datetime')
|
||||
# Otherwise get text
|
||||
date_text = date_elem.get_text().strip()
|
||||
if date_text and len(date_text) < 50:
|
||||
return date_text
|
||||
|
||||
# Strategy 5: Look for JSON-LD structured data
|
||||
json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
|
||||
if json_ld:
|
||||
try:
|
||||
import json
|
||||
data = json.loads(json_ld.string)
|
||||
if isinstance(data, dict):
|
||||
return data.get('datePublished') or data.get('dateCreated')
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_main_content(soup):
|
||||
"""
|
||||
Extract main article content using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Try common article content selectors
|
||||
content_selectors = [
|
||||
'article',
|
||||
'[class*="article-content"]',
|
||||
'[class*="article-body"]',
|
||||
'[class*="post-content"]',
|
||||
'[class*="entry-content"]',
|
||||
'[class*="content-body"]',
|
||||
'[class*="story-body"]',
|
||||
'[itemprop="articleBody"]',
|
||||
'main'
|
||||
]
|
||||
|
||||
article_content = None
|
||||
for selector in content_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
article_content = element
|
||||
break
|
||||
|
||||
# Fallback: get body
|
||||
if not article_content:
|
||||
article_content = soup.find('body')
|
||||
|
||||
if not article_content:
|
||||
return ''
|
||||
|
||||
# Extract text from paragraphs
|
||||
paragraphs = article_content.find_all('p')
|
||||
|
||||
# Filter out short paragraphs (likely navigation/ads)
|
||||
content_paragraphs = []
|
||||
for p in paragraphs:
|
||||
text = p.get_text().strip()
|
||||
# Keep paragraphs with at least 50 characters
|
||||
if len(text) >= 50:
|
||||
content_paragraphs.append(text)
|
||||
|
||||
content_text = '\n\n'.join(content_paragraphs)
|
||||
|
||||
return content_text
|
||||
|
||||
|
||||
def crawl_rss_feed(feed_url, feed_name, max_articles=10):
|
||||
"""
|
||||
Crawl articles from an RSS feed
|
||||
Returns: dict with statistics
|
||||
"""
|
||||
print(f"\n📰 Crawling feed: {feed_name}")
|
||||
print(f" URL: {feed_url}")
|
||||
|
||||
try:
|
||||
# Parse RSS feed
|
||||
feed = feedparser.parse(feed_url)
|
||||
|
||||
if not feed.entries:
|
||||
print(f" ⚠ No entries found in feed")
|
||||
            return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}  # same shape as the success path so crawl_all_feeds can aggregate
|
||||
|
||||
crawled_count = 0
|
||||
summarized_count = 0
|
||||
failed_summaries = 0
|
||||
|
||||
for entry in feed.entries[:max_articles]:
|
||||
# Extract article URL using utility function
|
||||
article_url = extract_article_url(entry)
|
||||
|
||||
if not article_url:
|
||||
print(f" ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}")
|
||||
continue
|
||||
|
||||
# Check if article already exists and has content
|
||||
existing = articles_collection.find_one({'link': article_url})
|
||||
if existing and existing.get('content'):
|
||||
print(f" ⏭ Skipping (already crawled): {entry.get('title', 'No title')[:50]}")
|
||||
continue
|
||||
|
||||
print(f" 🔍 Crawling: {entry.get('title', 'No title')[:50]}...")
|
||||
|
||||
# Extract full content
|
||||
article_data = extract_article_content(article_url)
|
||||
|
||||
if article_data and article_data.get('content'):
|
||||
# Summarize with Ollama if enabled
|
||||
summary_result = None
|
||||
if Config.OLLAMA_ENABLED and article_data.get('content'):
|
||||
print(f" 🤖 Summarizing with AI...")
|
||||
summary_result = ollama_client.summarize_article(
|
||||
article_data['content'],
|
||||
max_words=Config.SUMMARY_MAX_WORDS
|
||||
)
|
||||
|
||||
if summary_result['success']:
|
||||
print(f" ✓ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)")
|
||||
summarized_count += 1
|
||||
else:
|
||||
print(f" ⚠ Summarization failed: {summary_result['error']}")
|
||||
failed_summaries += 1
|
||||
|
||||
# Prepare document
|
||||
article_doc = {
|
||||
'title': article_data.get('title') or entry.get('title', ''),
|
||||
'author': article_data.get('author'),
|
||||
'link': article_url,
|
||||
'content': article_data.get('content', ''), # Full article content
|
||||
'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
|
||||
'word_count': article_data.get('word_count', 0),
|
||||
'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
|
||||
'source': feed_name,
|
||||
'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
|
||||
'crawled_at': article_data.get('crawled_at'),
|
||||
'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
try:
|
||||
# Upsert: update if exists, insert if not
|
||||
articles_collection.update_one(
|
||||
{'link': article_url},
|
||||
{'$set': article_doc},
|
||||
upsert=True
|
||||
)
|
||||
crawled_count += 1
|
||||
print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
|
||||
|
||||
except DuplicateKeyError:
|
||||
print(f" ⚠ Duplicate key error")
|
||||
except Exception as e:
|
||||
print(f" ✗ Error saving: {e}")
|
||||
else:
|
||||
print(f" ✗ Failed to extract content")
|
||||
|
||||
# Be nice to servers - add delay
|
||||
time.sleep(1)
|
||||
|
||||
print(f" ✓ Crawled {crawled_count} articles from {feed_name}")
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print(f" 🤖 Summarized: {summarized_count}, Failed: {failed_summaries}")
|
||||
|
||||
return {
|
||||
'crawled': crawled_count,
|
||||
'summarized': summarized_count,
|
||||
'failed_summaries': failed_summaries
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error processing feed {feed_name}: {e}")
|
||||
        return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}  # same shape as the success path so crawl_all_feeds can aggregate
|
||||
|
||||
|
||||
def crawl_all_feeds(max_articles_per_feed=10):
|
||||
"""
|
||||
Crawl all active RSS feeds
|
||||
Returns: dict with statistics
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("🚀 Starting RSS Feed Crawler")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
feeds = get_active_rss_feeds()
|
||||
|
||||
if not feeds:
|
||||
print("⚠ No active RSS feeds found")
|
||||
return {
|
||||
'total_feeds': 0,
|
||||
'total_articles_crawled': 0,
|
||||
'duration_seconds': 0
|
||||
}
|
||||
|
||||
print(f"Found {len(feeds)} active feed(s)")
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print(f"🤖 AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)")
|
||||
|
||||
total_crawled = 0
|
||||
total_summarized = 0
|
||||
total_failed = 0
|
||||
|
||||
for feed in feeds:
|
||||
result = crawl_rss_feed(
|
||||
feed['url'],
|
||||
feed['name'],
|
||||
max_articles=max_articles_per_feed
|
||||
)
|
||||
total_crawled += result['crawled']
|
||||
total_summarized += result['summarized']
|
||||
total_failed += result['failed_summaries']
|
||||
|
||||
duration = time.time() - start_time
|
||||
|
||||
print("\n" + "="*60)
|
||||
print(f"✓ Crawling Complete!")
|
||||
print(f" Total feeds processed: {len(feeds)}")
|
||||
print(f" Total articles crawled: {total_crawled}")
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print(f" Total articles summarized: {total_summarized}")
|
||||
print(f" Failed summarizations: {total_failed}")
|
||||
if total_summarized > 0:
|
||||
success_rate = (total_summarized / (total_summarized + total_failed)) * 100
|
||||
print(f" Success rate: {success_rate:.1f}%")
|
||||
print(f" Duration: {duration:.2f} seconds")
|
||||
if total_crawled > 0:
|
||||
print(f" Average time per article: {duration/total_crawled:.1f}s")
|
||||
print("="*60 + "\n")
|
||||
|
||||
return {
|
||||
'total_feeds': len(feeds),
|
||||
'total_articles_crawled': total_crawled,
|
||||
'total_summarized': total_summarized,
|
||||
'failed_summaries': total_failed,
|
||||
'duration_seconds': round(duration, 2)
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Can be run standalone for testing
|
||||
import sys
|
||||
max_articles = 10
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
try:
|
||||
max_articles = int(sys.argv[1])
|
||||
except ValueError:
|
||||
print("Usage: python crawler_service.py [max_articles_per_feed]")
|
||||
sys.exit(1)
|
||||
|
||||
crawl_all_feeds(max_articles_per_feed=max_articles)
|
||||
33
news_crawler/docker-compose.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
crawler:
|
||||
build: .
|
||||
container_name: news-crawler
|
||||
environment:
|
||||
- MONGODB_URI=mongodb://mongodb:27017/
|
||||
networks:
|
||||
- munich-news-network
|
||||
depends_on:
|
||||
- mongodb
|
||||
# Run once and exit
|
||||
restart: "no"
|
||||
|
||||
mongodb:
|
||||
image: mongo:7.0
|
||||
container_name: munich-news-mongodb
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "27017:27017"
|
||||
volumes:
|
||||
- mongodb_data:/data/db
|
||||
networks:
|
||||
- munich-news-network
|
||||
|
||||
volumes:
|
||||
mongodb_data:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
munich-news-network:
|
||||
driver: bridge
|
||||
290
news_crawler/ollama_client.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""
|
||||
Ollama client for AI-powered article summarization
|
||||
"""
|
||||
import requests
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class OllamaClient:
|
||||
"""Client for communicating with Ollama server for text summarization"""
|
||||
|
||||
def __init__(self, base_url, model, api_key=None, enabled=True, timeout=30):
|
||||
"""
|
||||
Initialize Ollama client
|
||||
|
||||
Args:
|
||||
base_url: Ollama server URL (e.g., http://localhost:11434)
|
||||
model: Model name to use (e.g., phi3:latest)
|
||||
api_key: Optional API key for authentication
|
||||
enabled: Whether Ollama is enabled
|
||||
timeout: Request timeout in seconds (default 30)
|
||||
"""
|
||||
self.base_url = base_url.rstrip('/')
|
||||
self.model = model
|
||||
self.api_key = api_key
|
||||
self.enabled = enabled
|
||||
self.timeout = timeout
|
||||
|
||||
def summarize_article(self, content, max_words=150):
|
||||
"""
|
||||
Summarize article content using Ollama
|
||||
|
||||
Args:
|
||||
content: Full article text
|
||||
max_words: Maximum words in summary (default 150)
|
||||
|
||||
Returns:
|
||||
{
|
||||
'summary': str, # AI-generated summary
|
||||
'summary_word_count': int, # Summary word count
|
||||
'original_word_count': int, # Original article word count
|
||||
'success': bool, # Whether summarization succeeded
|
||||
'error': str or None, # Error message if failed
|
||||
'duration': float # Time taken in seconds
|
||||
}
|
||||
"""
|
||||
if not self.enabled:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': 0,
|
||||
'success': False,
|
||||
'error': 'Ollama is not enabled',
|
||||
'duration': 0
|
||||
}
|
||||
|
||||
if not content or len(content.strip()) == 0:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': 0,
|
||||
'success': False,
|
||||
'error': 'Content is empty',
|
||||
'duration': 0
|
||||
}
|
||||
|
||||
# Calculate original word count
|
||||
original_word_count = len(content.split())
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Construct prompt
|
||||
prompt = self._build_summarization_prompt(content, max_words)
|
||||
|
||||
# Prepare request
|
||||
url = f"{self.base_url}/api/generate"
|
||||
headers = {'Content-Type': 'application/json'}
|
||||
if self.api_key:
|
||||
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||
|
||||
payload = {
|
||||
'model': self.model,
|
||||
'prompt': prompt,
|
||||
'stream': False,
|
||||
'options': {
|
||||
'temperature': 0.7,
|
||||
'num_predict': 250 # Limit response length
|
||||
}
|
||||
}
|
||||
|
||||
# Make request
|
||||
response = requests.post(
|
||||
url,
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=self.timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse response
|
||||
result = response.json()
|
||||
summary = result.get('response', '').strip()
|
||||
|
||||
if not summary:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': 'Ollama returned empty summary',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
summary_word_count = len(summary.split())
|
||||
|
||||
return {
|
||||
'summary': summary,
|
||||
'summary_word_count': summary_word_count,
|
||||
'original_word_count': original_word_count,
|
||||
'success': True,
|
||||
'error': None,
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'Request timed out after {self.timeout} seconds',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
except requests.exceptions.ConnectionError:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'Cannot connect to Ollama server at {self.base_url}',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
except requests.exceptions.HTTPError as e:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'HTTP error: {e.response.status_code} - {e.response.text[:100]}',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'Unexpected error: {str(e)}',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
def _build_summarization_prompt(self, content, max_words):
|
||||
"""Build prompt for article summarization"""
|
||||
# Truncate content if too long (keep first 5000 words)
|
||||
words = content.split()
|
||||
if len(words) > 5000:
|
||||
content = ' '.join(words[:5000]) + '...'
|
||||
|
||||
prompt = f"""Summarize the following article in English in {max_words} words or less. Even if the article is in German or another language, provide the summary in English. Focus on the key points, main message, and important details. Be concise and clear.
|
||||
|
||||
Article:
|
||||
{content}
|
||||
|
||||
English Summary (max {max_words} words):"""
|
||||
|
||||
return prompt
|
||||
|
||||
def is_available(self):
|
||||
"""
|
||||
Check if Ollama server is reachable
|
||||
|
||||
Returns:
|
||||
bool: True if server is reachable, False otherwise
|
||||
"""
|
||||
if not self.enabled:
|
||||
return False
|
||||
|
||||
try:
|
||||
url = f"{self.base_url}/api/tags"
|
||||
headers = {}
|
||||
if self.api_key:
|
||||
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=5)
|
||||
response.raise_for_status()
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def test_connection(self):
|
||||
"""
|
||||
Test connection and return server info
|
||||
|
||||
Returns:
|
||||
{
|
||||
'available': bool,
|
||||
'models': list,
|
||||
'current_model': str,
|
||||
'error': str or None
|
||||
}
|
||||
"""
|
||||
if not self.enabled:
|
||||
return {
|
||||
'available': False,
|
||||
'models': [],
|
||||
'current_model': self.model,
|
||||
'error': 'Ollama is not enabled'
|
||||
}
|
||||
|
||||
try:
|
||||
url = f"{self.base_url}/api/tags"
|
||||
headers = {}
|
||||
if self.api_key:
|
||||
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=5)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
models = [m.get('name', '') for m in result.get('models', [])]
|
||||
|
||||
return {
|
||||
'available': True,
|
||||
'models': models,
|
||||
'current_model': self.model,
|
||||
'error': None
|
||||
}
|
||||
except requests.exceptions.ConnectionError:
|
||||
return {
|
||||
'available': False,
|
||||
'models': [],
|
||||
'current_model': self.model,
|
||||
'error': f'Cannot connect to Ollama server at {self.base_url}'
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
'available': False,
|
||||
'models': [],
|
||||
'current_model': self.model,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Quick test
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
|
||||
client = OllamaClient(
|
||||
base_url=os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434'),
|
||||
model=os.getenv('OLLAMA_MODEL', 'phi3:latest'),
|
||||
enabled=True
|
||||
)
|
||||
|
||||
print("Testing Ollama connection...")
|
||||
result = client.test_connection()
|
||||
print(f"Available: {result['available']}")
|
||||
print(f"Models: {result['models']}")
|
||||
print(f"Current model: {result['current_model']}")
|
||||
|
||||
if result['available']:
|
||||
print("\nTesting summarization...")
|
||||
test_content = """
|
||||
The new U-Bahn line connecting Munich's city center with the airport opened today.
|
||||
Mayor Dieter Reiter attended the opening ceremony along with hundreds of residents.
|
||||
The line will significantly reduce travel time between the airport and downtown Munich.
|
||||
Construction took five years and cost approximately 2 billion euros.
|
||||
The new line includes 10 stations and runs every 10 minutes during peak hours.
|
||||
"""
|
||||
|
||||
summary_result = client.summarize_article(test_content, max_words=50)
|
||||
print(f"Success: {summary_result['success']}")
|
||||
print(f"Summary: {summary_result['summary']}")
|
||||
print(f"Original word count: {summary_result['original_word_count']}")
|
||||
print(f"Summary word count: {summary_result['summary_word_count']}")
|
||||
print(f"Compression: {summary_result['original_word_count'] / max(summary_result['summary_word_count'], 1):.1f}x")
|
||||
print(f"Duration: {summary_result['duration']:.2f}s")
|
||||
6
news_crawler/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
beautifulsoup4==4.12.2
|
||||
lxml==4.9.3
|
||||
requests==2.31.0
|
||||
feedparser==6.0.10
|
||||
pymongo==4.6.1
|
||||
python-dotenv==1.0.0
|
||||
98
news_crawler/rss_utils.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
Utility functions for RSS feed processing
|
||||
"""
|
||||
|
||||
|
||||
def extract_article_url(entry):
|
||||
"""
|
||||
Extract article URL from RSS entry.
|
||||
Different RSS feeds use different fields for the article URL.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Article URL or None if not found
|
||||
|
||||
Examples:
|
||||
- Most feeds use 'link'
|
||||
- Some use 'guid' as the URL
|
||||
- Some use 'id' as the URL
|
||||
- Some have guid as a dict with 'href'
|
||||
"""
|
||||
# Try 'link' first (most common)
|
||||
if entry.get('link') and entry.get('link', '').startswith('http'):
|
||||
return entry.get('link')
|
||||
|
||||
# Try 'guid' if it's a valid URL
|
||||
if entry.get('guid'):
|
||||
guid = entry.get('guid')
|
||||
# guid can be a string
|
||||
if isinstance(guid, str) and guid.startswith('http'):
|
||||
return guid
|
||||
# or a dict with 'href'
|
||||
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
|
||||
return guid.get('href')
|
||||
|
||||
# Try 'id' if it's a valid URL
|
||||
if entry.get('id') and entry.get('id', '').startswith('http'):
|
||||
return entry.get('id')
|
||||
|
||||
# Try 'links' array (some feeds have multiple links)
|
||||
if entry.get('links'):
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
# Prefer 'alternate' type, but accept any http link
|
||||
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
|
||||
return link.get('href')
|
||||
# If no alternate found, return first http link
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
return link.get('href')
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_article_summary(entry):
|
||||
"""
|
||||
Extract article summary/description from RSS entry.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Article summary or empty string
|
||||
"""
|
||||
# Try different fields
|
||||
if entry.get('summary'):
|
||||
return entry.get('summary', '')
|
||||
elif entry.get('description'):
|
||||
return entry.get('description', '')
|
||||
elif entry.get('content'):
|
||||
# content is usually a list of dicts
|
||||
content = entry.get('content', [])
|
||||
if content and isinstance(content, list) and len(content) > 0:
|
||||
return content[0].get('value', '')
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
def extract_published_date(entry):
|
||||
"""
|
||||
Extract published date from RSS entry.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Published date or empty string
|
||||
"""
|
||||
# Try different fields
|
||||
if entry.get('published'):
|
||||
return entry.get('published', '')
|
||||
elif entry.get('updated'):
|
||||
return entry.get('updated', '')
|
||||
elif entry.get('created'):
|
||||
return entry.get('created', '')
|
||||
|
||||
return ''
|
||||
83
news_crawler/test_crawler.py
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test script to verify crawler functionality
|
||||
"""
|
||||
from crawler_service import extract_article_content, get_active_rss_feeds
|
||||
import sys
|
||||
|
||||
|
||||
def test_content_extraction():
|
||||
"""Test content extraction from a sample URL"""
|
||||
print("Testing content extraction...")
|
||||
|
||||
# Test with a simple news site
|
||||
test_url = "https://www.bbc.com/news"
|
||||
|
||||
print(f"Extracting content from: {test_url}")
|
||||
result = extract_article_content(test_url, timeout=10)
|
||||
|
||||
if result:
|
||||
print("✓ Content extraction successful!")
|
||||
print(f" Title: {result.get('title', 'N/A')[:50]}...")
|
||||
print(f" Content length: {len(result.get('content', ''))} chars")
|
||||
print(f" Word count: {result.get('word_count', 0)}")
|
||||
return True
|
||||
else:
|
||||
print("✗ Content extraction failed")
|
||||
return False
|
||||
|
||||
|
||||
def test_database_connection():
|
||||
"""Test MongoDB connection"""
|
||||
print("\nTesting database connection...")
|
||||
|
||||
try:
|
||||
feeds = get_active_rss_feeds()
|
||||
print(f"✓ Database connection successful!")
|
||||
print(f" Found {len(feeds)} active RSS feed(s)")
|
||||
|
||||
if feeds:
|
||||
print("\n Active feeds:")
|
||||
for feed in feeds:
|
||||
print(f" - {feed['name']}: {feed['url']}")
|
||||
else:
|
||||
print("\n ⚠ No active feeds found. Add feeds via the backend API:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Database connection failed: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
print("="*60)
|
||||
print("News Crawler - Test Suite")
|
||||
print("="*60 + "\n")
|
||||
|
||||
# Test database connection
|
||||
db_ok = test_database_connection()
|
||||
|
||||
# Test content extraction
|
||||
extract_ok = test_content_extraction()
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Test Results:")
|
||||
print(f" Database Connection: {'✓ PASS' if db_ok else '✗ FAIL'}")
|
||||
print(f" Content Extraction: {'✓ PASS' if extract_ok else '✗ FAIL'}")
|
||||
print("="*60 + "\n")
|
||||
|
||||
if db_ok and extract_ok:
|
||||
print("✓ All tests passed! Crawler is ready to use.")
|
||||
print("\nRun the crawler with:")
|
||||
print(" python crawler_service.py")
|
||||
return 0
|
||||
else:
|
||||
print("✗ Some tests failed. Please check the errors above.")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
129
news_crawler/test_ollama.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test script for Ollama integration
|
||||
Tests connection, configuration, and summarization
|
||||
"""
|
||||
import sys

from config import Config
|
||||
from ollama_client import OllamaClient
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("Ollama Integration Test")
|
||||
print("="*70)
|
||||
|
||||
# Print configuration
|
||||
Config.print_config()
|
||||
|
||||
# Validate configuration
|
||||
issues = Config.validate()
|
||||
if issues:
|
||||
print("⚠ Configuration Issues:")
|
||||
for issue in issues:
|
||||
print(f" - {issue}")
|
||||
print()
|
||||
|
||||
# Initialize client
|
||||
client = OllamaClient(
|
||||
base_url=Config.OLLAMA_BASE_URL,
|
||||
model=Config.OLLAMA_MODEL,
|
||||
api_key=Config.OLLAMA_API_KEY,
|
||||
enabled=Config.OLLAMA_ENABLED,
|
||||
timeout=Config.OLLAMA_TIMEOUT
|
||||
)
|
||||
|
||||
# Test 1: Check if Ollama is enabled
|
||||
print("Test 1: Configuration Check")
|
||||
print(f" Ollama Enabled: {Config.OLLAMA_ENABLED}")
|
||||
if not Config.OLLAMA_ENABLED:
|
||||
print(" ⚠ Ollama is disabled. Set OLLAMA_ENABLED=true in .env to enable.")
|
||||
print("\n" + "="*70)
|
||||
sys.exit(0)
|
||||
|
||||
# Test 2: Test connection
|
||||
print("\nTest 2: Connection Test")
|
||||
conn_result = client.test_connection()
|
||||
print(f" Available: {conn_result['available']}")
|
||||
print(f" Current Model: {conn_result['current_model']}")
|
||||
|
||||
if conn_result['available']:
|
||||
print(f" ✓ Connected to Ollama server")
|
||||
if conn_result['models']:
|
||||
print(f" Available models: {', '.join(conn_result['models'][:5])}")
|
||||
if conn_result['current_model'] not in conn_result['models']:
|
||||
print(f" ⚠ Warning: Model '{conn_result['current_model']}' not found in available models")
|
||||
else:
|
||||
print(f" ✗ Connection failed: {conn_result['error']}")
|
||||
print("\n" + "="*70)
|
||||
sys.exit(1)
|
||||
|
||||
# Test 3: Test summarization with sample article
|
||||
print("\nTest 3: Summarization Test")
|
||||
print(" Testing with sample German article...")
|
||||
|
||||
sample_article = """
|
||||
Die neue U-Bahn-Linie, die das Münchner Stadtzentrum mit dem Flughafen verbindet, wurde heute eröffnet.
|
||||
Oberbürgermeister Dieter Reiter nahm zusammen mit hunderten Anwohnern an der Eröffnungszeremonie teil.
|
||||
Die Linie wird die Reisezeit zwischen dem Flughafen und der Münchner Innenstadt erheblich verkürzen.
|
||||
Der Bau dauerte fünf Jahre und kostete etwa 2 Milliarden Euro. Die neue Linie umfasst 10 Stationen
|
||||
und verkehrt während der Hauptverkehrszeiten alle 10 Minuten. Experten erwarten, dass die neue Verbindung
|
||||
den Verkehr in der Stadt deutlich entlasten wird. Die Münchner Verkehrsgesellschaft rechnet mit täglich
|
||||
über 50.000 Fahrgästen auf der neuen Strecke.
|
||||
"""
|
||||
|
||||
result = client.summarize_article(sample_article, max_words=Config.SUMMARY_MAX_WORDS)
|
||||
|
||||
print(f"\n Success: {result['success']}")
|
||||
if result['success']:
|
||||
print(f" ✓ Summarization successful!")
|
||||
print(f"\n Original word count: {result['original_word_count']}")
|
||||
print(f" Summary word count: {result['summary_word_count']}")
|
||||
print(f" Compression ratio: {result['original_word_count'] / max(result['summary_word_count'], 1):.1f}x")
|
||||
print(f" Duration: {result['duration']:.2f}s")
|
||||
print(f"\n Summary (English):")
|
||||
print(f" {'-'*70}")
|
||||
print(f" {result['summary']}")
|
||||
print(f" {'-'*70}")
|
||||
else:
|
||||
print(f" ✗ Summarization failed: {result['error']}")
|
||||
|
||||
# Test 4: Test with English article
|
||||
print("\nTest 4: English Article Test")
|
||||
print(" Testing with English article...")
|
||||
|
||||
english_article = """
|
||||
The city council approved a new bike lane network spanning 50 kilometers across Munich.
|
||||
The project aims to promote sustainable transportation and reduce car traffic in the city center.
|
||||
Construction will begin next month and is expected to be completed within two years.
|
||||
The bike lanes will connect major residential areas with business districts and public transport hubs.
|
||||
Environmental groups have praised the initiative as a significant step toward carbon neutrality.
|
||||
"""
|
||||
|
||||
result2 = client.summarize_article(english_article, max_words=50)
|
||||
|
||||
print(f"\n Success: {result2['success']}")
|
||||
if result2['success']:
|
||||
print(f" ✓ Summarization successful!")
|
||||
print(f" Original: {result2['original_word_count']} words → Summary: {result2['summary_word_count']} words")
|
||||
print(f" Duration: {result2['duration']:.2f}s")
|
||||
print(f"\n Summary:")
|
||||
print(f" {result2['summary']}")
|
||||
else:
|
||||
print(f" ✗ Summarization failed: {result2['error']}")
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*70)
|
||||
print("Test Summary")
|
||||
print("="*70)
|
||||
print(f"✓ Configuration: Valid")
|
||||
print(f"✓ Connection: {'Success' if conn_result['available'] else 'Failed'}")
|
||||
print(f"✓ German→English: {'Success' if result['success'] else 'Failed'}")
|
||||
print(f"✓ English→English: {'Success' if result2['success'] else 'Failed'}")
|
||||
print("="*70)
|
||||
|
||||
if result['success'] and result2['success']:
|
||||
print("\n🎉 All tests passed! Ollama integration is working correctly.")
|
||||
print("\nYou can now run the crawler with AI summarization:")
|
||||
print(" python crawler_service.py 5")
|
||||
else:
|
||||
print("\n⚠ Some tests failed. Check the errors above.")
|
||||
|
||||
print()
|
||||
154
news_crawler/test_rss_feeds.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test script to verify RSS feed URL extraction
|
||||
Tests actual feeds from the database
|
||||
"""
|
||||
import feedparser
|
||||
from pymongo import MongoClient
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
|
||||
# MongoDB setup
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
client = MongoClient(MONGODB_URI)
|
||||
db = client[DB_NAME]
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
|
||||
def test_feed(feed_name, feed_url):
|
||||
"""Test a single RSS feed"""
|
||||
print(f"\n{'='*70}")
|
||||
print(f"Testing: {feed_name}")
|
||||
print(f"URL: {feed_url}")
|
||||
print('='*70)
|
||||
|
||||
try:
|
||||
# Parse the feed
|
||||
print("Fetching RSS feed...")
|
||||
feed = feedparser.parse(feed_url)
|
||||
|
||||
if not feed.entries:
|
||||
print("❌ No entries found in feed")
|
||||
return False
|
||||
|
||||
print(f"✓ Found {len(feed.entries)} entries\n")
|
||||
|
||||
# Test first 5 entries
|
||||
success_count = 0
|
||||
fail_count = 0
|
||||
|
||||
for i, entry in enumerate(feed.entries[:5], 1):
|
||||
print(f"\n--- Entry {i} ---")
|
||||
print(f"Title: {entry.get('title', 'No title')[:60]}")
|
||||
|
||||
# Test URL extraction
|
||||
article_url = extract_article_url(entry)
|
||||
if article_url:
|
||||
print(f"✓ URL: {article_url}")
|
||||
success_count += 1
|
||||
else:
|
||||
print(f"❌ No valid URL found")
|
||||
print(f" Available fields: {list(entry.keys())}")
|
||||
print(f" link: {entry.get('link', 'N/A')}")
|
||||
print(f" guid: {entry.get('guid', 'N/A')}")
|
||||
print(f" id: {entry.get('id', 'N/A')}")
|
||||
fail_count += 1
|
||||
|
||||
# Test summary extraction
|
||||
summary = extract_article_summary(entry)
|
||||
if summary:
|
||||
print(f"✓ Summary: {summary[:80]}...")
|
||||
else:
|
||||
print(f"⚠ No summary found")
|
||||
|
||||
# Test date extraction
|
||||
pub_date = extract_published_date(entry)
|
||||
if pub_date:
|
||||
print(f"✓ Published: {pub_date}")
|
||||
else:
|
||||
print(f"⚠ No published date found")
|
||||
|
||||
print(f"\n{'='*70}")
|
||||
print(f"Results for {feed_name}:")
|
||||
print(f" ✓ Success: {success_count}/5")
|
||||
print(f" ❌ Failed: {fail_count}/5")
|
||||
print('='*70)
|
||||
|
||||
return fail_count == 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error testing feed: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
print("\n" + "="*70)
|
||||
print("RSS Feed URL Extraction Test")
|
||||
print("="*70)
|
||||
|
||||
# Get all RSS feeds from database
|
||||
print("\nFetching RSS feeds from database...")
|
||||
feeds = list(rss_feeds_collection.find())
|
||||
|
||||
if not feeds:
|
||||
print("❌ No RSS feeds found in database")
|
||||
print("\nAdd feeds using:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
|
||||
return 1
|
||||
|
||||
print(f"✓ Found {len(feeds)} feed(s) in database\n")
|
||||
|
||||
# Test each feed
|
||||
results = {}
|
||||
for feed in feeds:
|
||||
feed_name = feed.get('name', 'Unknown')
|
||||
feed_url = feed.get('url', '')
|
||||
active = feed.get('active', True)
|
||||
|
||||
if not active:
|
||||
print(f"\n⏭ Skipping inactive feed: {feed_name}")
|
||||
continue
|
||||
|
||||
if not feed_url:
|
||||
print(f"\n❌ Feed '{feed_name}' has no URL")
|
||||
results[feed_name] = False
|
||||
continue
|
||||
|
||||
results[feed_name] = test_feed(feed_name, feed_url)
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*70)
|
||||
print("FINAL RESULTS")
|
||||
print("="*70)
|
||||
|
||||
for feed_name, success in results.items():
|
||||
status = "✓ PASS" if success else "❌ FAIL"
|
||||
print(f"{status} - {feed_name}")
|
||||
|
||||
total = len(results)
|
||||
passed = sum(1 for s in results.values() if s)
|
||||
|
||||
print(f"\nTotal: {passed}/{total} feeds passed")
|
||||
print("="*70 + "\n")
|
||||
|
||||
if passed == total:
|
||||
print("✓ All feeds are working correctly!")
|
||||
print("\nYou can now run the crawler:")
|
||||
print(" python crawler_service.py")
|
||||
return 0
|
||||
else:
|
||||
print("⚠ Some feeds have issues. Check the output above.")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
sys.exit(main())
|
||||
28
news_sender/.gitignore
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
.venv
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# Generated files
|
||||
newsletter_preview.html
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
303
news_sender/README.md
Normal file
@@ -0,0 +1,303 @@
|
||||
# News Sender Microservice
|
||||
|
||||
Standalone service for sending Munich News Daily newsletters to subscribers.
|
||||
|
||||
## Features
|
||||
|
||||
- 📧 Sends beautiful HTML newsletters
|
||||
- 🤖 Uses AI-generated article summaries
|
||||
- 📊 Tracks sending statistics
|
||||
- 🧪 Test mode for development
|
||||
- 📝 Preview generation
|
||||
- 🔄 Fetches data from shared MongoDB
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
cd news_sender
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
The service uses the same `.env` file as the backend (`../backend/.env`):
|
||||
|
||||
```env
|
||||
# MongoDB
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
|
||||
# Email (Gmail example)
|
||||
SMTP_SERVER=smtp.gmail.com
|
||||
SMTP_PORT=587
|
||||
EMAIL_USER=your-email@gmail.com
|
||||
EMAIL_PASSWORD=your-app-password
|
||||
|
||||
# Newsletter Settings (optional)
|
||||
NEWSLETTER_MAX_ARTICLES=10
|
||||
WEBSITE_URL=http://localhost:3000
|
||||
```
|
||||
|
||||
**Gmail Setup:**
|
||||
1. Enable 2-factor authentication
|
||||
2. Generate an App Password: https://support.google.com/accounts/answer/185833
|
||||
3. Use the App Password (not your regular password)
|
||||
|
||||
## Usage
|
||||
|
||||
### 1. Preview Newsletter
|
||||
|
||||
Generate HTML preview without sending:
|
||||
|
||||
```bash
|
||||
python sender_service.py preview
|
||||
```
|
||||
|
||||
This creates `newsletter_preview.html` - open it in your browser to see how the newsletter looks.
|
||||
|
||||
### 2. Send Test Email
|
||||
|
||||
Send to a single email address for testing:
|
||||
|
||||
```bash
|
||||
python sender_service.py test your-email@example.com
|
||||
```
|
||||
|
||||
### 3. Send to All Subscribers
|
||||
|
||||
Send newsletter to all active subscribers:
|
||||
|
||||
```bash
|
||||
# Send with default article count (10)
|
||||
python sender_service.py send
|
||||
|
||||
# Send with custom article count
|
||||
python sender_service.py send 15
|
||||
```
|
||||
|
||||
### 4. Use as Python Module
|
||||
|
||||
```python
|
||||
from sender_service import send_newsletter, preview_newsletter
|
||||
|
||||
# Send newsletter
|
||||
result = send_newsletter(max_articles=10)
|
||||
print(f"Sent to {result['sent_count']} subscribers")
|
||||
|
||||
# Generate preview
|
||||
html = preview_newsletter(max_articles=5)
|
||||
```
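The dict returned by `send_newsletter` mirrors the statistics printed on the console. A minimal sketch of inspecting it (key names as defined in `sender_service.py`):

```python
result = send_newsletter(max_articles=10)

if not result['success']:
    # e.g. missing email credentials, no summarized articles, or no subscribers
    print(f"Newsletter not sent: {result['error']}")
else:
    print(f"Sent {result['sent_count']}/{result['total_subscribers']} "
          f"({result['article_count']} articles included)")
    for failure in result['errors']:
        print(f"  failed: {failure['email']} ({failure['error']})")
```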
|
||||
|
||||
## How It Works
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ 1. Fetch Articles from MongoDB │
|
||||
│ - Get latest articles with AI summaries │
|
||||
│ - Sort by creation date (newest first) │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ 2. Fetch Active Subscribers │
|
||||
│ - Get all subscribers with status='active' │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ 3. Render Newsletter HTML │
|
||||
│ - Load newsletter_template.html │
|
||||
│ - Populate with articles and metadata │
|
||||
│ - Generate beautiful HTML email │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ 4. Send Emails │
|
||||
│ - Connect to SMTP server │
|
||||
│ - Send to each subscriber │
|
||||
│ - Track success/failure │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────────────────────────┐
|
||||
│ 5. Report Statistics │
|
||||
│ - Total sent │
|
||||
│ - Failed sends │
|
||||
│ - Error details │
|
||||
└─────────────────────────────────────────────────────────┘
|
||||
```
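The `send` command wraps all five steps. For reference, a simplified sketch of the same flow using the functions exported by `sender_service.py` (no error handling, assumes `.env` is configured):

```python
from datetime import datetime

from sender_service import (
    get_latest_articles,      # step 1: articles that already have AI summaries
    get_active_subscribers,   # step 2: subscribers with status='active'
    render_newsletter_html,   # step 3: Jinja2 rendering of newsletter_template.html
    send_email,               # step 4: SMTP delivery to one recipient
)

articles = get_latest_articles(max_articles=10)
subscribers = get_active_subscribers()
html = render_newsletter_html(articles)

subject = f"Munich News Daily - {datetime.now().strftime('%B %d, %Y')}"
sent = sum(send_email(email, subject, html)[0] for email in subscribers)
print(f"Sent {sent}/{len(subscribers)}")  # step 5: report
```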
|
||||
|
||||
## Output Example
|
||||
|
||||
```
|
||||
======================================================================
|
||||
📧 Munich News Daily - Newsletter Sender
|
||||
======================================================================
|
||||
|
||||
Fetching latest 10 articles with AI summaries...
|
||||
✓ Found 10 articles
|
||||
|
||||
Fetching active subscribers...
|
||||
✓ Found 150 active subscriber(s)
|
||||
|
||||
Rendering newsletter HTML...
|
||||
✓ Newsletter rendered
|
||||
|
||||
Sending newsletter: 'Munich News Daily - November 10, 2024'
|
||||
----------------------------------------------------------------------
|
||||
[1/150] Sending to user1@example.com... ✓
|
||||
[2/150] Sending to user2@example.com... ✓
|
||||
[3/150] Sending to user3@example.com... ✓
|
||||
...
|
||||
|
||||
======================================================================
|
||||
📊 Sending Complete
|
||||
======================================================================
|
||||
✓ Successfully sent: 148
|
||||
✗ Failed: 2
|
||||
📰 Articles included: 10
|
||||
======================================================================
|
||||
```
|
||||
|
||||
## Scheduling
|
||||
|
||||
### Using Cron (Linux/Mac)
|
||||
|
||||
Send newsletter daily at 8 AM:
|
||||
|
||||
```bash
|
||||
# Edit crontab
|
||||
crontab -e
|
||||
|
||||
# Add this line
|
||||
0 8 * * * cd /path/to/news_sender && /path/to/venv/bin/python sender_service.py send
|
||||
```
|
||||
|
||||
### Using systemd Timer (Linux)
|
||||
|
||||
Create `/etc/systemd/system/news-sender.service`:
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Munich News Sender
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
WorkingDirectory=/path/to/news_sender
|
||||
ExecStart=/path/to/venv/bin/python sender_service.py send
|
||||
User=your-user
|
||||
```
|
||||
|
||||
Create `/etc/systemd/system/news-sender.timer`:
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Send Munich News Daily at 8 AM
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 08:00:00
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
```
|
||||
|
||||
Enable and start:
|
||||
|
||||
```bash
|
||||
sudo systemctl enable news-sender.timer
|
||||
sudo systemctl start news-sender.timer
|
||||
```
|
||||
|
||||
### Using Docker
|
||||
|
||||
Create `Dockerfile`:
|
||||
|
||||
```dockerfile
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY sender_service.py newsletter_template.html ./
|
||||
|
||||
CMD ["python", "sender_service.py", "send"]
|
||||
```
|
||||
|
||||
Build and run:
|
||||
|
||||
```bash
|
||||
docker build -t news-sender .
|
||||
docker run --env-file ../backend/.env news-sender
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Email credentials not configured"
|
||||
- Check that `EMAIL_USER` and `EMAIL_PASSWORD` are set in `.env`
|
||||
- For Gmail, use an App Password, not your regular password
|
||||
|
||||
### "No articles with summaries found"
|
||||
- Run the crawler first: `cd ../news_crawler && python crawler_service.py 10`
|
||||
- Make sure Ollama is enabled and working
|
||||
- Check that MongoDB has articles with a `summary` field (see the sketch below)
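A quick way to verify the last point is to run the same MongoDB filter that `get_latest_articles` uses (connection string and database name as in `sender_service.py`; adjust if yours differ):

```python
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['munich_news']

# Same filter as get_latest_articles(): articles that already carry an AI summary
count = db['articles'].count_documents({'summary': {'$exists': True, '$ne': None}})
print(f"{count} article(s) with summaries")
```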
|
||||
|
||||
### "No active subscribers found"
|
||||
- Add subscribers via the backend API
|
||||
- Check subscriber status is 'active' in MongoDB
|
||||
|
||||
### SMTP Connection Errors
|
||||
- Verify SMTP server and port are correct
|
||||
- Check firewall isn't blocking SMTP port
|
||||
- For Gmail, use an App Password; Google no longer supports "Less secure app access" (the sketch below checks credentials directly)
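To rule out credential or network problems, the SMTP settings can be checked in isolation with the same `smtplib` calls that `send_email` makes. The server, port, and credentials below are placeholders; use the values from your `.env`:

```python
import smtplib

SMTP_SERVER = 'smtp.gmail.com'  # Config.SMTP_SERVER
SMTP_PORT = 587                 # Config.SMTP_PORT

with smtplib.SMTP(SMTP_SERVER, SMTP_PORT, timeout=10) as server:
    server.starttls()
    server.login('your-email@gmail.com', 'your-app-password')
    print("✓ SMTP login OK")
```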
|
||||
|
||||
### Emails Going to Spam
|
||||
- Set up SPF, DKIM, and DMARC records for your domain
|
||||
- Use a verified email address
|
||||
- Avoid spam trigger words in subject/content
|
||||
- Include unsubscribe link (already included in template)
|
||||
|
||||
## Architecture
|
||||
|
||||
This is a standalone microservice that:
|
||||
- Runs independently of the backend
|
||||
- Shares the same MongoDB database
|
||||
- Can be deployed separately
|
||||
- Can be scheduled independently
|
||||
- Has no dependencies on backend code
|
||||
|
||||
## Integration with Other Services
|
||||
|
||||
```
|
||||
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
||||
│ Backend │ │ Crawler │ │ Sender │
|
||||
│ (Flask) │ │ (Scraper) │ │ (Email) │
|
||||
└──────┬───────┘ └──────┬───────┘ └──────┬───────┘
|
||||
│ │ │
|
||||
│ │ │
|
||||
└────────────────────┴─────────────────────┘
|
||||
│
|
||||
┌───────▼────────┐
|
||||
│ MongoDB │
|
||||
│ (Shared DB) │
|
||||
└────────────────┘
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Test the newsletter:**
|
||||
```bash
|
||||
python sender_service.py test your-email@example.com
|
||||
```
|
||||
|
||||
2. **Schedule daily sending:**
|
||||
- Set up cron job or systemd timer
|
||||
- Choose appropriate time (e.g., 8 AM)
|
||||
|
||||
3. **Monitor sending:**
|
||||
- Check logs for errors
|
||||
- Track open rates (requires email tracking service)
|
||||
- Monitor spam complaints
|
||||
|
||||
4. **Optimize:**
|
||||
- Add email tracking pixels
|
||||
- A/B test subject lines
|
||||
- Personalize content per subscriber
|
||||
162
news_sender/newsletter_template.html
Normal file
@@ -0,0 +1,162 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<title>Munich News Daily</title>
|
||||
<!--[if mso]>
|
||||
<style type="text/css">
|
||||
body, table, td {font-family: Arial, Helvetica, sans-serif !important;}
|
||||
</style>
|
||||
<![endif]-->
|
||||
</head>
|
||||
<body style="margin: 0; padding: 0; background-color: #f4f4f4; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;">
|
||||
<!-- Wrapper Table -->
|
||||
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f4f4f4;">
|
||||
<tr>
|
||||
<td align="center" style="padding: 20px 0;">
|
||||
<!-- Main Container -->
|
||||
<table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="background-color: #ffffff; max-width: 600px;">
|
||||
|
||||
<!-- Header -->
|
||||
<tr>
|
||||
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
|
||||
<h1 style="margin: 0 0 8px 0; font-size: 28px; font-weight: 700; color: #ffffff; letter-spacing: -0.5px;">
|
||||
Munich News Daily
|
||||
</h1>
|
||||
<p style="margin: 0; font-size: 14px; color: #999999; letter-spacing: 0.5px;">
|
||||
{{ date }}
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Greeting -->
|
||||
<tr>
|
||||
<td style="padding: 30px 40px 20px 40px;">
|
||||
<p style="margin: 0; font-size: 16px; line-height: 1.5; color: #333333;">
|
||||
Good morning ☀️
|
||||
</p>
|
||||
<p style="margin: 15px 0 0 0; font-size: 15px; line-height: 1.6; color: #666666;">
|
||||
Here's what's happening in Munich today. We've summarized {{ article_count }} stories using AI so you can stay informed in under 5 minutes.
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Divider -->
|
||||
<tr>
|
||||
<td style="padding: 0 40px;">
|
||||
<div style="height: 1px; background-color: #e0e0e0;"></div>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Articles -->
|
||||
{% for article in articles %}
|
||||
<tr>
|
||||
<td style="padding: 25px 40px;">
|
||||
<!-- Article Number Badge -->
|
||||
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
|
||||
<tr>
|
||||
<td>
|
||||
<span style="display: inline-block; background-color: #000000; color: #ffffff; width: 24px; height: 24px; line-height: 24px; text-align: center; border-radius: 50%; font-size: 12px; font-weight: 600;">
|
||||
{{ loop.index }}
|
||||
</span>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<!-- Article Title -->
|
||||
<h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
|
||||
{{ article.title }}
|
||||
</h2>
|
||||
|
||||
<!-- Article Meta -->
|
||||
<p style="margin: 0 0 12px 0; font-size: 13px; color: #999999;">
|
||||
<span style="color: #000000; font-weight: 600;">{{ article.source }}</span>
|
||||
{% if article.author %}
|
||||
<span> • {{ article.author }}</span>
|
||||
{% endif %}
|
||||
</p>
|
||||
|
||||
<!-- Article Summary -->
|
||||
<p style="margin: 0 0 15px 0; font-size: 15px; line-height: 1.6; color: #333333;">
|
||||
{{ article.summary }}
|
||||
</p>
|
||||
|
||||
<!-- Read More Link -->
|
||||
<a href="{{ article.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 14px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px;">
|
||||
Read more →
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Article Divider -->
|
||||
{% if not loop.last %}
|
||||
<tr>
|
||||
<td style="padding: 0 40px;">
|
||||
<div style="height: 1px; background-color: #f0f0f0;"></div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
<!-- Bottom Divider -->
|
||||
<tr>
|
||||
<td style="padding: 25px 40px 0 40px;">
|
||||
<div style="height: 1px; background-color: #e0e0e0;"></div>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Summary Box -->
|
||||
<tr>
|
||||
<td style="padding: 30px 40px;">
|
||||
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f8f8; border-radius: 8px;">
|
||||
<tr>
|
||||
<td style="padding: 25px; text-align: center;">
|
||||
<p style="margin: 0 0 8px 0; font-size: 13px; color: #666666; text-transform: uppercase; letter-spacing: 1px; font-weight: 600;">
|
||||
Today's Digest
|
||||
</p>
|
||||
<p style="margin: 0; font-size: 36px; font-weight: 700; color: #000000;">
|
||||
{{ article_count }}
|
||||
</p>
|
||||
<p style="margin: 8px 0 0 0; font-size: 14px; color: #666666;">
|
||||
stories • AI-summarized • 5 min read
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<!-- Footer -->
|
||||
<tr>
|
||||
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
|
||||
<p style="margin: 0 0 15px 0; font-size: 14px; color: #ffffff; font-weight: 600;">
|
||||
Munich News Daily
|
||||
</p>
|
||||
<p style="margin: 0 0 20px 0; font-size: 13px; color: #999999; line-height: 1.5;">
|
||||
AI-powered news summaries for busy people.<br>
|
||||
Delivered daily to your inbox.
|
||||
</p>
|
||||
|
||||
<!-- Footer Links -->
|
||||
<p style="margin: 0; font-size: 12px; color: #666666;">
|
||||
<a href="{{ website_link }}" style="color: #999999; text-decoration: none;">Visit Website</a>
|
||||
<span style="color: #444444;"> • </span>
|
||||
<a href="{{ unsubscribe_link }}" style="color: #999999; text-decoration: none;">Unsubscribe</a>
|
||||
</p>
|
||||
|
||||
<p style="margin: 20px 0 0 0; font-size: 11px; color: #666666;">
|
||||
© {{ year }} Munich News Daily. All rights reserved.
|
||||
</p>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
<!-- End Main Container -->
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<!-- End Wrapper Table -->
|
||||
</body>
|
||||
</html>
|
||||
3
news_sender/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
pymongo==4.6.1
|
||||
python-dotenv==1.0.0
|
||||
Jinja2==3.1.2
|
||||
313
news_sender/sender_service.py
Normal file
@@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
News Sender Service - Standalone microservice for sending newsletters
|
||||
Fetches articles from MongoDB and sends to subscribers via email
|
||||
"""
|
||||
import smtplib
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.multipart import MIMEMultipart
from email.utils import formatdate, make_msgid
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from jinja2 import Template
|
||||
from pymongo import MongoClient
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from backend/.env
|
||||
backend_dir = Path(__file__).parent.parent / 'backend'
|
||||
env_path = backend_dir / '.env'
|
||||
|
||||
if env_path.exists():
|
||||
load_dotenv(dotenv_path=env_path)
|
||||
print(f"✓ Loaded configuration from: {env_path}")
|
||||
else:
|
||||
print(f"⚠ Warning: .env file not found at {env_path}")
|
||||
|
||||
|
||||
class Config:
|
||||
"""Configuration for news sender"""
|
||||
# MongoDB
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
# Email
|
||||
SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
|
||||
SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
|
||||
EMAIL_USER = os.getenv('EMAIL_USER', '')
|
||||
EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')
|
||||
|
||||
# Newsletter
|
||||
MAX_ARTICLES = int(os.getenv('NEWSLETTER_MAX_ARTICLES', '10'))
|
||||
WEBSITE_URL = os.getenv('WEBSITE_URL', 'http://localhost:3000')
|
||||
|
||||
|
||||
# MongoDB connection
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
articles_collection = db['articles']
|
||||
subscribers_collection = db['subscribers']
|
||||
|
||||
|
||||
def get_latest_articles(max_articles=10):
|
||||
"""
|
||||
Get latest articles with AI summaries from database
|
||||
|
||||
Returns:
|
||||
list: Articles with summaries
|
||||
"""
|
||||
cursor = articles_collection.find(
|
||||
{'summary': {'$exists': True, '$ne': None}}
|
||||
).sort('created_at', -1).limit(max_articles)
|
||||
|
||||
articles = []
|
||||
for doc in cursor:
|
||||
articles.append({
|
||||
'title': doc.get('title', ''),
|
||||
'author': doc.get('author'),
|
||||
'link': doc.get('link', ''),
|
||||
'summary': doc.get('summary', ''),
|
||||
'source': doc.get('source', ''),
|
||||
'published_at': doc.get('published_at', '')
|
||||
})
|
||||
|
||||
return articles
|
||||
|
||||
|
||||
def get_active_subscribers():
|
||||
"""
|
||||
Get all active subscribers from database
|
||||
|
||||
Returns:
|
||||
list: Email addresses of active subscribers
|
||||
"""
|
||||
cursor = subscribers_collection.find({'status': 'active'})
|
||||
return [doc['email'] for doc in cursor]
|
||||
|
||||
|
||||
def render_newsletter_html(articles):
|
||||
"""
|
||||
Render newsletter HTML from template
|
||||
|
||||
Args:
|
||||
articles: List of article dictionaries
|
||||
|
||||
Returns:
|
||||
str: Rendered HTML content
|
||||
"""
|
||||
# Load template
|
||||
template_path = Path(__file__).parent / 'newsletter_template.html'
|
||||
with open(template_path, 'r', encoding='utf-8') as f:
|
||||
template_content = f.read()
|
||||
|
||||
template = Template(template_content)
|
||||
|
||||
# Prepare template data
|
||||
now = datetime.now()
|
||||
template_data = {
|
||||
'date': now.strftime('%A, %B %d, %Y'),
|
||||
'year': now.year,
|
||||
'article_count': len(articles),
|
||||
'articles': articles,
|
||||
'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe',
|
||||
'website_link': Config.WEBSITE_URL
|
||||
}
|
||||
|
||||
# Render HTML
|
||||
return template.render(**template_data)
|
||||
|
||||
|
||||
def send_email(to_email, subject, html_content):
|
||||
"""
|
||||
Send email to a single recipient
|
||||
|
||||
Args:
|
||||
to_email: Recipient email address
|
||||
subject: Email subject
|
||||
html_content: HTML content of email
|
||||
|
||||
Returns:
|
||||
tuple: (success: bool, error: str or None)
|
||||
"""
|
||||
try:
|
||||
msg = MIMEMultipart('alternative')
|
||||
msg['Subject'] = subject
|
||||
msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
|
||||
msg['To'] = to_email
|
||||
msg['Date'] = formatdate(localtime=True)  # RFC 2822 date with a valid timezone offset
|
||||
msg['Message-ID'] = make_msgid(domain='dongho.kim')  # unique ID without embedding the recipient address
|
||||
msg['X-Mailer'] = 'Munich News Daily Sender'
|
||||
|
||||
# Add plain text version as fallback
|
||||
plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
|
||||
msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
|
||||
|
||||
# Add HTML version
|
||||
msg.attach(MIMEText(html_content, 'html', 'utf-8'))
|
||||
|
||||
server = smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT)
|
||||
server.starttls()
|
||||
server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
|
||||
server.send_message(msg)
|
||||
server.quit()
|
||||
|
||||
return True, None
|
||||
except Exception as e:
|
||||
return False, str(e)
|
||||
|
||||
|
||||
def send_newsletter(max_articles=None, test_email=None):
|
||||
"""
|
||||
Send newsletter to all active subscribers
|
||||
|
||||
Args:
|
||||
max_articles: Maximum number of articles to include (default from config)
|
||||
test_email: If provided, send only to this email (for testing)
|
||||
|
||||
Returns:
|
||||
dict: Statistics about sending
|
||||
"""
|
||||
print("\n" + "="*70)
|
||||
print("📧 Munich News Daily - Newsletter Sender")
|
||||
print("="*70)
|
||||
|
||||
# Validate email configuration
|
||||
if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
|
||||
print("❌ Email credentials not configured")
|
||||
print(" Set EMAIL_USER and EMAIL_PASSWORD in .env file")
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Email credentials not configured'
|
||||
}
|
||||
|
||||
# Get articles
|
||||
max_articles = max_articles or Config.MAX_ARTICLES
|
||||
print(f"\nFetching latest {max_articles} articles with AI summaries...")
|
||||
articles = get_latest_articles(max_articles)
|
||||
|
||||
if not articles:
|
||||
print("❌ No articles with summaries found")
|
||||
print(" Run the crawler with Ollama enabled first")
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'No articles with summaries'
|
||||
}
|
||||
|
||||
print(f"✓ Found {len(articles)} articles")
|
||||
|
||||
# Get subscribers
|
||||
if test_email:
|
||||
subscribers = [test_email]
|
||||
print(f"\n🧪 Test mode: Sending to {test_email} only")
|
||||
else:
|
||||
print("\nFetching active subscribers...")
|
||||
subscribers = get_active_subscribers()
|
||||
print(f"✓ Found {len(subscribers)} active subscriber(s)")
|
||||
|
||||
if not subscribers:
|
||||
print("❌ No active subscribers found")
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'No active subscribers'
|
||||
}
|
||||
|
||||
# Render newsletter
|
||||
print("\nRendering newsletter HTML...")
|
||||
html_content = render_newsletter_html(articles)
|
||||
print("✓ Newsletter rendered")
|
||||
|
||||
# Send to subscribers
|
||||
subject = f"Munich News Daily - {datetime.now().strftime('%B %d, %Y')}"
|
||||
print(f"\nSending newsletter: '{subject}'")
|
||||
print("-" * 70)
|
||||
|
||||
sent_count = 0
|
||||
failed_count = 0
|
||||
errors = []
|
||||
|
||||
for i, email in enumerate(subscribers, 1):
|
||||
print(f"[{i}/{len(subscribers)}] Sending to {email}...", end=' ')
|
||||
success, error = send_email(email, subject, html_content)
|
||||
|
||||
if success:
|
||||
print("✓")
|
||||
sent_count += 1
|
||||
else:
|
||||
print(f"✗ {error}")
|
||||
failed_count += 1
|
||||
errors.append({'email': email, 'error': error})
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*70)
|
||||
print("📊 Sending Complete")
|
||||
print("="*70)
|
||||
print(f"✓ Successfully sent: {sent_count}")
|
||||
print(f"✗ Failed: {failed_count}")
|
||||
print(f"📰 Articles included: {len(articles)}")
|
||||
print("="*70 + "\n")
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'sent_count': sent_count,
|
||||
'failed_count': failed_count,
|
||||
'total_subscribers': len(subscribers),
|
||||
'article_count': len(articles),
|
||||
'errors': errors
|
||||
}
|
||||
|
||||
|
||||
def preview_newsletter(max_articles=None):
|
||||
"""
|
||||
Generate newsletter HTML for preview (doesn't send)
|
||||
|
||||
Args:
|
||||
max_articles: Maximum number of articles to include
|
||||
|
||||
Returns:
|
||||
str: HTML content
|
||||
"""
|
||||
max_articles = max_articles or Config.MAX_ARTICLES
|
||||
articles = get_latest_articles(max_articles)
|
||||
|
||||
if not articles:
|
||||
return "<h1>No articles with summaries found</h1><p>Run the crawler with Ollama enabled first.</p>"
|
||||
|
||||
return render_newsletter_html(articles)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
# Parse command line arguments
|
||||
if len(sys.argv) > 1:
|
||||
command = sys.argv[1]
|
||||
|
||||
if command == 'preview':
|
||||
# Generate preview HTML
|
||||
html = preview_newsletter()
|
||||
output_file = 'newsletter_preview.html'
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write(html)
|
||||
print(f"✓ Preview saved to {output_file}")
|
||||
print(f" Open it in your browser to see the newsletter")
|
||||
|
||||
elif command == 'test':
|
||||
# Send test email
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: python sender_service.py test <email>")
|
||||
sys.exit(1)
|
||||
test_email = sys.argv[2]
|
||||
send_newsletter(test_email=test_email)
|
||||
|
||||
elif command == 'send':
|
||||
# Send to all subscribers
|
||||
max_articles = int(sys.argv[2]) if len(sys.argv) > 2 else None
|
||||
send_newsletter(max_articles=max_articles)
|
||||
|
||||
else:
|
||||
print("Unknown command. Usage:")
|
||||
print(" python sender_service.py preview - Generate HTML preview")
|
||||
print(" python sender_service.py test <email> - Send test email")
|
||||
print(" python sender_service.py send [count] - Send to all subscribers")
|
||||
else:
|
||||
# Default: send newsletter
|
||||
send_newsletter()
|
||||
96
test_feeds_quick.py
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Quick test script - Run from project root with backend venv activated
|
||||
Usage:
|
||||
cd /path/to/munich-news
|
||||
source backend/venv/bin/activate # or backend/venv/Scripts/activate on Windows
|
||||
python test_feeds_quick.py
|
||||
"""
|
||||
import sys
|
||||
sys.path.insert(0, 'backend')
|
||||
|
||||
from pymongo import MongoClient
|
||||
from config import Config
|
||||
import feedparser
|
||||
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
|
||||
print("="*80)
|
||||
print("RSS Feed Test - Checking Database Feeds")
|
||||
print("="*80)
|
||||
|
||||
# Connect to database
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
# Get RSS feeds
|
||||
feeds = list(db['rss_feeds'].find())
|
||||
|
||||
if not feeds:
|
||||
print("\n❌ No RSS feeds in database!")
|
||||
print("\nAdd a feed first:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Test Feed\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\n✓ Found {len(feeds)} feed(s) in database\n")
|
||||
|
||||
# Test each feed
|
||||
for feed_doc in feeds:
|
||||
name = feed_doc.get('name', 'Unknown')
|
||||
url = feed_doc.get('url', '')
|
||||
active = feed_doc.get('active', True)
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Feed: {name}")
|
||||
print(f"URL: {url}")
|
||||
print(f"Active: {active}")
|
||||
print('='*80)
|
||||
|
||||
if not active:
|
||||
print("⏭ Skipping (inactive)")
|
||||
continue
|
||||
|
||||
try:
|
||||
# Parse RSS
|
||||
print("Fetching RSS feed...")
|
||||
feed = feedparser.parse(url)
|
||||
|
||||
if not feed.entries:
|
||||
print("❌ No entries found")
|
||||
continue
|
||||
|
||||
print(f"✓ Found {len(feed.entries)} entries\n")
|
||||
|
||||
# Test first 3 entries
|
||||
for i, entry in enumerate(feed.entries[:3], 1):
|
||||
print(f"\n--- Entry {i} ---")
|
||||
title = entry.get('title', 'No title')
|
||||
print(f"Title: {title[:70]}")
|
||||
|
||||
# Test URL extraction
|
||||
article_url = extract_article_url(entry)
|
||||
if article_url:
|
||||
print(f"✓ URL extracted: {article_url}")
|
||||
else:
|
||||
print(f"❌ Could not extract URL")
|
||||
print(f" Available fields: {list(entry.keys())[:10]}")
|
||||
print(f" link: {entry.get('link', 'N/A')}")
|
||||
print(f" guid: {entry.get('guid', 'N/A')}")
|
||||
|
||||
# Test summary
|
||||
summary = extract_article_summary(entry)
|
||||
if summary:
|
||||
print(f"✓ Summary: {summary[:80]}...")
|
||||
|
||||
# Test date
|
||||
pub_date = extract_published_date(entry)
|
||||
if pub_date:
|
||||
print(f"✓ Date: {pub_date}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("Test complete!")
|
||||
print("="*80)
|
||||