This commit is contained in:
2025-11-10 19:13:33 +01:00
commit ac5738c29d
64 changed files with 9445 additions and 0 deletions

40
.dockerignore Normal file
View File

@@ -0,0 +1,40 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
.venv
# Node
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Environment variables
.env
.env.local
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db
# Git
.git/
.gitignore
# Documentation
*.md
!README.md

187
.gitignore vendored Normal file
View File

@@ -0,0 +1,187 @@
# ===================================
# Python
# ===================================
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
# Virtual Environments
env/
venv/
ENV/
.venv
.virtualenv
backend/env/
backend/venv/
news_crawler/env/
news_crawler/venv/
news_sender/env/
news_sender/venv/
# Python Distribution / Packaging
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
*.manifest
*.spec
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# ===================================
# Node.js
# ===================================
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.npm
.yarn-integrity
package-lock.json
yarn.lock
.pnp
.pnp.js
# ===================================
# Environment Variables & Secrets
# ===================================
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
*.env
# ===================================
# Database
# ===================================
*.db
*.sqlite
*.sqlite3
*.db-journal
# MongoDB
data/
mongodb_data/
# ===================================
# IDE & Editors
# ===================================
# VSCode
.vscode/
.vscode-test/
*.code-workspace
# PyCharm / IntelliJ
.idea/
*.iml
*.iws
*.ipr
out/
# Sublime Text
*.sublime-project
*.sublime-workspace
# Vim
*.swp
*.swo
*~
.vim/
# Emacs
*~
\#*\#
.\#*
# ===================================
# OS Files
# ===================================
# macOS
.DS_Store
.AppleDouble
.LSOverride
._*
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Windows
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
*.stackdump
[Dd]esktop.ini
$RECYCLE.BIN/
*.cab
*.msi
*.msix
*.msm
*.msp
*.lnk
# Linux
.directory
.Trash-*
# ===================================
# Project Specific
# ===================================
# Generated files
newsletter_preview.html
*.log
# Temporary files
*.tmp
*.temp
*.bak
*.backup
# Docker volumes
mongodb_data/
ollama_data/
# Spec artifacts (optional - uncomment if you don't want to track specs)
# .kiro/specs/
# Test outputs
test-results/
coverage/

View File

@@ -0,0 +1,487 @@
# Design Document - AI Article Summarization
## Overview
This design integrates Ollama AI into the news crawler workflow to automatically generate concise summaries of articles. The system will extract full article content, send it to Ollama for summarization, and store both the original content and the AI-generated summary in MongoDB.
## Architecture
### High-Level Flow
```
RSS Feed → Extract Content → Summarize with Ollama → Store in MongoDB
↓ ↓ ↓
Full Article Text AI Summary (≤150 words) Both Stored
```
### Component Diagram
```
┌─────────────────────────────────────────────────────────────┐
│ News Crawler Service │
│ │
│ ┌────────────────┐ ┌──────────────────┐ │
│ │ RSS Parser │──────→│ Content Extractor│ │
│ └────────────────┘ └──────────────────┘ │
│ │ │
│ ↓ │
│ ┌──────────────────┐ │
│ │ Ollama Client │ │
│ │ (New Component) │ │
│ └──────────────────┘ │
│ │ │
│ ↓ │
│ ┌──────────────────┐ │
│ │ Database Writer │ │
│ └──────────────────┘ │
└─────────────────────────────────────────────────────────────┘
┌──────────────────┐
│ Ollama Server │
│ (External) │
└──────────────────┘
┌──────────────────┐
│ MongoDB │
└──────────────────┘
```
## Components and Interfaces
### 1. Ollama Client Module
**File:** `news_crawler/ollama_client.py`
**Purpose:** Handle communication with Ollama server for summarization
**Interface:**
```python
class OllamaClient:
def __init__(self, base_url, model, api_key=None, enabled=True):
"""Initialize Ollama client with configuration"""
def summarize_article(self, content: str, max_words: int = 150) -> dict:
"""
Summarize article content using Ollama
Args:
content: Full article text
max_words: Maximum words in summary (default 150)
Returns:
{
'summary': str, # AI-generated summary
'word_count': int, # Summary word count
'success': bool, # Whether summarization succeeded
'error': str or None, # Error message if failed
'duration': float # Time taken in seconds
}
"""
def is_available(self) -> bool:
"""Check if Ollama server is reachable"""
def test_connection(self) -> dict:
"""Test connection and return server info"""
```
**Key Methods:**
1. **summarize_article()**
- Constructs prompt for Ollama
- Sends HTTP POST request
- Handles timeouts and errors
- Validates response
- Returns structured result
2. **is_available()**
- Quick health check
- Returns True/False
- Used before attempting summarization
3. **test_connection()**
- Detailed connection test
- Returns server info and model list
- Used for diagnostics
### 2. Enhanced Crawler Service
**File:** `news_crawler/crawler_service.py`
**Changes:**
```python
# Add Ollama client initialization
from ollama_client import OllamaClient
# Initialize at module level
ollama_client = OllamaClient(
base_url=os.getenv('OLLAMA_BASE_URL'),
model=os.getenv('OLLAMA_MODEL'),
api_key=os.getenv('OLLAMA_API_KEY'),
enabled=os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
)
# Modify crawl_rss_feed() to include summarization
def crawl_rss_feed(feed_url, feed_name, max_articles=10):
# ... existing code ...
# After extracting content
article_data = extract_article_content(article_url)
# NEW: Summarize with Ollama
summary_result = None
if ollama_client.enabled and article_data.get('content'):
print(f" 🤖 Summarizing with AI...")
summary_result = ollama_client.summarize_article(
article_data['content'],
max_words=150
)
if summary_result['success']:
print(f" ✓ Summary generated ({summary_result['word_count']} words)")
else:
print(f" ⚠ Summarization failed: {summary_result['error']}")
# Build article document with summary
article_doc = {
'title': article_data.get('title'),
'author': article_data.get('author'),
'link': article_url,
'content': article_data.get('content'),
'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
'word_count': article_data.get('word_count'),
'summary_word_count': summary_result['word_count'] if summary_result and summary_result['success'] else None,
'source': feed_name,
'published_at': extract_published_date(entry),
'crawled_at': article_data.get('crawled_at'),
'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
'created_at': datetime.utcnow()
}
```
### 3. Configuration Module
**File:** `news_crawler/config.py` (new file)
**Purpose:** Centralize configuration management
```python
import os
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env')
class Config:
# MongoDB
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'
# Ollama
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))
# Crawler
RATE_LIMIT_DELAY = 1 # seconds between requests
MAX_CONTENT_LENGTH = 50000 # characters
```
## Data Models
### Updated Article Schema
```javascript
{
_id: ObjectId,
title: String,
author: String,
link: String, // Unique index
content: String, // Full article content
summary: String, // AI-generated summary (≤150 words)
word_count: Number, // Original content word count
summary_word_count: Number, // Summary word count
source: String,
published_at: String,
crawled_at: DateTime,
summarized_at: DateTime, // When AI summary was generated
created_at: DateTime
}
```
### Ollama Request Format
```json
{
"model": "phi3:latest",
"prompt": "Summarize the following article in 150 words or less. Focus on the key points and main message:\n\n[ARTICLE CONTENT]",
"stream": false,
"options": {
"temperature": 0.7,
"max_tokens": 200
}
}
```
### Ollama Response Format
```json
{
"model": "phi3:latest",
"created_at": "2024-11-10T16:30:00Z",
"response": "The AI-generated summary text here...",
"done": true,
"total_duration": 5000000000
}
```
## Error Handling
### Error Scenarios and Responses
| Scenario | Handling | User Impact |
|----------|----------|-------------|
| Ollama server down | Log warning, store original content | Article saved without summary |
| Ollama timeout (>30s) | Cancel request, store original | Article saved without summary |
| Empty summary returned | Log error, store original | Article saved without summary |
| Invalid response format | Log error, store original | Article saved without summary |
| Network error | Retry once, then store original | Article saved without summary |
| Model not found | Log error, disable Ollama | All articles saved without summaries |
### Error Logging Format
```python
{
'timestamp': datetime.utcnow(),
'article_url': article_url,
'error_type': 'timeout|connection|invalid_response|empty_summary',
'error_message': str(error),
'ollama_config': {
'base_url': OLLAMA_BASE_URL,
'model': OLLAMA_MODEL,
'enabled': OLLAMA_ENABLED
}
}
```
## Testing Strategy
### Unit Tests
1. **test_ollama_client.py**
- Test summarization with mock responses
- Test timeout handling
- Test error scenarios
- Test connection checking
2. **test_crawler_with_ollama.py**
- Test crawler with Ollama enabled
- Test crawler with Ollama disabled
- Test fallback when Ollama fails
- Test rate limiting
### Integration Tests
1. **test_end_to_end.py**
- Crawl real RSS feed
- Summarize with real Ollama
- Verify database storage
- Check all fields populated
### Manual Testing
1. Test with Ollama enabled and working
2. Test with Ollama disabled
3. Test with Ollama unreachable
4. Test with slow Ollama responses
5. Test with various article lengths
## Performance Considerations
### Timing Estimates
- Article extraction: 2-5 seconds
- Ollama summarization: 5-15 seconds (depends on article length and model)
- Database write: <1 second
- **Total per article: 8-21 seconds**
### Optimization Strategies
1. **Sequential Processing**
- Process one article at a time
- Prevents overwhelming Ollama
- Easier to debug
2. **Timeout Management**
- 30-second timeout per request
- Prevents hanging on slow responses
3. **Rate Limiting**
- 1-second delay between articles
- Respects server resources
4. **Future: Batch Processing**
- Queue articles for summarization
- Process in batches
- Use Celery for async processing
### Resource Usage
- **Memory**: ~100MB per crawler instance
- **Network**: ~1-5KB per article (to Ollama)
- **Storage**: +150 words per article (~1KB)
- **CPU**: Minimal (Ollama does the heavy lifting)
## Security Considerations
1. **API Key Storage**
- Store in environment variables
- Never commit to git
- Use secrets management in production
2. **Content Sanitization**
- Don't log full article content
- Sanitize URLs in logs
- Limit error message detail
3. **Network Security**
- Support HTTPS for Ollama
- Validate SSL certificates
- Use secure connections
4. **Rate Limiting**
- Prevent abuse of Ollama server
- Implement backoff on errors
- Monitor usage patterns
## Deployment Considerations
### Environment Variables
```bash
# Required
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_MODEL=phi3:latest
OLLAMA_ENABLED=true
# Optional
OLLAMA_API_KEY=your-api-key
OLLAMA_TIMEOUT=30
```
### Docker Deployment
```yaml
# docker-compose.yml
services:
crawler:
build: ./news_crawler
environment:
- OLLAMA_BASE_URL=http://ollama:11434
- OLLAMA_ENABLED=true
depends_on:
- ollama
- mongodb
ollama:
image: ollama/ollama:latest
ports:
- "11434:11434"
volumes:
- ollama_data:/root/.ollama
```
### Monitoring
1. **Metrics to Track**
- Summarization success rate
- Average summarization time
- Ollama server uptime
- Error frequency by type
2. **Logging**
- Log all summarization attempts
- Log errors with context
- Log performance metrics
3. **Alerts**
- Alert if Ollama is down >5 minutes
- Alert if success rate <80%
- Alert if average time >20 seconds
## Migration Plan
### Phase 1: Add Ollama Client (Week 1)
- Create ollama_client.py
- Add configuration
- Write unit tests
- Test with sample articles
### Phase 2: Integrate with Crawler (Week 1)
- Modify crawler_service.py
- Add summarization step
- Update database schema
- Test end-to-end
### Phase 3: Update Backend API (Week 2)
- Update news routes
- Add summary fields to responses
- Update frontend to display summaries
- Deploy to production
### Phase 4: Monitor and Optimize (Ongoing)
- Monitor performance
- Tune prompts for better summaries
- Optimize rate limiting
- Add batch processing if needed
## Rollback Plan
If issues arise:
1. **Immediate**: Set `OLLAMA_ENABLED=false`
2. **Short-term**: Revert crawler code changes
3. **Long-term**: Remove Ollama integration
System will continue to work with original content if Ollama is disabled.
## Success Metrics
- ✅ 95%+ of articles successfully summarized
- ✅ Average summarization time <15 seconds
- ✅ Zero data loss (all articles stored even if summarization fails)
- ✅ Ollama uptime >99%
- ✅ Summary quality: readable and accurate (manual review)
## Future Enhancements
1. **Multi-language Support**
- Detect article language
- Use appropriate model
- Translate summaries
2. **Custom Summary Lengths**
- Allow configuration per feed
- Support different lengths for different use cases
3. **Sentiment Analysis**
- Add sentiment score
- Categorize as positive/negative/neutral
4. **Keyword Extraction**
- Extract key topics
- Enable better search
5. **Batch Processing**
- Queue articles
- Process in parallel
- Use Celery for async
6. **Caching**
- Cache summaries
- Avoid re-processing
- Use Redis for cache

View File

@@ -0,0 +1,164 @@
# Requirements Document
## Introduction
This feature integrates Ollama AI into the news crawler to automatically summarize articles before storing them in the database. Instead of storing full article content, the system will generate concise 150-word summaries using AI, making the content more digestible for newsletter readers and reducing storage requirements.
## Glossary
- **Crawler Service**: The standalone microservice that fetches and processes article content from RSS feeds
- **Ollama Server**: The AI inference server that provides text summarization capabilities
- **Article Content**: The full text extracted from a news article webpage
- **Summary**: A concise AI-generated version of the article content (max 150 words)
- **MongoDB**: The database where articles and summaries are stored
## Requirements
### Requirement 1: Ollama Integration in Crawler
**User Story:** As a system administrator, I want the crawler to use Ollama for summarization, so that articles are automatically condensed before storage.
#### Acceptance Criteria
1. WHEN the crawler extracts article content, THE Crawler Service SHALL send the content to the Ollama Server for summarization
2. WHEN sending content to Ollama, THE Crawler Service SHALL include a prompt requesting a summary of 150 words or less
3. WHEN Ollama returns a summary, THE Crawler Service SHALL validate that the summary is not empty
4. IF the Ollama Server is unavailable, THEN THE Crawler Service SHALL store the original content without summarization and log a warning
5. WHEN summarization fails, THE Crawler Service SHALL continue processing other articles without stopping
### Requirement 2: Configuration Management
**User Story:** As a system administrator, I want to configure Ollama settings, so that I can control the summarization behavior.
#### Acceptance Criteria
1. THE Crawler Service SHALL read Ollama configuration from environment variables
2. THE Crawler Service SHALL support the following configuration options:
- OLLAMA_BASE_URL (server URL)
- OLLAMA_MODEL (model name)
- OLLAMA_ENABLED (enable/disable flag)
- OLLAMA_API_KEY (optional authentication)
3. WHERE OLLAMA_ENABLED is false, THE Crawler Service SHALL store original content without summarization
4. WHERE OLLAMA_ENABLED is true AND Ollama is unreachable, THE Crawler Service SHALL log an error and store original content
### Requirement 3: Summary Storage
**User Story:** As a developer, I want summaries stored in the database, so that the frontend can display concise article previews.
#### Acceptance Criteria
1. WHEN a summary is generated, THE Crawler Service SHALL store it in the `summary` field in MongoDB
2. WHEN storing an article, THE Crawler Service SHALL include both the original content and the AI summary
3. THE Crawler Service SHALL store the following fields:
- `content` (original full text)
- `summary` (AI-generated, max 150 words)
- `word_count` (original content word count)
- `summary_word_count` (summary word count)
- `summarized_at` (timestamp when summarized)
4. WHEN an article already has a summary, THE Crawler Service SHALL not re-summarize it
### Requirement 4: Error Handling and Resilience
**User Story:** As a system administrator, I want the crawler to handle AI failures gracefully, so that the system remains reliable.
#### Acceptance Criteria
1. IF Ollama returns an error, THEN THE Crawler Service SHALL log the error and store the original content
2. IF Ollama times out (>30 seconds), THEN THE Crawler Service SHALL cancel the request and store the original content
3. IF the summary is empty or invalid, THEN THE Crawler Service SHALL store the original content
4. WHEN an error occurs, THE Crawler Service SHALL include an error indicator in the database record
5. THE Crawler Service SHALL continue processing remaining articles after any summarization failure
### Requirement 5: Performance and Rate Limiting
**User Story:** As a system administrator, I want the crawler to respect rate limits, so that it doesn't overwhelm the Ollama server.
#### Acceptance Criteria
1. THE Crawler Service SHALL wait at least 1 second between Ollama API calls
2. THE Crawler Service SHALL set a timeout of 30 seconds for each Ollama request
3. WHEN processing multiple articles, THE Crawler Service SHALL process them sequentially to avoid overloading Ollama
4. THE Crawler Service SHALL log the time taken for each summarization
5. THE Crawler Service SHALL display progress indicators showing summarization status
### Requirement 6: Monitoring and Logging
**User Story:** As a system administrator, I want detailed logs of summarization activity, so that I can monitor and troubleshoot the system.
#### Acceptance Criteria
1. THE Crawler Service SHALL log when summarization starts for each article
2. THE Crawler Service SHALL log the original word count and summary word count
3. THE Crawler Service SHALL log any errors or warnings from Ollama
4. THE Crawler Service SHALL display a summary of total articles summarized at the end
5. THE Crawler Service SHALL include summarization statistics in the final report
### Requirement 7: API Endpoint Updates
**User Story:** As a frontend developer, I want API endpoints to return summaries, so that I can display them to users.
#### Acceptance Criteria
1. WHEN fetching articles via GET /api/news, THE Backend API SHALL include the `summary` field if available
2. WHEN fetching a single article via GET /api/news/<url>, THE Backend API SHALL include both `content` and `summary`
3. THE Backend API SHALL include a `has_summary` boolean field indicating if AI summarization was performed
4. THE Backend API SHALL include `summarized_at` timestamp if available
5. WHERE no summary exists, THE Backend API SHALL return a preview of the original content (first 200 chars)
### Requirement 8: Backward Compatibility
**User Story:** As a developer, I want the system to work with existing articles, so that no data migration is required.
#### Acceptance Criteria
1. THE Crawler Service SHALL work with articles that don't have summaries
2. THE Backend API SHALL handle articles with or without summaries gracefully
3. WHERE an article has no summary, THE Backend API SHALL generate a preview from the content field
4. THE Crawler Service SHALL not re-process articles that already have summaries
5. THE system SHALL continue to function if Ollama is disabled or unavailable
## Non-Functional Requirements
### Performance
- Summarization SHALL complete within 30 seconds per article
- The crawler SHALL process at least 10 articles per minute (including summarization)
- Database operations SHALL not be significantly slower with summary storage
### Reliability
- The system SHALL maintain 99% uptime even if Ollama is unavailable
- Failed summarizations SHALL not prevent article storage
- The crawler SHALL recover from Ollama errors without manual intervention
### Security
- Ollama API keys SHALL be stored in environment variables, not in code
- Article content SHALL not be logged to prevent sensitive data exposure
- API communication with Ollama SHALL support HTTPS
### Scalability
- The system SHALL support multiple Ollama servers for load balancing (future)
- The crawler SHALL handle articles of any length (up to 50,000 words)
- The database schema SHALL support future enhancements (tags, categories, etc.)
## Dependencies
- Ollama server must be running and accessible
- `requests` Python library for HTTP communication
- Environment variables properly configured
- MongoDB with sufficient storage for both content and summaries
## Assumptions
- Ollama server is already set up and configured
- The phi3:latest model (or configured model) supports summarization tasks
- Network connectivity between crawler and Ollama server is reliable
- Articles are in English or the configured Ollama model supports the article language
## Future Enhancements
- Support for multiple languages
- Customizable summary length
- Sentiment analysis integration
- Keyword extraction
- Category classification
- Batch summarization for improved performance
- Caching of summaries to avoid re-processing

View File

@@ -0,0 +1,92 @@
# Implementation Plan
- [x] 1. Create Ollama client module
- Create `news_crawler/ollama_client.py` with OllamaClient class
- Implement `summarize_article()` method with prompt construction and API call
- Implement `is_available()` method for health checks
- Implement `test_connection()` method for diagnostics
- Add timeout handling (30 seconds)
- Add error handling for connection, timeout, and invalid responses
- _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 4.1, 4.2, 4.3, 5.2_
- [x] 2. Create configuration module for crawler
- Create `news_crawler/config.py` with Config class
- Load environment variables (OLLAMA_BASE_URL, OLLAMA_MODEL, OLLAMA_ENABLED, OLLAMA_API_KEY, OLLAMA_TIMEOUT)
- Add validation for required configuration
- Add default values for optional configuration
- _Requirements: 2.1, 2.2, 2.3, 2.4_
- [x] 3. Integrate Ollama client into crawler service
- Import OllamaClient in `news_crawler/crawler_service.py`
- Initialize Ollama client at module level using Config
- Modify `crawl_rss_feed()` to call summarization after content extraction
- Add conditional logic to skip summarization if OLLAMA_ENABLED is false
- Add error handling to continue processing if summarization fails
- Add logging for summarization start, success, and failure
- Add rate limiting delay after summarization
- _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 2.3, 2.4, 4.1, 4.5, 5.1, 5.3, 6.1, 6.2, 6.3_
- [x] 4. Update database schema and storage
- Modify article document structure in `crawl_rss_feed()` to include:
- `summary` field (AI-generated summary)
- `summary_word_count` field
- `summarized_at` field (timestamp)
- Update MongoDB upsert logic to handle new fields
- Add check to skip re-summarization if article already has summary
- _Requirements: 3.1, 3.2, 3.3, 3.4, 8.4_
- [x] 5. Update backend API to return summaries
- Modify `backend/routes/news_routes.py` GET /api/news endpoint
- Add `summary`, `summary_word_count`, `summarized_at` fields to response
- Add `has_summary` boolean field to indicate if AI summarization was performed
- Modify GET /api/news/<url> endpoint to include summary fields
- Add fallback to content preview if no summary exists
- _Requirements: 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3_
- [x] 6. Update database schema documentation
- Update `backend/DATABASE_SCHEMA.md` with new summary fields
- Add example document showing summary fields
- Document the summarization workflow
- _Requirements: 3.1, 3.2, 3.3_
- [x] 7. Add environment variable configuration
- Update `backend/env.template` with Ollama configuration
- Add comments explaining each Ollama setting
- Document default values
- _Requirements: 2.1, 2.2_
- [x] 8. Create test script for Ollama integration
- Create `news_crawler/test_ollama.py` to test Ollama connection
- Test summarization with sample article
- Test error handling (timeout, connection failure)
- Display configuration and connection status
- _Requirements: 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 4.1, 4.2_
- [x] 9. Update crawler statistics and logging
- Add summarization statistics to final report in `crawl_all_feeds()`
- Track total articles summarized vs failed
- Log average summarization time
- Display progress indicators during summarization
- _Requirements: 5.4, 6.1, 6.2, 6.3, 6.4, 6.5_
- [x] 10. Create documentation for AI summarization
- Create `news_crawler/AI_SUMMARIZATION.md` explaining the feature
- Document configuration options
- Provide troubleshooting guide
- Add examples of usage
- _Requirements: 2.1, 2.2, 2.3, 2.4, 6.1, 6.2, 6.3_
- [x] 11. Update main README with AI summarization info
- Add section about AI summarization feature
- Document Ollama setup requirements
- Add configuration examples
- Update API endpoint documentation
- _Requirements: 2.1, 2.2, 7.1, 7.2_
- [x] 12. Test end-to-end workflow
- Run crawler with Ollama enabled
- Verify articles are summarized correctly
- Check database contains all expected fields
- Test API endpoints return summaries
- Verify error handling when Ollama is disabled/unavailable
- _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5, 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3, 8.4, 8.5_

209
ARCHITECTURE.md Normal file
View File

@@ -0,0 +1,209 @@
# Munich News Daily - Architecture
## System Overview
```
┌─────────────────────────────────────────────────────────────┐
│ Users / Browsers │
└────────────────────────┬────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ Frontend (Port 3000) │
│ Node.js + Express + Vanilla JS │
│ - Subscription form │
│ - News display │
│ - RSS feed management UI (future) │
└────────────────────────┬────────────────────────────────────┘
│ HTTP/REST
┌─────────────────────────────────────────────────────────────┐
│ Backend API (Port 5001) │
│ Flask + Python │
│ │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ Routes (Blueprints) │ │
│ │ - subscription_routes.py (subscribe/unsubscribe) │ │
│ │ - news_routes.py (get news, stats) │ │
│ │ - rss_routes.py (manage RSS feeds) │ │
│ │ - ollama_routes.py (AI features) │ │
│ └──────────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ Services (Business Logic) │ │
│ │ - news_service.py (fetch & save articles) │ │
│ │ - email_service.py (send newsletters) │ │
│ │ - ollama_service.py (AI integration) │ │
│ └──────────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ Core │ │
│ │ - config.py (configuration) │ │
│ │ - database.py (DB connection) │ │
│ └──────────────────────────────────────────────────────┘ │
└────────────────────────┬────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ MongoDB (Port 27017) │
│ │
│ Collections: │
│ - articles (news articles with full content) │
│ - subscribers (email subscribers) │
│ - rss_feeds (RSS feed sources) │
└─────────────────────────┬───────────────────────────────────┘
│ Read/Write
┌─────────────────────────┴───────────────────────────────────┐
│ News Crawler Microservice │
│ (Standalone) │
│ │
│ - Fetches RSS feeds from MongoDB │
│ - Crawls full article content │
│ - Extracts text, metadata, word count │
│ - Stores back to MongoDB │
│ - Can run independently or scheduled │
└──────────────────────────────────────────────────────────────┘
│ (Optional)
┌─────────────────────────────────────────────────────────────┐
│ Ollama AI Server (Port 11434) │
│ (Optional, External) │
│ │
│ - Article summarization │
│ - Content analysis │
│ - AI-powered features │
└──────────────────────────────────────────────────────────────┘
```
## Component Details
### Frontend (Port 3000)
- **Technology**: Node.js, Express, Vanilla JavaScript
- **Responsibilities**:
- User interface
- Subscription management
- News display
- API proxy to backend
- **Communication**: HTTP REST to Backend
### Backend API (Port 5001)
- **Technology**: Python, Flask
- **Architecture**: Modular with Blueprints
- **Responsibilities**:
- REST API endpoints
- Business logic
- Database operations
- Email sending
- AI integration
- **Communication**:
- HTTP REST from Frontend
- MongoDB driver to Database
- HTTP to Ollama (optional)
### MongoDB (Port 27017)
- **Technology**: MongoDB 7.0
- **Responsibilities**:
- Persistent data storage
- Articles, subscribers, RSS feeds
- **Communication**: MongoDB protocol
### News Crawler (Standalone)
- **Technology**: Python, BeautifulSoup
- **Architecture**: Microservice (can run independently)
- **Responsibilities**:
- Fetch RSS feeds
- Crawl article content
- Extract and clean text
- Store in database
- **Communication**: MongoDB driver to Database
- **Execution**:
- Manual: `python crawler_service.py`
- Scheduled: Cron, systemd, Docker
- On-demand: Via backend API (future)
### Ollama AI Server (Optional, External)
- **Technology**: Ollama
- **Responsibilities**:
- AI model inference
- Text summarization
- Content analysis
- **Communication**: HTTP REST API
## Data Flow
### 1. News Aggregation Flow
```
RSS Feeds → Backend (news_service) → MongoDB (articles)
```
### 2. Content Crawling Flow
```
MongoDB (rss_feeds) → Crawler → Article URLs →
Web Scraping → MongoDB (articles with full_content)
```
### 3. Subscription Flow
```
User → Frontend → Backend (subscription_routes) →
MongoDB (subscribers)
```
### 4. Newsletter Flow (Future)
```
Scheduler → Backend (email_service) →
MongoDB (articles + subscribers) → SMTP → Users
```
### 5. AI Processing Flow (Optional)
```
MongoDB (articles) → Backend (ollama_service) →
Ollama Server → AI Summary → MongoDB (articles)
```
## Deployment Options
### Development
- All services run locally
- MongoDB via Docker Compose
- Manual crawler execution
### Production
- Backend: Cloud VM, Container, or PaaS
- Frontend: Static hosting or same server
- MongoDB: MongoDB Atlas or self-hosted
- Crawler: Scheduled job (cron, systemd timer)
- Ollama: Separate GPU server (optional)
## Scalability Considerations
### Current Architecture
- Monolithic backend (single Flask instance)
- Standalone crawler (can run multiple instances)
- Shared MongoDB
### Future Improvements
- Load balancer for backend
- Message queue for crawler jobs (Celery + Redis)
- Caching layer (Redis)
- CDN for frontend
- Read replicas for MongoDB
## Security
- CORS enabled for frontend-backend communication
- MongoDB authentication (production)
- Environment variables for secrets
- Input validation on all endpoints
- Rate limiting (future)
## Monitoring (Future)
- Application logs
- MongoDB metrics
- Crawler success/failure tracking
- API response times
- Error tracking (Sentry)

136
CHANGELOG.md Normal file
View File

@@ -0,0 +1,136 @@
# Changelog
## [Unreleased] - 2024-11-10
### Added - Major Refactoring
#### Backend Modularization
- ✅ Restructured backend into modular architecture
- ✅ Created separate route blueprints:
- `subscription_routes.py` - User subscriptions
- `news_routes.py` - News fetching and stats
- `rss_routes.py` - RSS feed management (CRUD)
- `ollama_routes.py` - AI integration
- ✅ Created service layer:
- `news_service.py` - News fetching logic
- `email_service.py` - Newsletter sending
- `ollama_service.py` - AI communication
- ✅ Centralized configuration in `config.py`
- ✅ Separated database logic in `database.py`
- ✅ Reduced main `app.py` from 700+ lines to 27 lines
#### RSS Feed Management
- ✅ Dynamic RSS feed management via API
- ✅ Add/remove/list/toggle RSS feeds without code changes
- ✅ Unique index on RSS feed URLs (prevents duplicates)
- ✅ Default feeds auto-initialized on first run
- ✅ Created `fix_duplicates.py` utility script
#### News Crawler Microservice
- ✅ Created standalone `news_crawler/` microservice
- ✅ Web scraping with BeautifulSoup
- ✅ Smart content extraction using multiple selectors
- ✅ Full article content storage in MongoDB
- ✅ Word count calculation
- ✅ Duplicate prevention (skips already-crawled articles)
- ✅ Rate limiting (1 second between requests)
- ✅ Can run independently or scheduled
- ✅ Docker support for crawler
- ✅ Comprehensive documentation
#### API Endpoints
New endpoints added:
- `GET /api/rss-feeds` - List all RSS feeds
- `POST /api/rss-feeds` - Add new RSS feed
- `DELETE /api/rss-feeds/<id>` - Remove RSS feed
- `PATCH /api/rss-feeds/<id>/toggle` - Toggle feed active status
#### Documentation
- ✅ Created `ARCHITECTURE.md` - System architecture overview
- ✅ Created `backend/STRUCTURE.md` - Backend structure guide
- ✅ Created `news_crawler/README.md` - Crawler documentation
- ✅ Created `news_crawler/QUICKSTART.md` - Quick start guide
- ✅ Created `news_crawler/test_crawler.py` - Test suite
- ✅ Updated main `README.md` with new features
- ✅ Updated `DATABASE_SCHEMA.md` with new fields
#### Configuration
- ✅ Added `FLASK_PORT` environment variable
- ✅ Fixed `OLLAMA_MODEL` typo in `.env`
- ✅ Port 5001 default to avoid macOS AirPlay conflict
### Changed
- Backend structure: Monolithic → Modular
- RSS feeds: Hardcoded → Database-driven
- Article storage: Summary only → Full content support
- Configuration: Scattered → Centralized
### Technical Improvements
- Separation of concerns (routes vs services)
- Better testability
- Easier maintenance
- Scalable architecture
- Independent microservices
- Proper error handling
- Comprehensive logging
### Database Schema Updates
Articles collection now includes:
- `full_content` - Full article text
- `word_count` - Number of words
- `crawled_at` - When content was crawled
RSS Feeds collection added:
- `name` - Feed name
- `url` - Feed URL (unique)
- `active` - Active status
- `created_at` - Creation timestamp
### Files Added
```
backend/
├── config.py
├── database.py
├── fix_duplicates.py
├── STRUCTURE.md
├── routes/
│ ├── __init__.py
│ ├── subscription_routes.py
│ ├── news_routes.py
│ ├── rss_routes.py
│ └── ollama_routes.py
└── services/
├── __init__.py
├── news_service.py
├── email_service.py
└── ollama_service.py
news_crawler/
├── crawler_service.py
├── test_crawler.py
├── requirements.txt
├── .gitignore
├── Dockerfile
├── docker-compose.yml
├── README.md
└── QUICKSTART.md
Root:
├── ARCHITECTURE.md
└── CHANGELOG.md
```
### Files Removed
- Old monolithic `backend/app.py` (replaced with modular version)
### Next Steps (Future Enhancements)
- [ ] Frontend UI for RSS feed management
- [ ] Automatic article summarization with Ollama
- [ ] Scheduled newsletter sending
- [ ] Article categorization and tagging
- [ ] Search functionality
- [ ] User preferences (categories, frequency)
- [ ] Analytics dashboard
- [ ] API rate limiting
- [ ] Caching layer (Redis)
- [ ] Message queue for crawler (Celery)

206
QUICK_REFERENCE.md Normal file
View File

@@ -0,0 +1,206 @@
# Quick Reference Guide
## Starting the Application
### 1. Start MongoDB
```bash
docker-compose up -d
```
### 2. Start Backend (Port 5001)
```bash
cd backend
source venv/bin/activate # or: venv\Scripts\activate on Windows
python app.py
```
### 3. Start Frontend (Port 3000)
```bash
cd frontend
npm start
```
### 4. Run Crawler (Optional)
```bash
cd news_crawler
pip install -r requirements.txt
python crawler_service.py 10
```
## Common Commands
### RSS Feed Management
**List all feeds:**
```bash
curl http://localhost:5001/api/rss-feeds
```
**Add a feed:**
```bash
curl -X POST http://localhost:5001/api/rss-feeds \
-H "Content-Type: application/json" \
-d '{"name": "Feed Name", "url": "https://example.com/rss"}'
```
**Remove a feed:**
```bash
curl -X DELETE http://localhost:5001/api/rss-feeds/<feed_id>
```
**Toggle feed status:**
```bash
curl -X PATCH http://localhost:5001/api/rss-feeds/<feed_id>/toggle
```
### News & Subscriptions
**Get latest news:**
```bash
curl http://localhost:5001/api/news
```
**Subscribe:**
```bash
curl -X POST http://localhost:5001/api/subscribe \
-H "Content-Type: application/json" \
-d '{"email": "user@example.com"}'
```
**Get stats:**
```bash
curl http://localhost:5001/api/stats
```
### Ollama (AI)
**Test connection:**
```bash
curl http://localhost:5001/api/ollama/ping
```
**List models:**
```bash
curl http://localhost:5001/api/ollama/models
```
### Database
**Connect to MongoDB:**
```bash
mongosh
use munich_news
```
**Check articles:**
```javascript
db.articles.find().limit(5)
db.articles.countDocuments()
db.articles.countDocuments({full_content: {$exists: true}})
```
**Check subscribers:**
```javascript
db.subscribers.find()
db.subscribers.countDocuments({status: "active"})
```
**Check RSS feeds:**
```javascript
db.rss_feeds.find()
```
## File Locations
### Configuration
- Backend: `backend/.env`
- Frontend: `frontend/package.json`
- Crawler: Uses backend's `.env` or own `.env`
### Logs
- Backend: Terminal output
- Frontend: Terminal output
- Crawler: Terminal output
### Database
- MongoDB data: Docker volume `mongodb_data`
- Database name: `munich_news`
## Ports
| Service | Port | URL |
|---------|------|-----|
| Frontend | 3000 | http://localhost:3000 |
| Backend | 5001 | http://localhost:5001 |
| MongoDB | 27017 | mongodb://localhost:27017 |
| Ollama | 11434 | http://localhost:11434 |
## Troubleshooting
### Backend won't start
- Check if port 5001 is available
- Verify MongoDB is running
- Check `.env` file exists
### Frontend can't connect
- Verify backend is running on port 5001
- Check CORS settings
- Check API_URL in frontend
### Crawler fails
- Install dependencies: `pip install -r requirements.txt`
- Check MongoDB connection
- Verify RSS feeds exist in database
### MongoDB connection error
- Start MongoDB: `docker-compose up -d`
- Check connection string in `.env`
- Verify port 27017 is not blocked
### Port 5000 conflict (macOS)
- AirPlay uses port 5000
- Use port 5001 instead (set in `.env`)
- Or disable AirPlay Receiver in System Preferences
## Project Structure
```
munich-news/
├── backend/ # Main API (Flask)
├── frontend/ # Web UI (Express + JS)
├── news_crawler/ # Crawler microservice
├── .env # Environment variables
└── docker-compose.yml # MongoDB setup
```
## Environment Variables
### Backend (.env)
```env
MONGODB_URI=mongodb://localhost:27017/
FLASK_PORT=5001
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=587
EMAIL_USER=your-email@gmail.com
EMAIL_PASSWORD=your-app-password
OLLAMA_BASE_URL=http://127.0.0.1:11434
OLLAMA_MODEL=phi3:latest
OLLAMA_ENABLED=true
```
## Development Workflow
1. **Add RSS Feed** → Backend API
2. **Run Crawler** → Fetches full content
3. **View News** → Frontend displays articles
4. **Users Subscribe** → Via frontend form
5. **Send Newsletter** → Manual or scheduled
## Useful Links
- Frontend: http://localhost:3000
- Backend API: http://localhost:5001
- MongoDB: mongodb://localhost:27017
- Architecture: See `ARCHITECTURE.md`
- Backend Structure: See `backend/STRUCTURE.md`
- Crawler Guide: See `news_crawler/README.md`

327
README.md Normal file
View File

@@ -0,0 +1,327 @@
# Munich News Daily 📰
A TLDR/Morning Brew-style news email platform specifically for Munich. Get the latest Munich news delivered to your inbox every morning.
## Features
- 📧 Email newsletter subscription system
- 📰 Aggregated news from multiple Munich news sources
- 🎨 Beautiful, modern web interface
- 📊 Subscription statistics
- 🔄 Real-time news updates
## Tech Stack
- **Backend**: Python (Flask) - Modular architecture with blueprints
- **Frontend**: Node.js (Express + Vanilla JavaScript)
- **Database**: MongoDB
- **News Crawler**: Standalone Python microservice
- **News Sources**: RSS feeds from major Munich news outlets
## Setup Instructions
### Prerequisites
- Python 3.8+
- Node.js 14+
- npm or yarn
- Docker and Docker Compose (recommended for MongoDB) OR MongoDB (local installation or MongoDB Atlas account)
### Backend Setup
1. Navigate to the backend directory:
```bash
cd backend
```
2. Create a virtual environment (recommended):
```bash
python3 -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
3. Install dependencies:
```bash
pip install -r requirements.txt
```
4. Set up MongoDB using Docker Compose (recommended):
```bash
# From the project root directory
docker-compose up -d
```
This will start MongoDB in a Docker container. The database will be available at `mongodb://localhost:27017/`
**Useful Docker commands:**
```bash
# Start MongoDB
docker-compose up -d
# Stop MongoDB
docker-compose down
# View MongoDB logs
docker-compose logs -f mongodb
# Restart MongoDB
docker-compose restart mongodb
# Remove MongoDB and all data (WARNING: deletes all data)
docker-compose down -v
```
**Alternative options:**
- **Local MongoDB**: Install MongoDB locally and make sure it's running
- **MongoDB Atlas** (Cloud): Create a free account at [mongodb.com/cloud/atlas](https://www.mongodb.com/cloud/atlas) and get your connection string
5. Create a `.env` file in the backend directory:
```bash
# Copy the template file
cp env.template .env
```
Then edit `.env` with your configuration:
```env
# MongoDB connection (default: mongodb://localhost:27017/)
# For Docker Compose (no authentication):
MONGODB_URI=mongodb://localhost:27017/
# For Docker Compose with authentication (if you modify docker-compose.yml):
# MONGODB_URI=mongodb://admin:password@localhost:27017/
# Or for MongoDB Atlas:
# MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/
# Email configuration (optional for testing)
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=587
EMAIL_USER=your-email@gmail.com
EMAIL_PASSWORD=your-app-password
# Ollama Configuration (for AI-powered features)
# Remote Ollama server URL
OLLAMA_BASE_URL=http://your-remote-server-ip:11434
# Optional: API key if your Ollama server requires authentication
# OLLAMA_API_KEY=your-api-key-here
# Model name to use (e.g., llama2, mistral, codellama, llama3)
OLLAMA_MODEL=llama2
# Enable/disable Ollama features (true/false)
OLLAMA_ENABLED=false
```
**Notes:**
- For Gmail, you'll need to use an [App Password](https://support.google.com/accounts/answer/185833) instead of your regular password.
- For Ollama, replace `your-remote-server-ip` with your actual server IP or domain. Set `OLLAMA_ENABLED=true` to enable AI features.
6. Run the backend server:
```bash
python app.py
```
The backend will run on `http://localhost:5001` (port 5001 to avoid conflict with AirPlay on macOS)
### Frontend Setup
1. Navigate to the frontend directory:
```bash
cd frontend
```
2. Install dependencies:
```bash
npm install
```
3. Run the frontend server:
```bash
npm start
```
The frontend will run on `http://localhost:3000`
## Usage
1. Open your browser and go to `http://localhost:3000`
2. Enter your email address to subscribe to the newsletter
3. View the latest Munich news on the homepage
4. The backend will aggregate news from multiple Munich news sources
## Sending Newsletters
To send newsletters to all subscribers, you can add a scheduled task or manually trigger the `send_newsletter()` function in `app.py`. For production, consider using:
- **Cron jobs** (Linux/Mac)
- **Task Scheduler** (Windows)
- **Celery** with Redis/RabbitMQ for more advanced scheduling
- **Cloud functions** (AWS Lambda, Google Cloud Functions)
Example cron job to send daily at 8 AM:
```
0 8 * * * cd /path/to/munich-news/backend && python -c "from app import send_newsletter; send_newsletter()"
```
## Project Structure
```
munich-news/
├── backend/ # Main API server
│ ├── app.py # Flask application entry point
│ ├── config.py # Configuration management
│ ├── database.py # Database connection
│ ├── routes/ # API endpoints (blueprints)
│ ├── services/ # Business logic
│ ├── templates/ # Email templates
│ └── requirements.txt # Python dependencies
├── news_crawler/ # Crawler microservice
│ ├── crawler_service.py # Standalone crawler
│ ├── ollama_client.py # AI summarization client
│ ├── requirements.txt # Crawler dependencies
│ └── README.md # Crawler documentation
├── news_sender/ # Newsletter sender microservice
│ ├── sender_service.py # Standalone email sender
│ ├── newsletter_template.html # Email template
│ ├── requirements.txt # Sender dependencies
│ └── README.md # Sender documentation
├── frontend/ # Web interface
│ ├── server.js # Express server
│ ├── package.json # Node.js dependencies
│ └── public/
│ ├── index.html # Main page
│ ├── styles.css # Styling
│ └── app.js # Frontend JavaScript
├── docker-compose.yml # Docker Compose for MongoDB (development)
├── docker-compose.prod.yml # Docker Compose with authentication (production)
└── README.md
```
## API Endpoints
### `POST /api/subscribe`
Subscribe to the newsletter
- Body: `{ "email": "user@example.com" }`
### `POST /api/unsubscribe`
Unsubscribe from the newsletter
- Body: `{ "email": "user@example.com" }`
### `GET /api/news`
Get latest Munich news articles
### `GET /api/stats`
Get subscription statistics
- Returns: `{ "subscribers": number, "articles": number, "crawled_articles": number }`
### `GET /api/news/<article_url>`
Get full article content by URL
- Returns: Full article with content, author, word count, etc.
### `GET /api/ollama/ping`
Test connection to Ollama server
- Returns: Connection status and Ollama configuration
- Response examples:
- Success: `{ "status": "success", "message": "...", "response": "...", "ollama_config": {...} }`
- Disabled: `{ "status": "disabled", "message": "...", "ollama_config": {...} }`
- Error: `{ "status": "error", "message": "...", "error_details": "...", "troubleshooting": {...}, "ollama_config": {...} }`
### `GET /api/ollama/models`
List available models on Ollama server
- Returns: List of available models and current configuration
- Response: `{ "status": "success", "models": [...], "current_model": "...", "ollama_config": {...} }`
### `GET /api/rss-feeds`
Get all RSS feeds
- Returns: `{ "feeds": [...] }`
### `POST /api/rss-feeds`
Add a new RSS feed
- Body: `{ "name": "Feed Name", "url": "https://example.com/rss" }`
- Returns: `{ "message": "...", "id": "..." }`
### `DELETE /api/rss-feeds/<feed_id>`
Remove an RSS feed
- Returns: `{ "message": "..." }`
### `PATCH /api/rss-feeds/<feed_id>/toggle`
Toggle RSS feed active status
- Returns: `{ "message": "...", "active": boolean }`
## Database Schema
### Articles Collection
```javascript
{
_id: ObjectId,
title: String,
link: String (unique),
summary: String,
source: String,
published_at: String,
created_at: DateTime
}
```
### Subscribers Collection
```javascript
{
_id: ObjectId,
email: String (unique, lowercase),
subscribed_at: DateTime,
status: String ('active' | 'inactive')
}
```
**Indexes:**
- `articles.link` - Unique index to prevent duplicate articles
- `articles.created_at` - For efficient sorting
- `subscribers.email` - Unique index for email lookups
- `subscribers.subscribed_at` - For analytics
## News Crawler Microservice
The project includes a standalone crawler microservice that fetches full article content from RSS feeds.
### Running the Crawler
```bash
cd news_crawler
# Install dependencies
pip install -r requirements.txt
# Run crawler
python crawler_service.py 10
```
See `news_crawler/README.md` for detailed documentation.
### What It Does
- Crawls full article content from RSS feed links
- Extracts text, word count, and metadata
- Stores in MongoDB for AI processing
- Skips already-crawled articles
- Rate-limited (1 second between requests)
## Customization
### Adding News Sources
Use the API to add RSS feeds dynamically:
```bash
curl -X POST http://localhost:5001/api/rss-feeds \
-H "Content-Type: application/json" \
-d '{"name": "Your Source Name", "url": "https://example.com/rss"}'
```
### Styling
Modify `frontend/public/styles.css` to customize the appearance.
## License
MIT
## Contributing
Feel free to submit issues and enhancement requests!

132
TEST_INSTRUCTIONS.md Normal file
View File

@@ -0,0 +1,132 @@
# Testing RSS Feed URL Extraction
## Quick Test (Recommended)
Run this from the project root with backend virtual environment activated:
```bash
# 1. Activate backend virtual environment
cd backend
source venv/bin/activate # On Windows: venv\Scripts\activate
# 2. Go back to project root
cd ..
# 3. Run the test
python test_feeds_quick.py
```
This will:
- ✓ Check what RSS feeds are in your database
- ✓ Fetch each feed
- ✓ Test URL extraction on first 3 articles
- ✓ Show what fields are available
- ✓ Verify summary and date extraction
## Expected Output
```
================================================================================
RSS Feed Test - Checking Database Feeds
================================================================================
✓ Found 3 feed(s) in database
================================================================================
Feed: Süddeutsche Zeitung München
URL: https://www.sueddeutsche.de/muenchen/rss
Active: True
================================================================================
Fetching RSS feed...
✓ Found 20 entries
--- Entry 1 ---
Title: New U-Bahn Line Opens in Munich
✓ URL extracted: https://www.sueddeutsche.de/muenchen/article-123
✓ Summary: The new U-Bahn line connecting the city center...
✓ Date: Mon, 10 Nov 2024 10:00:00 +0100
--- Entry 2 ---
Title: Munich Weather Update
✓ URL extracted: https://www.sueddeutsche.de/muenchen/article-124
✓ Summary: Weather forecast for the week...
✓ Date: Mon, 10 Nov 2024 09:30:00 +0100
...
```
## If No Feeds Found
Add a feed first:
```bash
curl -X POST http://localhost:5001/api/rss-feeds \
-H "Content-Type: application/json" \
-d '{"name": "Süddeutsche Politik", "url": "https://rss.sueddeutsche.de/rss/Politik"}'
```
## Testing News Crawler
Once feeds are verified, test the crawler:
```bash
# 1. Install crawler dependencies
cd news_crawler
pip install -r requirements.txt
# 2. Run the test
python test_rss_feeds.py
# 3. Or run the actual crawler
python crawler_service.py 5
```
## Troubleshooting
### "No module named 'pymongo'"
- Activate the backend virtual environment first
- Or install dependencies: `pip install -r backend/requirements.txt`
### "No RSS feeds in database"
- Make sure backend is running
- Add feeds via API (see above)
- Or check if MongoDB is running: `docker-compose ps`
### "Could not extract URL"
- The test will show available fields
- Check if the feed uses `guid`, `id`, or `links` instead of `link`
- Our utility should handle most cases automatically
### "No entries found"
- The RSS feed URL might be invalid
- Try opening the URL in a browser
- Check if it returns valid XML
## Manual Database Check
Using mongosh:
```bash
mongosh
use munich_news
db.rss_feeds.find()
db.articles.find().limit(3)
```
## What to Look For
**Good signs:**
- URLs are extracted successfully
- URLs start with `http://` or `https://`
- Summaries are present
- Dates are extracted
⚠️ **Warning signs:**
- "Could not extract URL" messages
- Empty summaries (not critical)
- Missing dates (not critical)
**Problems:**
- No entries found in feed
- All URL extractions fail
- Feed parsing errors

143
backend/DATABASE_SCHEMA.md Normal file
View File

@@ -0,0 +1,143 @@
# MongoDB Database Schema
This document describes the MongoDB collections and their structure for Munich News Daily.
## Collections
### 1. Articles Collection (`articles`)
Stores all news articles aggregated from Munich news sources.
**Document Structure:**
```javascript
{
_id: ObjectId, // Auto-generated MongoDB ID
title: String, // Article title (required)
author: String, // Article author (optional, extracted during crawl)
link: String, // Article URL (required, unique)
content: String, // Full article content (no length limit)
summary: String, // AI-generated English summary (≤150 words)
word_count: Number, // Word count of full content
summary_word_count: Number, // Word count of AI summary
source: String, // News source name (e.g., "Süddeutsche Zeitung München")
published_at: String, // Original publication date from RSS feed or crawled
crawled_at: DateTime, // When article content was crawled (UTC)
summarized_at: DateTime, // When AI summary was generated (UTC)
created_at: DateTime // When article was added to database (UTC)
}
```
**Indexes:**
- `link` - Unique index to prevent duplicate articles
- `created_at` - Index for efficient sorting by date
**Example Document:**
```javascript
{
_id: ObjectId("507f1f77bcf86cd799439011"),
title: "New U-Bahn Line Opens in Munich",
author: "Max Mustermann",
link: "https://www.sueddeutsche.de/muenchen/ubahn-1.123456",
content: "The new U-Bahn line connecting the city center with the airport opened today. Mayor Dieter Reiter attended the opening ceremony... [full article text continues]",
summary: "Munich's new U-Bahn line connecting the city center to the airport opened today with Mayor Dieter Reiter in attendance. The line features 10 stations and runs every 10 minutes during peak hours, significantly reducing travel time. Construction took five years and cost approximately 2 billion euros.",
word_count: 1250,
summary_word_count: 48,
source: "Süddeutsche Zeitung München",
published_at: "Mon, 15 Jan 2024 10:00:00 +0100",
crawled_at: ISODate("2024-01-15T09:30:00.000Z"),
summarized_at: ISODate("2024-01-15T09:30:15.000Z"),
created_at: ISODate("2024-01-15T09:00:00.000Z")
}
```
### 2. Subscribers Collection (`subscribers`)
Stores all newsletter subscribers.
**Document Structure:**
```javascript
{
_id: ObjectId, // Auto-generated MongoDB ID
email: String, // Subscriber email (required, unique, lowercase)
subscribed_at: DateTime, // When user subscribed (UTC)
status: String // Subscription status: 'active' or 'inactive'
}
```
**Indexes:**
- `email` - Unique index for email lookups and preventing duplicates
- `subscribed_at` - Index for analytics and sorting
**Example Document:**
```javascript
{
_id: ObjectId("507f1f77bcf86cd799439012"),
email: "user@example.com",
subscribed_at: ISODate("2024-01-15T08:30:00.000Z"),
status: "active"
}
```
## Design Decisions
### Why MongoDB?
1. **Flexibility**: Easy to add new fields without schema migrations
2. **Scalability**: Handles large volumes of articles and subscribers efficiently
3. **Performance**: Indexes on frequently queried fields (link, email, created_at)
4. **Document Model**: Natural fit for news articles and subscriber data
### Schema Choices
1. **Unique Link Index**: Prevents duplicate articles from being stored, even if fetched multiple times
2. **Status Field**: Soft delete for subscribers (set to 'inactive' instead of deleting) - allows for analytics and easy re-subscription
3. **UTC Timestamps**: All dates stored in UTC for consistency across timezones
4. **Lowercase Emails**: Emails stored in lowercase to prevent case-sensitivity issues
### Future Enhancements
Potential fields to add in the future:
**Articles:**
- `category`: String (e.g., "politics", "sports", "culture")
- `tags`: Array of Strings
- `image_url`: String
- `sent_in_newsletter`: Boolean (track if article was sent)
- `sent_at`: DateTime (when article was included in newsletter)
**Subscribers:**
- `preferences`: Object (newsletter frequency, categories, etc.)
- `last_sent_at`: DateTime (last newsletter sent date)
- `unsubscribed_at`: DateTime (when user unsubscribed)
- `verification_token`: String (for email verification)
## AI Summarization Workflow
When the crawler processes an article:
1. **Extract Content**: Full article text is extracted from the webpage
2. **Summarize with Ollama**: If `OLLAMA_ENABLED=true`, the content is sent to Ollama for summarization
3. **Store Both**: Both the original `content` and AI-generated `summary` are stored
4. **Fallback**: If Ollama is unavailable or fails, only the original content is stored
### Summary Field Details
- **Language**: Always in English, regardless of source article language
- **Length**: Maximum 150 words
- **Format**: Plain text, concise and clear
- **Purpose**: Quick preview for newsletters and frontend display
### Querying Articles
```javascript
// Get articles with AI summaries
db.articles.find({ summary: { $exists: true, $ne: null } })
// Get articles without summaries
db.articles.find({ summary: { $exists: false } })
// Count summarized articles
db.articles.countDocuments({ summary: { $exists: true, $ne: null } })
```

98
backend/STRUCTURE.md Normal file
View File

@@ -0,0 +1,98 @@
# Backend Structure
The backend has been modularized for better maintainability and scalability.
## Directory Structure
```
backend/
├── app.py # Main Flask application entry point
├── config.py # Configuration management
├── database.py # Database connection and initialization
├── requirements.txt # Python dependencies
├── .env # Environment variables
├── routes/ # API route handlers (blueprints)
│ ├── __init__.py
│ ├── subscription_routes.py # /api/subscribe, /api/unsubscribe
│ ├── news_routes.py # /api/news, /api/stats
│ ├── rss_routes.py # /api/rss-feeds (CRUD operations)
│ └── ollama_routes.py # /api/ollama/* (AI features)
└── services/ # Business logic layer
├── __init__.py
├── news_service.py # News fetching and storage logic
├── email_service.py # Newsletter email sending
└── ollama_service.py # Ollama AI integration
```
## Key Components
### app.py
- Main Flask application
- Registers all blueprints
- Minimal code, just wiring things together
### config.py
- Centralized configuration
- Loads environment variables
- Single source of truth for all settings
### database.py
- MongoDB connection setup
- Collection definitions
- Database initialization with indexes
### routes/
Each route file is a Flask Blueprint handling specific API endpoints:
- **subscription_routes.py**: User subscription management
- **news_routes.py**: News fetching and statistics
- **rss_routes.py**: RSS feed management (add/remove/list/toggle)
- **ollama_routes.py**: AI/Ollama integration endpoints
### services/
Business logic separated from route handlers:
- **news_service.py**: Fetches news from RSS feeds, saves to database
- **email_service.py**: Sends newsletter emails to subscribers
- **ollama_service.py**: Communicates with Ollama AI server
## Benefits of This Structure
1. **Separation of Concerns**: Routes handle HTTP, services handle business logic
2. **Testability**: Each module can be tested independently
3. **Maintainability**: Easy to find and modify specific functionality
4. **Scalability**: Easy to add new routes or services
5. **Reusability**: Services can be used by multiple routes
## Adding New Features
### To add a new API endpoint:
1. Create a new route file in `routes/` or add to existing one
2. Create a Blueprint and define routes
3. Register the blueprint in `app.py`
### To add new business logic:
1. Create a new service file in `services/`
2. Import and use in your route handlers
### Example:
```python
# services/my_service.py
def my_business_logic():
return "Hello"
# routes/my_routes.py
from flask import Blueprint
from services.my_service import my_business_logic
my_bp = Blueprint('my', __name__)
@my_bp.route('/api/my-endpoint')
def my_endpoint():
result = my_business_logic()
return {'message': result}
# app.py
from routes.my_routes import my_bp
app.register_blueprint(my_bp)
```

29
backend/app.py Normal file
View File

@@ -0,0 +1,29 @@
from flask import Flask
from flask_cors import CORS
from config import Config
from database import init_db
from routes.subscription_routes import subscription_bp
from routes.news_routes import news_bp
from routes.rss_routes import rss_bp
from routes.ollama_routes import ollama_bp
from routes.newsletter_routes import newsletter_bp
# Initialize Flask app
app = Flask(__name__)
CORS(app)
# Initialize database
init_db()
# Register blueprints
app.register_blueprint(subscription_bp)
app.register_blueprint(news_bp)
app.register_blueprint(rss_bp)
app.register_blueprint(ollama_bp)
app.register_blueprint(newsletter_bp)
# Print configuration
Config.print_config()
if __name__ == '__main__':
app.run(debug=True, port=Config.FLASK_PORT, host='127.0.0.1')

52
backend/config.py Normal file
View File

@@ -0,0 +1,52 @@
import os
from dotenv import load_dotenv
from pathlib import Path
# Get the directory where this script is located
backend_dir = Path(__file__).parent
env_path = backend_dir / '.env'
# Load .env file
load_dotenv(dotenv_path=env_path)
# Debug: Print if .env file exists (for troubleshooting)
if env_path.exists():
print(f"✓ Loading .env file from: {env_path}")
else:
print(f"⚠ Warning: .env file not found at {env_path}")
print(f" Current working directory: {os.getcwd()}")
print(f" Looking for .env in: {env_path}")
class Config:
"""Application configuration"""
# MongoDB
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'
# Email
SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
EMAIL_USER = os.getenv('EMAIL_USER', '')
EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')
# Ollama
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
# Flask
FLASK_PORT = int(os.getenv('FLASK_PORT', '5000'))
@classmethod
def print_config(cls):
"""Print configuration (without sensitive data)"""
print("\nApplication Configuration:")
print(f" MongoDB URI: {cls.MONGODB_URI}")
print(f" Database: {cls.DB_NAME}")
print(f" Flask Port: {cls.FLASK_PORT}")
print(f" Ollama Base URL: {cls.OLLAMA_BASE_URL}")
print(f" Ollama Model: {cls.OLLAMA_MODEL}")
print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}")

53
backend/database.py Normal file
View File

@@ -0,0 +1,53 @@
from pymongo import MongoClient
from datetime import datetime
from config import Config
# MongoDB setup
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
# Collections
articles_collection = db['articles']
subscribers_collection = db['subscribers']
rss_feeds_collection = db['rss_feeds']
def init_db():
"""Initialize database with indexes"""
# Create unique index on article links to prevent duplicates
articles_collection.create_index('link', unique=True)
# Create index on created_at for faster sorting
articles_collection.create_index('created_at')
# Create unique index on subscriber emails
subscribers_collection.create_index('email', unique=True)
# Create index on subscribed_at
subscribers_collection.create_index('subscribed_at')
# Create unique index on RSS feed URLs
rss_feeds_collection.create_index('url', unique=True)
# Initialize default RSS feeds if collection is empty
if rss_feeds_collection.count_documents({}) == 0:
default_feeds = [
{
'name': 'Süddeutsche Zeitung München',
'url': 'https://www.sueddeutsche.de/muenchen/rss',
'active': True,
'created_at': datetime.utcnow()
},
{
'name': 'Münchner Merkur',
'url': 'https://www.merkur.de/muenchen/rss',
'active': True,
'created_at': datetime.utcnow()
},
{
'name': 'Abendzeitung München',
'url': 'https://www.abendzeitung-muenchen.de/rss',
'active': True,
'created_at': datetime.utcnow()
}
]
rss_feeds_collection.insert_many(default_feeds)
print(f"Initialized {len(default_feeds)} default RSS feeds")
print("Database initialized with indexes")

32
backend/env.template Normal file
View File

@@ -0,0 +1,32 @@
# MongoDB Configuration
# For Docker Compose (no authentication):
MONGODB_URI=mongodb://localhost:27017/
# For Docker Compose with authentication:
# MONGODB_URI=mongodb://admin:password@localhost:27017/
# For MongoDB Atlas (cloud):
# MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/
# Email Configuration (for sending newsletters)
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=587
EMAIL_USER=your-email@gmail.com
EMAIL_PASSWORD=your-app-password
# Note: For Gmail, use an App Password: https://support.google.com/accounts/answer/185833
# Ollama Configuration (for AI-powered features)
# Remote Ollama server URL (e.g., http://your-server-ip:11434 or https://your-domain.com)
OLLAMA_BASE_URL=http://localhost:11434
# Optional: API key if your Ollama server requires authentication
# OLLAMA_API_KEY=your-api-key-here
# Model name to use (e.g., llama2, mistral, codellama, llama3, phi3:latest)
OLLAMA_MODEL=phi3:latest
# Enable/disable Ollama features (true/false)
# When enabled, the crawler will automatically summarize articles in English (≤150 words)
OLLAMA_ENABLED=true
# Timeout for Ollama requests in seconds (default: 30)
OLLAMA_TIMEOUT=30
# Flask Server Configuration
# Port for Flask server (default: 5001 to avoid AirPlay conflict on macOS)
FLASK_PORT=5001

61
backend/fix_duplicates.py Normal file
View File

@@ -0,0 +1,61 @@
"""
Script to fix duplicate RSS feeds and create unique index
Run this once: python fix_duplicates.py
"""
from pymongo import MongoClient
from config import Config
# Connect to MongoDB
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
rss_feeds_collection = db['rss_feeds']
print("Fixing duplicate RSS feeds...")
# Get all feeds
all_feeds = list(rss_feeds_collection.find())
print(f"Total feeds found: {len(all_feeds)}")
# Find duplicates by URL
seen_urls = {}
duplicates_to_remove = []
for feed in all_feeds:
url = feed.get('url')
if url in seen_urls:
# This is a duplicate, mark for removal
duplicates_to_remove.append(feed['_id'])
print(f" Duplicate found: {feed['name']} - {url}")
else:
# First occurrence, keep it
seen_urls[url] = feed['_id']
# Remove duplicates
if duplicates_to_remove:
result = rss_feeds_collection.delete_many({'_id': {'$in': duplicates_to_remove}})
print(f"Removed {result.deleted_count} duplicate feeds")
else:
print("No duplicates found")
# Drop existing indexes (if any)
print("\nDropping existing indexes...")
try:
rss_feeds_collection.drop_indexes()
print("Indexes dropped")
except Exception as e:
print(f"Note: {e}")
# Create unique index on URL
print("\nCreating unique index on 'url' field...")
rss_feeds_collection.create_index('url', unique=True)
print("✓ Unique index created successfully")
# Verify
remaining_feeds = list(rss_feeds_collection.find())
print(f"\nFinal feed count: {len(remaining_feeds)}")
print("\nRemaining feeds:")
for feed in remaining_feeds:
print(f" - {feed['name']}: {feed['url']}")
print("\n✓ Done! Duplicates removed and unique index created.")
print("You can now restart your Flask app.")

8
backend/requirements.txt Normal file
View File

@@ -0,0 +1,8 @@
Flask==3.0.0
flask-cors==4.0.0
feedparser==6.0.10
python-dotenv==1.0.0
pymongo==4.6.1
requests==2.31.0
Jinja2==3.1.2

View File

@@ -0,0 +1 @@
# Routes package

View File

@@ -0,0 +1,123 @@
from flask import Blueprint, jsonify
from database import articles_collection
from services.news_service import fetch_munich_news, save_articles_to_db
news_bp = Blueprint('news', __name__)
@news_bp.route('/api/news', methods=['GET'])
def get_news():
"""Get latest Munich news"""
try:
# Fetch fresh news and save to database
articles = fetch_munich_news()
save_articles_to_db(articles)
# Get articles from MongoDB, sorted by created_at (newest first)
cursor = articles_collection.find().sort('created_at', -1).limit(20)
db_articles = []
for doc in cursor:
article = {
'title': doc.get('title', ''),
'author': doc.get('author'),
'link': doc.get('link', ''),
'source': doc.get('source', ''),
'published': doc.get('published_at', ''),
'word_count': doc.get('word_count'),
'has_full_content': bool(doc.get('content')),
'has_summary': bool(doc.get('summary'))
}
# Include AI summary if available
if doc.get('summary'):
article['summary'] = doc.get('summary', '')
article['summary_word_count'] = doc.get('summary_word_count')
article['summarized_at'] = doc.get('summarized_at', '').isoformat() if doc.get('summarized_at') else None
# Fallback: Include preview of content if no summary (first 200 chars)
elif doc.get('content'):
article['preview'] = doc.get('content', '')[:200] + '...'
db_articles.append(article)
# Combine fresh articles with database articles and deduplicate
seen_links = set()
combined = []
# Add fresh articles first (they're more recent)
for article in articles:
link = article.get('link', '')
if link and link not in seen_links:
seen_links.add(link)
combined.append(article)
# Add database articles
for article in db_articles:
link = article.get('link', '')
if link and link not in seen_links:
seen_links.add(link)
combined.append(article)
return jsonify({'articles': combined[:20]}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
def get_article_by_url(article_url):
"""Get full article content by URL"""
try:
# Decode URL
from urllib.parse import unquote
decoded_url = unquote(article_url)
# Find article by link
article = articles_collection.find_one({'link': decoded_url})
if not article:
return jsonify({'error': 'Article not found'}), 404
return jsonify({
'title': article.get('title', ''),
'author': article.get('author'),
'link': article.get('link', ''),
'content': article.get('content', ''),
'summary': article.get('summary'),
'word_count': article.get('word_count', 0),
'summary_word_count': article.get('summary_word_count'),
'source': article.get('source', ''),
'published_at': article.get('published_at', ''),
'crawled_at': article.get('crawled_at', '').isoformat() if article.get('crawled_at') else None,
'summarized_at': article.get('summarized_at', '').isoformat() if article.get('summarized_at') else None,
'created_at': article.get('created_at', '').isoformat() if article.get('created_at') else None
}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500
@news_bp.route('/api/stats', methods=['GET'])
def get_stats():
"""Get subscription statistics"""
try:
from database import subscribers_collection
# Count only active subscribers
subscriber_count = subscribers_collection.count_documents({'status': 'active'})
# Also get total article count
article_count = articles_collection.count_documents({})
# Count crawled articles
crawled_count = articles_collection.count_documents({'content': {'$exists': True, '$ne': ''}})
# Count summarized articles
summarized_count = articles_collection.count_documents({'summary': {'$exists': True, '$ne': ''}})
return jsonify({
'subscribers': subscriber_count,
'articles': article_count,
'crawled_articles': crawled_count,
'summarized_articles': summarized_count
}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500

View File

@@ -0,0 +1,62 @@
from flask import Blueprint, Response
from pathlib import Path
from jinja2 import Template
from datetime import datetime
from database import articles_collection
newsletter_bp = Blueprint('newsletter', __name__)
@newsletter_bp.route('/api/newsletter/preview', methods=['GET'])
def preview_newsletter():
"""Preview the newsletter HTML (for testing)"""
try:
# Get latest articles with AI summaries
cursor = articles_collection.find(
{'summary': {'$exists': True, '$ne': None}}
).sort('created_at', -1).limit(10)
articles = []
for doc in cursor:
articles.append({
'title': doc.get('title', ''),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'published_at': doc.get('published_at', '')
})
if not articles:
return Response(
"<h1>No articles with summaries found</h1><p>Run the crawler with Ollama enabled first.</p>",
mimetype='text/html'
)
# Load template
template_path = Path(__file__).parent.parent / 'templates' / 'newsletter_template.html'
with open(template_path, 'r', encoding='utf-8') as f:
template_content = f.read()
template = Template(template_content)
# Prepare data
now = datetime.now()
template_data = {
'date': now.strftime('%A, %B %d, %Y'),
'year': now.year,
'article_count': len(articles),
'articles': articles,
'unsubscribe_link': 'http://localhost:3000/unsubscribe',
'website_link': 'http://localhost:3000'
}
# Render and return HTML
html_content = template.render(**template_data)
return Response(html_content, mimetype='text/html')
except Exception as e:
return Response(
f"<h1>Error</h1><p>{str(e)}</p>",
mimetype='text/html'
), 500

View File

@@ -0,0 +1,158 @@
from flask import Blueprint, jsonify
from config import Config
from services.ollama_service import call_ollama, list_ollama_models
import os
ollama_bp = Blueprint('ollama', __name__)
@ollama_bp.route('/api/ollama/ping', methods=['GET', 'POST'])
def ping_ollama():
"""Test connection to Ollama server"""
try:
# Check if Ollama is enabled
if not Config.OLLAMA_ENABLED:
return jsonify({
'status': 'disabled',
'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': False
}
}), 200
# Send a simple test prompt
test_prompt = "Say 'Hello! I am connected and working.' in one sentence."
system_prompt = "You are a helpful assistant. Respond briefly and concisely."
response_text, error_message = call_ollama(test_prompt, system_prompt)
if response_text:
return jsonify({
'status': 'success',
'message': 'Successfully connected to Ollama',
'response': response_text,
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': True
}
}), 200
else:
# Try to get available models for better error message
available_models, _ = list_ollama_models()
troubleshooting = {
'check_server': f'Verify Ollama is running at {Config.OLLAMA_BASE_URL}',
'check_model': f'Verify model "{Config.OLLAMA_MODEL}" is available (run: ollama list)',
'test_connection': f'Test manually: curl {Config.OLLAMA_BASE_URL}/api/generate -d \'{{"model":"{Config.OLLAMA_MODEL}","prompt":"test"}}\''
}
if available_models:
troubleshooting['available_models'] = available_models
troubleshooting['suggestion'] = f'Try setting OLLAMA_MODEL to one of: {", ".join(available_models[:5])}'
return jsonify({
'status': 'error',
'message': error_message or 'Failed to get response from Ollama',
'error_details': error_message,
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': True
},
'troubleshooting': troubleshooting
}), 500
except Exception as e:
return jsonify({
'status': 'error',
'message': f'Error connecting to Ollama: {str(e)}',
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': Config.OLLAMA_ENABLED
}
}), 500
@ollama_bp.route('/api/ollama/config', methods=['GET'])
def get_ollama_config():
"""Get current Ollama configuration (for debugging)"""
try:
from pathlib import Path
backend_dir = Path(__file__).parent.parent
env_path = backend_dir / '.env'
return jsonify({
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': Config.OLLAMA_ENABLED,
'has_api_key': bool(Config.OLLAMA_API_KEY)
},
'env_file_path': str(env_path),
'env_file_exists': env_path.exists(),
'current_working_directory': os.getcwd()
}), 200
except Exception as e:
return jsonify({
'error': str(e),
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': Config.OLLAMA_ENABLED
}
}), 500
@ollama_bp.route('/api/ollama/models', methods=['GET'])
def get_ollama_models():
"""List available models on Ollama server"""
try:
if not Config.OLLAMA_ENABLED:
return jsonify({
'status': 'disabled',
'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': False
}
}), 200
models, error_message = list_ollama_models()
if models is not None:
return jsonify({
'status': 'success',
'models': models,
'current_model': Config.OLLAMA_MODEL,
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': True
}
}), 200
else:
return jsonify({
'status': 'error',
'message': error_message or 'Failed to list models',
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': True
}
}), 500
except Exception as e:
return jsonify({
'status': 'error',
'message': f'Error listing models: {str(e)}',
'ollama_config': {
'base_url': Config.OLLAMA_BASE_URL,
'model': Config.OLLAMA_MODEL,
'enabled': Config.OLLAMA_ENABLED
}
}), 500

View File

@@ -0,0 +1,124 @@
from flask import Blueprint, request, jsonify
from datetime import datetime
from pymongo.errors import DuplicateKeyError
from bson.objectid import ObjectId
import feedparser
from database import rss_feeds_collection
rss_bp = Blueprint('rss', __name__)
@rss_bp.route('/api/rss-feeds', methods=['GET'])
def get_rss_feeds():
"""Get all RSS feeds"""
try:
cursor = rss_feeds_collection.find().sort('created_at', -1)
feeds = []
for feed in cursor:
feeds.append({
'id': str(feed['_id']),
'name': feed.get('name', ''),
'url': feed.get('url', ''),
'active': feed.get('active', True),
'created_at': feed.get('created_at', '').isoformat() if feed.get('created_at') else ''
})
return jsonify({'feeds': feeds}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500
@rss_bp.route('/api/rss-feeds', methods=['POST'])
def add_rss_feed():
"""Add a new RSS feed"""
data = request.json
name = data.get('name', '').strip()
url = data.get('url', '').strip()
if not name or not url:
return jsonify({'error': 'Name and URL are required'}), 400
if not url.startswith('http://') and not url.startswith('https://'):
return jsonify({'error': 'URL must start with http:// or https://'}), 400
try:
# Test if the RSS feed is valid
try:
feed = feedparser.parse(url)
if not feed.entries:
return jsonify({'error': 'Invalid RSS feed or no entries found'}), 400
except Exception as e:
return jsonify({'error': f'Failed to parse RSS feed: {str(e)}'}), 400
feed_doc = {
'name': name,
'url': url,
'active': True,
'created_at': datetime.utcnow()
}
try:
result = rss_feeds_collection.insert_one(feed_doc)
return jsonify({
'message': 'RSS feed added successfully',
'id': str(result.inserted_id)
}), 201
except DuplicateKeyError:
return jsonify({'error': 'RSS feed URL already exists'}), 409
except Exception as e:
return jsonify({'error': str(e)}), 500
@rss_bp.route('/api/rss-feeds/<feed_id>', methods=['DELETE'])
def remove_rss_feed(feed_id):
"""Remove an RSS feed"""
try:
# Validate ObjectId
try:
obj_id = ObjectId(feed_id)
except Exception:
return jsonify({'error': 'Invalid feed ID'}), 400
result = rss_feeds_collection.delete_one({'_id': obj_id})
if result.deleted_count > 0:
return jsonify({'message': 'RSS feed removed successfully'}), 200
else:
return jsonify({'error': 'RSS feed not found'}), 404
except Exception as e:
return jsonify({'error': str(e)}), 500
@rss_bp.route('/api/rss-feeds/<feed_id>/toggle', methods=['PATCH'])
def toggle_rss_feed(feed_id):
"""Toggle RSS feed active status"""
try:
# Validate ObjectId
try:
obj_id = ObjectId(feed_id)
except Exception:
return jsonify({'error': 'Invalid feed ID'}), 400
# Get current status
feed = rss_feeds_collection.find_one({'_id': obj_id})
if not feed:
return jsonify({'error': 'RSS feed not found'}), 404
# Toggle status
new_status = not feed.get('active', True)
result = rss_feeds_collection.update_one(
{'_id': obj_id},
{'$set': {'active': new_status}}
)
if result.modified_count > 0:
return jsonify({
'message': f'RSS feed {"activated" if new_status else "deactivated"} successfully',
'active': new_status
}), 200
else:
return jsonify({'error': 'Failed to update RSS feed'}), 500
except Exception as e:
return jsonify({'error': str(e)}), 500

View File

@@ -0,0 +1,63 @@
from flask import Blueprint, request, jsonify
from datetime import datetime
from pymongo.errors import DuplicateKeyError
from database import subscribers_collection
subscription_bp = Blueprint('subscription', __name__)
@subscription_bp.route('/api/subscribe', methods=['POST'])
def subscribe():
"""Subscribe a user to the newsletter"""
data = request.json
email = data.get('email', '').strip().lower()
if not email or '@' not in email:
return jsonify({'error': 'Invalid email address'}), 400
try:
subscriber_doc = {
'email': email,
'subscribed_at': datetime.utcnow(),
'status': 'active'
}
# Try to insert, if duplicate key error, subscriber already exists
try:
subscribers_collection.insert_one(subscriber_doc)
return jsonify({'message': 'Successfully subscribed!'}), 201
except DuplicateKeyError:
# Check if subscriber is active
existing = subscribers_collection.find_one({'email': email})
if existing and existing.get('status') == 'active':
return jsonify({'message': 'Email already subscribed'}), 200
else:
# Reactivate if previously unsubscribed
subscribers_collection.update_one(
{'email': email},
{'$set': {'status': 'active', 'subscribed_at': datetime.utcnow()}}
)
return jsonify({'message': 'Successfully re-subscribed!'}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500
@subscription_bp.route('/api/unsubscribe', methods=['POST'])
def unsubscribe():
"""Unsubscribe a user from the newsletter"""
data = request.json
email = data.get('email', '').strip().lower()
try:
result = subscribers_collection.update_one(
{'email': email},
{'$set': {'status': 'inactive'}}
)
if result.matched_count > 0:
return jsonify({'message': 'Successfully unsubscribed'}), 200
else:
return jsonify({'error': 'Email not found in subscribers'}), 404
except Exception as e:
return jsonify({'error': str(e)}), 500

View File

@@ -0,0 +1 @@
# Services package

View File

@@ -0,0 +1,88 @@
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime
from pathlib import Path
from jinja2 import Template
from config import Config
from database import subscribers_collection, articles_collection
def send_newsletter(max_articles=10):
"""Send newsletter to all subscribers with AI-summarized articles"""
if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
print("Email credentials not configured")
return
# Get latest articles with AI summaries from database
cursor = articles_collection.find(
{'summary': {'$exists': True, '$ne': None}}
).sort('created_at', -1).limit(max_articles)
articles = []
for doc in cursor:
articles.append({
'title': doc.get('title', ''),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'published_at': doc.get('published_at', '')
})
if not articles:
print("No articles with summaries to send")
return
# Load email template
template_path = Path(__file__).parent.parent / 'templates' / 'newsletter_template.html'
with open(template_path, 'r', encoding='utf-8') as f:
template_content = f.read()
template = Template(template_content)
# Prepare template data
now = datetime.now()
template_data = {
'date': now.strftime('%A, %B %d, %Y'),
'year': now.year,
'article_count': len(articles),
'articles': articles,
'unsubscribe_link': 'http://localhost:3000', # Update with actual unsubscribe link
'website_link': 'http://localhost:3000'
}
# Render HTML
html_content = template.render(**template_data)
# Get all active subscribers
subscribers_cursor = subscribers_collection.find({'status': 'active'})
subscribers = [doc['email'] for doc in subscribers_cursor]
# Send emails
for subscriber in subscribers:
try:
msg = MIMEMultipart('alternative')
msg['Subject'] = f'Munich News Daily - {datetime.now().strftime("%B %d, %Y")}'
msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
msg['To'] = subscriber
msg['Date'] = datetime.now().strftime('%a, %d %b %Y %H:%M:%S %z')
msg['Message-ID'] = f'<{datetime.now().timestamp()}.{subscriber}@dongho.kim>'
msg['X-Mailer'] = 'Munich News Daily'
# Add plain text version as fallback
plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
# Add HTML version
msg.attach(MIMEText(html_content, 'html', 'utf-8'))
server = smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT)
server.starttls()
server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
server.send_message(msg)
server.quit()
print(f"Newsletter sent to {subscriber}")
except Exception as e:
print(f"Error sending to {subscriber}: {e}")

View File

@@ -0,0 +1,90 @@
import feedparser
from datetime import datetime
from pymongo.errors import DuplicateKeyError
from database import articles_collection, rss_feeds_collection
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
def get_active_rss_feeds():
"""Get all active RSS feeds from database"""
feeds = []
cursor = rss_feeds_collection.find({'active': True})
for feed in cursor:
feeds.append({
'name': feed.get('name', ''),
'url': feed.get('url', '')
})
return feeds
def fetch_munich_news():
"""Fetch news from Munich news sources"""
articles = []
# Get RSS feeds from database instead of hardcoded list
sources = get_active_rss_feeds()
for source in sources:
try:
feed = feedparser.parse(source['url'])
for entry in feed.entries[:5]: # Get top 5 from each source
# Extract article URL using utility function
article_url = extract_article_url(entry)
if not article_url:
print(f" ⚠ No valid URL for: {entry.get('title', 'Unknown')[:50]}")
continue # Skip entries without valid URL
# Extract summary
summary = extract_article_summary(entry)
if summary:
summary = summary[:200] + '...' if len(summary) > 200 else summary
articles.append({
'title': entry.get('title', ''),
'link': article_url,
'summary': summary,
'source': source['name'],
'published': extract_published_date(entry)
})
except Exception as e:
print(f"Error fetching from {source['name']}: {e}")
return articles
def save_articles_to_db(articles):
"""Save articles to MongoDB, avoiding duplicates"""
saved_count = 0
for article in articles:
try:
# Prepare article document
article_doc = {
'title': article.get('title', ''),
'link': article.get('link', ''),
'summary': article.get('summary', ''),
'source': article.get('source', ''),
'published_at': article.get('published', ''),
'created_at': datetime.utcnow()
}
# Use update_one with upsert to handle duplicates
# This will insert if link doesn't exist, or update if it does
result = articles_collection.update_one(
{'link': article_doc['link']},
{'$setOnInsert': article_doc}, # Only set on insert, don't update existing
upsert=True
)
if result.upserted_id:
saved_count += 1
except DuplicateKeyError:
# Link already exists, skip
pass
except Exception as e:
print(f"Error saving article {article.get('link', 'unknown')}: {e}")
if saved_count > 0:
print(f"Saved {saved_count} new articles to database")

View File

@@ -0,0 +1,96 @@
import requests
from config import Config
def list_ollama_models():
"""List available models on Ollama server"""
if not Config.OLLAMA_ENABLED:
return None, "Ollama is not enabled"
try:
url = f"{Config.OLLAMA_BASE_URL}/api/tags"
headers = {}
if Config.OLLAMA_API_KEY:
headers["Authorization"] = f"Bearer {Config.OLLAMA_API_KEY}"
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
result = response.json()
models = result.get('models', [])
model_names = [model.get('name', '') for model in models]
return model_names, None
except requests.exceptions.RequestException as e:
return None, f"Error listing models: {str(e)}"
except Exception as e:
return None, f"Unexpected error: {str(e)}"
def call_ollama(prompt, system_prompt=None):
"""Call Ollama API to generate text"""
if not Config.OLLAMA_ENABLED:
return None, "Ollama is not enabled"
try:
url = f"{Config.OLLAMA_BASE_URL}/api/generate"
payload = {
"model": Config.OLLAMA_MODEL,
"prompt": prompt,
"stream": False
}
if system_prompt:
payload["system"] = system_prompt
headers = {}
if Config.OLLAMA_API_KEY:
headers["Authorization"] = f"Bearer {Config.OLLAMA_API_KEY}"
print(f"Calling Ollama at {url} with model {Config.OLLAMA_MODEL}")
response = requests.post(url, json=payload, headers=headers, timeout=30)
response.raise_for_status()
result = response.json()
response_text = result.get('response', '').strip()
if not response_text:
return None, "Ollama returned empty response"
return response_text, None
except requests.exceptions.ConnectionError as e:
error_msg = f"Cannot connect to Ollama server at {Config.OLLAMA_BASE_URL}. Is Ollama running?"
print(f"Connection error: {error_msg}")
return None, error_msg
except requests.exceptions.Timeout:
error_msg = "Request to Ollama timed out after 30 seconds"
print(f"Timeout error: {error_msg}")
return None, error_msg
except requests.exceptions.HTTPError as e:
# Check if it's a model not found error
if e.response.status_code == 404:
try:
error_data = e.response.json()
if 'model' in error_data.get('error', '').lower() and 'not found' in error_data.get('error', '').lower():
# Try to get available models
available_models, _ = list_ollama_models()
if available_models:
error_msg = f"Model '{Config.OLLAMA_MODEL}' not found. Available models: {', '.join(available_models)}"
else:
error_msg = f"Model '{Config.OLLAMA_MODEL}' not found. Use 'ollama list' on the server to see available models."
else:
error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
except (ValueError, KeyError):
error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
else:
error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
print(f"HTTP error: {error_msg}")
return None, error_msg
except requests.exceptions.RequestException as e:
error_msg = f"Request error: {str(e)}"
print(f"Request error: {error_msg}")
return None, error_msg
except Exception as e:
error_msg = f"Unexpected error: {str(e)}"
print(f"Unexpected error: {error_msg}")
return None, error_msg

View File

@@ -0,0 +1,162 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Munich News Daily</title>
<!--[if mso]>
<style type="text/css">
body, table, td {font-family: Arial, Helvetica, sans-serif !important;}
</style>
<![endif]-->
</head>
<body style="margin: 0; padding: 0; background-color: #f4f4f4; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;">
<!-- Wrapper Table -->
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f4f4f4;">
<tr>
<td align="center" style="padding: 20px 0;">
<!-- Main Container -->
<table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="background-color: #ffffff; max-width: 600px;">
<!-- Header -->
<tr>
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
<h1 style="margin: 0 0 8px 0; font-size: 28px; font-weight: 700; color: #ffffff; letter-spacing: -0.5px;">
Munich News Daily
</h1>
<p style="margin: 0; font-size: 14px; color: #999999; letter-spacing: 0.5px;">
{{ date }}
</p>
</td>
</tr>
<!-- Greeting -->
<tr>
<td style="padding: 30px 40px 20px 40px;">
<p style="margin: 0; font-size: 16px; line-height: 1.5; color: #333333;">
Good morning ☀️
</p>
<p style="margin: 15px 0 0 0; font-size: 15px; line-height: 1.6; color: #666666;">
Here's what's happening in Munich today. We've summarized {{ article_count }} stories using AI so you can stay informed in under 5 minutes.
</p>
</td>
</tr>
<!-- Divider -->
<tr>
<td style="padding: 0 40px;">
<div style="height: 1px; background-color: #e0e0e0;"></div>
</td>
</tr>
<!-- Articles -->
{% for article in articles %}
<tr>
<td style="padding: 25px 40px;">
<!-- Article Number Badge -->
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
<tr>
<td>
<span style="display: inline-block; background-color: #000000; color: #ffffff; width: 24px; height: 24px; line-height: 24px; text-align: center; border-radius: 50%; font-size: 12px; font-weight: 600;">
{{ loop.index }}
</span>
</td>
</tr>
</table>
<!-- Article Title -->
<h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
{{ article.title }}
</h2>
<!-- Article Meta -->
<p style="margin: 0 0 12px 0; font-size: 13px; color: #999999;">
<span style="color: #000000; font-weight: 600;">{{ article.source }}</span>
{% if article.author %}
<span> • {{ article.author }}</span>
{% endif %}
</p>
<!-- Article Summary -->
<p style="margin: 0 0 15px 0; font-size: 15px; line-height: 1.6; color: #333333;">
{{ article.summary }}
</p>
<!-- Read More Link -->
<a href="{{ article.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 14px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px;">
Read more →
</a>
</td>
</tr>
<!-- Article Divider -->
{% if not loop.last %}
<tr>
<td style="padding: 0 40px;">
<div style="height: 1px; background-color: #f0f0f0;"></div>
</td>
</tr>
{% endif %}
{% endfor %}
<!-- Bottom Divider -->
<tr>
<td style="padding: 25px 40px 0 40px;">
<div style="height: 1px; background-color: #e0e0e0;"></div>
</td>
</tr>
<!-- Summary Box -->
<tr>
<td style="padding: 30px 40px;">
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f8f8; border-radius: 8px;">
<tr>
<td style="padding: 25px; text-align: center;">
<p style="margin: 0 0 8px 0; font-size: 13px; color: #666666; text-transform: uppercase; letter-spacing: 1px; font-weight: 600;">
Today's Digest
</p>
<p style="margin: 0; font-size: 36px; font-weight: 700; color: #000000;">
{{ article_count }}
</p>
<p style="margin: 8px 0 0 0; font-size: 14px; color: #666666;">
stories • AI-summarized • 5 min read
</p>
</td>
</tr>
</table>
</td>
</tr>
<!-- Footer -->
<tr>
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
<p style="margin: 0 0 15px 0; font-size: 14px; color: #ffffff; font-weight: 600;">
Munich News Daily
</p>
<p style="margin: 0 0 20px 0; font-size: 13px; color: #999999; line-height: 1.5;">
AI-powered news summaries for busy people.<br>
Delivered daily to your inbox.
</p>
<!-- Footer Links -->
<p style="margin: 0; font-size: 12px; color: #666666;">
<a href="{{ website_link }}" style="color: #999999; text-decoration: none;">Visit Website</a>
<span style="color: #444444;"></span>
<a href="{{ unsubscribe_link }}" style="color: #999999; text-decoration: none;">Unsubscribe</a>
</p>
<p style="margin: 20px 0 0 0; font-size: 11px; color: #666666;">
© {{ year }} Munich News Daily. All rights reserved.
</p>
</td>
</tr>
</table>
<!-- End Main Container -->
</td>
</tr>
</table>
<!-- End Wrapper Table -->
</body>
</html>

View File

@@ -0,0 +1,128 @@
#!/usr/bin/env python
"""
Test RSS feed URL extraction
Run from backend directory with venv activated:
cd backend
source venv/bin/activate # or venv\Scripts\activate on Windows
python test_rss_extraction.py
"""
from pymongo import MongoClient
from config import Config
import feedparser
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
print("\n" + "="*80)
print("RSS Feed URL Extraction Test")
print("="*80)
# Connect to database
print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
# Get RSS feeds
print("Fetching RSS feeds from database...")
feeds = list(db['rss_feeds'].find())
if not feeds:
print("\n❌ No RSS feeds in database!")
print("\nAdd a feed first:")
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
print(" -H 'Content-Type: application/json' \\")
print(" -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
exit(1)
print(f"✓ Found {len(feeds)} feed(s)\n")
# Test each feed
total_success = 0
total_fail = 0
for feed_doc in feeds:
name = feed_doc.get('name', 'Unknown')
url = feed_doc.get('url', '')
active = feed_doc.get('active', True)
print("\n" + "="*80)
print(f"Feed: {name}")
print(f"URL: {url}")
print(f"Active: {'Yes' if active else 'No'}")
print("="*80)
if not active:
print("⏭ Skipping (inactive)")
continue
try:
# Parse RSS
print("\nFetching RSS feed...")
feed = feedparser.parse(url)
if not feed.entries:
print("❌ No entries found in feed")
continue
print(f"✓ Found {len(feed.entries)} entries")
# Test first 3 entries
print(f"\nTesting first 3 entries:")
print("-" * 80)
for i, entry in enumerate(feed.entries[:3], 1):
print(f"\n📰 Entry {i}:")
# Title
title = entry.get('title', 'No title')
print(f" Title: {title[:65]}")
# Test URL extraction
article_url = extract_article_url(entry)
if article_url:
print(f" ✓ URL: {article_url}")
total_success += 1
else:
print(f" ❌ Could not extract URL")
print(f" Available fields: {list(entry.keys())[:10]}")
print(f" link: {entry.get('link', 'N/A')}")
print(f" guid: {entry.get('guid', 'N/A')}")
print(f" id: {entry.get('id', 'N/A')}")
total_fail += 1
# Test summary
summary = extract_article_summary(entry)
if summary:
print(f" ✓ Summary: {summary[:70]}...")
else:
print(f" ⚠ No summary")
# Test date
pub_date = extract_published_date(entry)
if pub_date:
print(f" ✓ Date: {pub_date}")
else:
print(f" ⚠ No date")
except Exception as e:
print(f"❌ Error: {e}")
import traceback
traceback.print_exc()
# Summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total URLs tested: {total_success + total_fail}")
print(f"✓ Successfully extracted: {total_success}")
print(f"❌ Failed to extract: {total_fail}")
if total_fail == 0:
print("\n🎉 All URLs extracted successfully!")
print("\nYou can now run the crawler:")
print(" cd ../news_crawler")
print(" pip install -r requirements.txt")
print(" python crawler_service.py 5")
else:
print(f"\n{total_fail} URL(s) could not be extracted")
print("Check the output above for details")
print("="*80 + "\n")

View File

@@ -0,0 +1 @@
# Utils package

View File

@@ -0,0 +1,98 @@
"""
Utility functions for RSS feed processing
"""
def extract_article_url(entry):
"""
Extract article URL from RSS entry.
Different RSS feeds use different fields for the article URL.
Args:
entry: feedparser entry object
Returns:
str: Article URL or None if not found
Examples:
- Most feeds use 'link'
- Some use 'guid' as the URL
- Some use 'id' as the URL
- Some have guid as a dict with 'href'
"""
# Try 'link' first (most common)
if entry.get('link') and entry.get('link', '').startswith('http'):
return entry.get('link')
# Try 'guid' if it's a valid URL
if entry.get('guid'):
guid = entry.get('guid')
# guid can be a string
if isinstance(guid, str) and guid.startswith('http'):
return guid
# or a dict with 'href'
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
return guid.get('href')
# Try 'id' if it's a valid URL
if entry.get('id') and entry.get('id', '').startswith('http'):
return entry.get('id')
# Try 'links' array (some feeds have multiple links)
if entry.get('links'):
for link in entry.get('links', []):
if isinstance(link, dict) and link.get('href', '').startswith('http'):
# Prefer 'alternate' type, but accept any http link
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
return link.get('href')
# If no alternate found, return first http link
for link in entry.get('links', []):
if isinstance(link, dict) and link.get('href', '').startswith('http'):
return link.get('href')
return None
def extract_article_summary(entry):
"""
Extract article summary/description from RSS entry.
Args:
entry: feedparser entry object
Returns:
str: Article summary or empty string
"""
# Try different fields
if entry.get('summary'):
return entry.get('summary', '')
elif entry.get('description'):
return entry.get('description', '')
elif entry.get('content'):
# content is usually a list of dicts
content = entry.get('content', [])
if content and isinstance(content, list) and len(content) > 0:
return content[0].get('value', '')
return ''
def extract_published_date(entry):
"""
Extract published date from RSS entry.
Args:
entry: feedparser entry object
Returns:
str: Published date or empty string
"""
# Try different fields
if entry.get('published'):
return entry.get('published', '')
elif entry.get('updated'):
return entry.get('updated', '')
elif entry.get('created'):
return entry.get('created', '')
return ''

33
docker-compose.prod.yml Normal file
View File

@@ -0,0 +1,33 @@
version: '3.8'
# Production version with authentication enabled
# Usage: docker-compose -f docker-compose.prod.yml up -d
services:
mongodb:
image: mongo:7.0
container_name: munich-news-mongodb
restart: unless-stopped
ports:
- "27017:27017"
environment:
MONGO_INITDB_ROOT_USERNAME: admin
MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme}
MONGO_INITDB_DATABASE: munich_news
volumes:
- mongodb_data:/data/db
- mongodb_config:/data/configdb
networks:
- munich-news-network
command: mongod --bind_ip_all --auth
volumes:
mongodb_data:
driver: local
mongodb_config:
driver: local
networks:
munich-news-network:
driver: bridge

32
docker-compose.yml Normal file
View File

@@ -0,0 +1,32 @@
version: '3.8'
services:
mongodb:
image: mongo:7.0
container_name: munich-news-mongodb
restart: unless-stopped
ports:
- "27017:27017"
# For development: MongoDB runs without authentication
# For production: Uncomment the environment variables below and update MONGODB_URI
# environment:
# MONGO_INITDB_ROOT_USERNAME: admin
# MONGO_INITDB_ROOT_PASSWORD: password
# MONGO_INITDB_DATABASE: munich_news
volumes:
- mongodb_data:/data/db
- mongodb_config:/data/configdb
networks:
- munich-news-network
command: mongod --bind_ip_all
volumes:
mongodb_data:
driver: local
mongodb_config:
driver: local
networks:
munich-news-network:
driver: bridge

1320
frontend/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

21
frontend/package.json Normal file
View File

@@ -0,0 +1,21 @@
{
"name": "munich-news-frontend",
"version": "1.0.0",
"description": "Munich News Email Platform Frontend",
"main": "server.js",
"scripts": {
"start": "node server.js",
"dev": "nodemon server.js"
},
"keywords": ["news", "munich", "email"],
"author": "",
"license": "MIT",
"dependencies": {
"express": "^4.18.2",
"axios": "^1.6.2"
},
"devDependencies": {
"nodemon": "^3.0.2"
}
}

170
frontend/public/app.js Normal file
View File

@@ -0,0 +1,170 @@
// Load news on page load
document.addEventListener('DOMContentLoaded', () => {
loadNews();
loadStats();
});
async function loadNews() {
const newsGrid = document.getElementById('newsGrid');
newsGrid.innerHTML = '<div class="loading">Loading news...</div>';
try {
const response = await fetch('/api/news');
const data = await response.json();
if (data.articles && data.articles.length > 0) {
displayNews(data.articles);
} else {
newsGrid.innerHTML = '<div class="loading">No news available at the moment. Check back later!</div>';
}
} catch (error) {
console.error('Error loading news:', error);
newsGrid.innerHTML = '<div class="loading">Failed to load news. Please try again later.</div>';
}
}
function displayNews(articles) {
const newsGrid = document.getElementById('newsGrid');
newsGrid.innerHTML = '';
articles.forEach(article => {
const card = document.createElement('div');
card.className = 'news-card';
card.onclick = () => window.open(article.link, '_blank');
card.innerHTML = `
<div class="source">${article.source || 'Munich News'}</div>
<h3>${article.title}</h3>
<p>${article.summary || 'No summary available.'}</p>
<a href="${article.link}" target="_blank" class="read-more" onclick="event.stopPropagation()">Read more →</a>
`;
newsGrid.appendChild(card);
});
}
async function loadStats() {
try {
const response = await fetch('/api/stats');
const data = await response.json();
if (data.subscribers !== undefined) {
document.getElementById('subscriberCount').textContent = data.subscribers.toLocaleString();
}
} catch (error) {
console.error('Error loading stats:', error);
}
}
async function subscribe() {
const emailInput = document.getElementById('emailInput');
const subscribeBtn = document.getElementById('subscribeBtn');
const formMessage = document.getElementById('formMessage');
const email = emailInput.value.trim();
if (!email || !email.includes('@')) {
formMessage.textContent = 'Please enter a valid email address';
formMessage.className = 'form-message error';
return;
}
subscribeBtn.disabled = true;
subscribeBtn.textContent = 'Subscribing...';
formMessage.textContent = '';
try {
const response = await fetch('/api/subscribe', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({ email: email })
});
const data = await response.json();
if (response.ok) {
formMessage.textContent = data.message || 'Successfully subscribed! Check your email for confirmation.';
formMessage.className = 'form-message success';
emailInput.value = '';
loadStats(); // Refresh stats
} else {
formMessage.textContent = data.error || 'Failed to subscribe. Please try again.';
formMessage.className = 'form-message error';
}
} catch (error) {
formMessage.textContent = 'Network error. Please try again later.';
formMessage.className = 'form-message error';
} finally {
subscribeBtn.disabled = false;
subscribeBtn.textContent = 'Subscribe Free';
}
}
// Allow Enter key to submit
document.getElementById('emailInput').addEventListener('keypress', (e) => {
if (e.key === 'Enter') {
subscribe();
}
});
function showUnsubscribe() {
document.getElementById('unsubscribeModal').style.display = 'block';
}
function closeUnsubscribe() {
document.getElementById('unsubscribeModal').style.display = 'none';
document.getElementById('unsubscribeEmail').value = '';
document.getElementById('unsubscribeMessage').textContent = '';
}
async function unsubscribe() {
const emailInput = document.getElementById('unsubscribeEmail');
const unsubscribeMessage = document.getElementById('unsubscribeMessage');
const email = emailInput.value.trim();
if (!email || !email.includes('@')) {
unsubscribeMessage.textContent = 'Please enter a valid email address';
unsubscribeMessage.className = 'form-message error';
return;
}
try {
const response = await fetch('/api/unsubscribe', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({ email: email })
});
const data = await response.json();
if (response.ok) {
unsubscribeMessage.textContent = data.message || 'Successfully unsubscribed.';
unsubscribeMessage.className = 'form-message success';
emailInput.value = '';
setTimeout(() => {
closeUnsubscribe();
loadStats();
}, 2000);
} else {
unsubscribeMessage.textContent = data.error || 'Failed to unsubscribe. Please try again.';
unsubscribeMessage.className = 'form-message error';
}
} catch (error) {
unsubscribeMessage.textContent = 'Network error. Please try again later.';
unsubscribeMessage.className = 'form-message error';
}
}
// Close modal when clicking outside
window.onclick = function(event) {
const modal = document.getElementById('unsubscribeModal');
if (event.target === modal) {
closeUnsubscribe();
}
}

View File

@@ -0,0 +1,65 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Munich News Daily - Your Daily Dose of Munich News</title>
<link rel="stylesheet" href="styles.css">
</head>
<body>
<div class="container">
<header class="hero">
<div class="hero-content">
<h1>📰 Munich News Daily</h1>
<p class="tagline">Get the latest Munich news delivered to your inbox every morning</p>
<p class="description">Stay informed about what's happening in Munich with our curated daily newsletter. No fluff, just the news that matters.</p>
<div class="subscription-form" id="subscriptionForm">
<input
type="email"
id="emailInput"
placeholder="Enter your email address"
required
>
<button id="subscribeBtn" onclick="subscribe()">Subscribe Free</button>
<p class="form-message" id="formMessage"></p>
</div>
<div class="stats">
<div class="stat-item">
<span class="stat-number" id="subscriberCount">-</span>
<span class="stat-label">Subscribers</span>
</div>
</div>
</div>
</header>
<section class="news-section">
<h2>Latest Munich News</h2>
<div class="news-grid" id="newsGrid">
<div class="loading">Loading news...</div>
</div>
</section>
<footer>
<p>&copy; 2024 Munich News Daily. Made with ❤️ for Munich.</p>
<p><a href="#" onclick="showUnsubscribe()">Unsubscribe</a></p>
</footer>
</div>
<!-- Unsubscribe Modal -->
<div class="modal" id="unsubscribeModal">
<div class="modal-content">
<span class="close" onclick="closeUnsubscribe()">&times;</span>
<h2>Unsubscribe</h2>
<p>Enter your email to unsubscribe from Munich News Daily:</p>
<input type="email" id="unsubscribeEmail" placeholder="Enter your email">
<button onclick="unsubscribe()">Unsubscribe</button>
<p class="form-message" id="unsubscribeMessage"></p>
</div>
</div>
<script src="app.js"></script>
</body>
</html>

306
frontend/public/styles.css Normal file
View File

@@ -0,0 +1,306 @@
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
line-height: 1.6;
color: #333;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
.hero {
text-align: center;
padding: 60px 20px;
color: white;
}
.hero-content {
max-width: 700px;
margin: 0 auto;
}
.hero h1 {
font-size: 3.5rem;
margin-bottom: 20px;
font-weight: 700;
}
.tagline {
font-size: 1.5rem;
margin-bottom: 15px;
font-weight: 300;
}
.description {
font-size: 1.1rem;
margin-bottom: 40px;
opacity: 0.9;
}
.subscription-form {
display: flex;
flex-direction: column;
gap: 15px;
max-width: 500px;
margin: 0 auto 40px;
}
.subscription-form input {
padding: 15px 20px;
font-size: 1rem;
border: none;
border-radius: 8px;
outline: none;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.subscription-form button {
padding: 15px 30px;
font-size: 1.1rem;
font-weight: 600;
background: #ff6b6b;
color: white;
border: none;
border-radius: 8px;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.subscription-form button:hover {
background: #ff5252;
transform: translateY(-2px);
box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
}
.subscription-form button:active {
transform: translateY(0);
}
.form-message {
margin-top: 10px;
font-size: 0.9rem;
min-height: 20px;
}
.form-message.success {
color: #4caf50;
}
.form-message.error {
color: #f44336;
}
.stats {
display: flex;
justify-content: center;
gap: 40px;
margin-top: 40px;
}
.stat-item {
text-align: center;
}
.stat-number {
display: block;
font-size: 2.5rem;
font-weight: 700;
margin-bottom: 5px;
}
.stat-label {
font-size: 0.9rem;
opacity: 0.8;
}
.news-section {
background: white;
border-radius: 20px;
padding: 40px;
margin: 40px 0;
box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
}
.news-section h2 {
font-size: 2rem;
margin-bottom: 30px;
color: #333;
text-align: center;
}
.news-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
gap: 25px;
}
.news-card {
background: #f8f9fa;
border-radius: 12px;
padding: 20px;
transition: all 0.3s ease;
border-left: 4px solid #667eea;
cursor: pointer;
}
.news-card:hover {
transform: translateY(-5px);
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.1);
background: white;
}
.news-card h3 {
font-size: 1.2rem;
margin-bottom: 10px;
color: #333;
line-height: 1.4;
}
.news-card p {
color: #666;
font-size: 0.95rem;
margin-bottom: 15px;
line-height: 1.5;
}
.news-card .source {
font-size: 0.85rem;
color: #667eea;
font-weight: 600;
margin-bottom: 10px;
}
.news-card .read-more {
color: #667eea;
text-decoration: none;
font-weight: 600;
font-size: 0.9rem;
display: inline-block;
margin-top: 10px;
}
.news-card .read-more:hover {
text-decoration: underline;
}
.loading {
text-align: center;
padding: 40px;
color: #666;
font-size: 1.1rem;
grid-column: 1 / -1;
}
footer {
text-align: center;
padding: 40px 20px;
color: white;
}
footer a {
color: white;
text-decoration: underline;
cursor: pointer;
}
footer a:hover {
opacity: 0.8;
}
/* Modal Styles */
.modal {
display: none;
position: fixed;
z-index: 1000;
left: 0;
top: 0;
width: 100%;
height: 100%;
background-color: rgba(0, 0, 0, 0.5);
backdrop-filter: blur(5px);
}
.modal-content {
background-color: white;
margin: 15% auto;
padding: 30px;
border-radius: 12px;
width: 90%;
max-width: 500px;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
}
.close {
color: #aaa;
float: right;
font-size: 28px;
font-weight: bold;
cursor: pointer;
}
.close:hover {
color: #000;
}
.modal-content h2 {
margin-bottom: 20px;
color: #333;
}
.modal-content input {
width: 100%;
padding: 12px;
margin: 15px 0;
border: 2px solid #ddd;
border-radius: 8px;
font-size: 1rem;
}
.modal-content button {
width: 100%;
padding: 12px;
background: #ff6b6b;
color: white;
border: none;
border-radius: 8px;
font-size: 1rem;
font-weight: 600;
cursor: pointer;
margin-top: 10px;
}
.modal-content button:hover {
background: #ff5252;
}
/* Responsive Design */
@media (max-width: 768px) {
.hero h1 {
font-size: 2.5rem;
}
.tagline {
font-size: 1.2rem;
}
.news-grid {
grid-template-columns: 1fr;
}
.stats {
flex-direction: column;
gap: 20px;
}
}

57
frontend/server.js Normal file
View File

@@ -0,0 +1,57 @@
const express = require('express');
const path = require('path');
const axios = require('axios');
const app = express();
const PORT = process.env.PORT || 3000;
const API_URL = process.env.API_URL || 'http://localhost:5001';
// Serve static files
app.use(express.static('public'));
app.use(express.json());
// API proxy
app.get('/api/news', async (req, res) => {
try {
const response = await axios.get(`${API_URL}/api/news`);
res.json(response.data);
} catch (error) {
res.status(500).json({ error: 'Failed to fetch news' });
}
});
app.get('/api/stats', async (req, res) => {
try {
const response = await axios.get(`${API_URL}/api/stats`);
res.json(response.data);
} catch (error) {
res.status(500).json({ error: 'Failed to fetch stats' });
}
});
app.post('/api/subscribe', async (req, res) => {
try {
const response = await axios.post(`${API_URL}/api/subscribe`, req.body);
res.json(response.data);
} catch (error) {
res.status(error.response?.status || 500).json(
error.response?.data || { error: 'Failed to subscribe' }
);
}
});
app.post('/api/unsubscribe', async (req, res) => {
try {
const response = await axios.post(`${API_URL}/api/unsubscribe`, req.body);
res.json(response.data);
} catch (error) {
res.status(error.response?.status || 500).json(
error.response?.data || { error: 'Failed to unsubscribe' }
);
}
});
app.listen(PORT, () => {
console.log(`Frontend server running on http://localhost:${PORT}`);
});

25
news_crawler/.gitignore vendored Normal file
View File

@@ -0,0 +1,25 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
.venv
# Environment variables
.env
.env.local
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db

191
news_crawler/CHANGES.md Normal file
View File

@@ -0,0 +1,191 @@
# Recent Changes - Full Content Storage
## ✅ What Changed
### 1. Removed Content Length Limit
**Before:**
```python
'content': content_text[:10000] # Limited to 10k chars
```
**After:**
```python
'content': content_text # Full content, no limit
```
### 2. Simplified Database Schema
**Before:**
```javascript
{
summary: String, // Short summary
full_content: String // Limited content
}
```
**After:**
```javascript
{
content: String // Full article content, no limit
}
```
### 3. Enhanced API Response
**Before:**
```javascript
{
title: "...",
link: "...",
summary: "..."
}
```
**After:**
```javascript
{
title: "...",
author: "...", // NEW!
link: "...",
preview: "...", // First 200 chars
word_count: 1250, // NEW!
has_full_content: true // NEW!
}
```
## 📊 Database Structure
### Articles Collection
```javascript
{
_id: ObjectId,
title: String, // Article title
author: String, // Article author (extracted)
link: String, // Article URL (unique)
content: String, // FULL article content (no limit)
word_count: Number, // Word count
source: String, // RSS feed name
published_at: String, // Publication date
crawled_at: DateTime, // When crawled
created_at: DateTime // When added
}
```
## 🆕 New API Endpoint
### GET /api/news/<article_url>
Get full article content by URL.
**Example:**
```bash
# URL encode the article URL
curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
```
**Response:**
```json
{
"title": "New U-Bahn Line Opens in Munich",
"author": "Max Mustermann",
"link": "https://example.com/article",
"content": "The full article text here... (complete, no truncation)",
"word_count": 1250,
"source": "Süddeutsche Zeitung München",
"published_at": "2024-11-10T10:00:00Z",
"crawled_at": "2024-11-10T16:30:00Z",
"created_at": "2024-11-10T16:00:00Z"
}
```
## 📈 Enhanced Stats
### GET /api/stats
Now includes crawled article count:
```json
{
"subscribers": 150,
"articles": 500,
"crawled_articles": 350 // NEW!
}
```
## 🎯 Benefits
1. **Complete Content** - No truncation, full articles stored
2. **Better for AI** - Full context for summarization/analysis
3. **Cleaner Schema** - Single `content` field instead of `summary` + `full_content`
4. **More Metadata** - Author, word count, crawl timestamp
5. **Better API** - Preview in list, full content on demand
## 🔄 Migration
If you have existing articles with `full_content` field, they will continue to work. New articles will use the `content` field.
To migrate old articles:
```javascript
// MongoDB shell
db.articles.updateMany(
{ full_content: { $exists: true } },
[
{
$set: {
content: "$full_content"
}
},
{
$unset: ["full_content", "summary"]
}
]
)
```
## 🚀 Usage
### Crawl Articles
```bash
cd news_crawler
python crawler_service.py 10
```
### Get Article List (with previews)
```bash
curl http://localhost:5001/api/news
```
### Get Full Article Content
```bash
# Get the article URL from the list, then:
curl "http://localhost:5001/api/news/<encoded_url>"
```
### Check Stats
```bash
curl http://localhost:5001/api/stats
```
## 📝 Example Workflow
1. **Add RSS Feed**
```bash
curl -X POST http://localhost:5001/api/rss-feeds \
-H "Content-Type: application/json" \
-d '{"name": "News Source", "url": "https://example.com/rss"}'
```
2. **Crawl Articles**
```bash
cd news_crawler
python crawler_service.py 10
```
3. **View Articles**
```bash
curl http://localhost:5001/api/news
```
4. **Get Full Content**
```bash
# Copy article link from above, URL encode it
curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
```
Now you have complete article content ready for AI processing! 🎉

13
news_crawler/Dockerfile Normal file
View File

@@ -0,0 +1,13 @@
FROM python:3.11-slim
WORKDIR /app
# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy crawler service
COPY crawler_service.py .
# Run crawler
CMD ["python", "crawler_service.py"]

View File

@@ -0,0 +1,353 @@
# Content Extraction Strategies
The crawler uses multiple strategies to dynamically extract article metadata from any website.
## 🎯 What Gets Extracted
1. **Title** - Article headline
2. **Author** - Article writer/journalist
3. **Published Date** - When article was published
4. **Content** - Main article text
5. **Description** - Meta description/summary
## 📋 Extraction Strategies
### 1. Title Extraction
Tries multiple methods in order of reliability:
#### Strategy 1: H1 Tag
```html
<h1>Article Title Here</h1>
```
✅ Most reliable - usually the main headline
#### Strategy 2: Open Graph Meta Tag
```html
<meta property="og:title" content="Article Title Here" />
```
✅ Used by Facebook, very reliable
#### Strategy 3: Twitter Card Meta Tag
```html
<meta name="twitter:title" content="Article Title Here" />
```
✅ Used by Twitter, reliable
#### Strategy 4: Title Tag (Fallback)
```html
<title>Article Title | Site Name</title>
```
⚠️ Often includes site name, needs cleaning
**Cleaning:**
- Removes " | Site Name"
- Removes " - Site Name"
---
### 2. Author Extraction
Tries multiple methods:
#### Strategy 1: Meta Author Tag
```html
<meta name="author" content="John Doe" />
```
✅ Standard HTML meta tag
#### Strategy 2: Rel="author" Link
```html
<a rel="author" href="/author/john-doe">John Doe</a>
```
✅ Semantic HTML
#### Strategy 3: Common Class Names
```html
<div class="author-name">John Doe</div>
<span class="byline">By John Doe</span>
<p class="writer">John Doe</p>
```
✅ Searches for: author-name, author, byline, writer
#### Strategy 4: Schema.org Markup
```html
<span itemprop="author">John Doe</span>
```
✅ Structured data
#### Strategy 5: JSON-LD Structured Data
```html
<script type="application/ld+json">
{
"@type": "NewsArticle",
"author": {
"@type": "Person",
"name": "John Doe"
}
}
</script>
```
✅ Most structured, very reliable
**Cleaning:**
- Removes "By " prefix
- Validates length (< 100 chars)
---
### 3. Date Extraction
Tries multiple methods:
#### Strategy 1: Time Tag with Datetime
```html
<time datetime="2024-11-10T10:00:00Z">November 10, 2024</time>
```
✅ Most reliable - ISO format
#### Strategy 2: Article Published Time Meta
```html
<meta property="article:published_time" content="2024-11-10T10:00:00Z" />
```
✅ Open Graph standard
#### Strategy 3: OG Published Time
```html
<meta property="og:published_time" content="2024-11-10T10:00:00Z" />
```
✅ Facebook standard
#### Strategy 4: Common Class Names
```html
<span class="publish-date">November 10, 2024</span>
<time class="published">2024-11-10</time>
<div class="timestamp">10:00 AM, Nov 10</div>
```
✅ Searches for: publish-date, published, date, timestamp
#### Strategy 5: Schema.org Markup
```html
<meta itemprop="datePublished" content="2024-11-10T10:00:00Z" />
```
✅ Structured data
#### Strategy 6: JSON-LD Structured Data
```html
<script type="application/ld+json">
{
"@type": "NewsArticle",
"datePublished": "2024-11-10T10:00:00Z"
}
</script>
```
✅ Most structured
---
### 4. Content Extraction
Tries multiple methods:
#### Strategy 1: Semantic HTML Tags
```html
<article>
<p>Article content here...</p>
</article>
```
✅ Best practice HTML5
#### Strategy 2: Common Class Names
```html
<div class="article-content">...</div>
<div class="article-body">...</div>
<div class="post-content">...</div>
<div class="entry-content">...</div>
<div class="story-body">...</div>
```
✅ Searches for common patterns
#### Strategy 3: Schema.org Markup
```html
<div itemprop="articleBody">
<p>Content here...</p>
</div>
```
✅ Structured data
#### Strategy 4: Main Tag
```html
<main>
<p>Content here...</p>
</main>
```
✅ Semantic HTML5
#### Strategy 5: Body Tag (Fallback)
```html
<body>
<p>Content here...</p>
</body>
```
⚠️ Last resort, may include navigation
**Content Filtering:**
- Removes `<script>`, `<style>`, `<nav>`, `<footer>`, `<header>`, `<aside>`
- Filters out short paragraphs (< 50 chars) - likely ads/navigation
- Keeps only substantial paragraphs
- **No length limit** - stores full article content
---
## 🔍 How It Works
### Example: Crawling a News Article
```python
# 1. Fetch HTML
response = requests.get(article_url)
soup = BeautifulSoup(response.content, 'html.parser')
# 2. Extract title (tries 4 strategies)
title = extract_title(soup)
# Result: "New U-Bahn Line Opens in Munich"
# 3. Extract author (tries 5 strategies)
author = extract_author(soup)
# Result: "Max Mustermann"
# 4. Extract date (tries 6 strategies)
published_date = extract_date(soup)
# Result: "2024-11-10T10:00:00Z"
# 5. Extract content (tries 5 strategies)
content = extract_main_content(soup)
# Result: "The new U-Bahn line connecting..."
# 6. Save to database
article_doc = {
'title': title,
'author': author,
'published_at': published_date,
'full_content': content,
'word_count': len(content.split())
}
```
---
## 📊 Success Rates by Strategy
Based on common news sites:
| Strategy | Success Rate | Notes |
|----------|-------------|-------|
| H1 for title | 95% | Almost universal |
| OG meta tags | 90% | Most modern sites |
| Time tag for date | 85% | HTML5 sites |
| JSON-LD | 70% | Growing adoption |
| Class name patterns | 60% | Varies by site |
| Schema.org | 50% | Not widely adopted |
---
## 🎨 Real-World Examples
### Example 1: Süddeutsche Zeitung
```html
<article>
<h1>New U-Bahn Line Opens</h1>
<span class="author">Max Mustermann</span>
<time datetime="2024-11-10T10:00:00Z">10. November 2024</time>
<div class="article-body">
<p>The new U-Bahn line...</p>
</div>
</article>
```
✅ Extracts: Title (H1), Author (class), Date (time), Content (article-body)
### Example 2: Medium Blog
```html
<article>
<h1>How to Build a News Crawler</h1>
<meta property="og:title" content="How to Build a News Crawler" />
<meta property="article:published_time" content="2024-11-10T10:00:00Z" />
<a rel="author" href="/author">Jane Smith</a>
<section>
<p>In this article...</p>
</section>
</article>
```
✅ Extracts: Title (OG meta), Author (rel), Date (article meta), Content (section)
### Example 3: WordPress Blog
```html
<div class="post">
<h1 class="entry-title">My Blog Post</h1>
<span class="byline">By John Doe</span>
<time class="published">November 10, 2024</time>
<div class="entry-content">
<p>Blog content here...</p>
</div>
</div>
```
✅ Extracts: Title (H1), Author (byline), Date (published), Content (entry-content)
---
## ⚠️ Edge Cases Handled
1. **Missing Fields**: Returns `None` instead of crashing
2. **Multiple Authors**: Takes first one found
3. **Relative Dates**: Stores as-is ("2 hours ago")
4. **Paywalls**: Extracts what's available
5. **JavaScript-rendered**: Only gets server-side HTML
6. **Ads/Navigation**: Filtered out by paragraph length
7. **Site Name in Title**: Cleaned automatically
---
## 🚀 Future Improvements
Potential enhancements:
- [ ] JavaScript rendering (Selenium/Playwright)
- [ ] Paywall bypass (where legal)
- [ ] Image extraction
- [ ] Video detection
- [ ] Related articles
- [ ] Tags/categories
- [ ] Reading time estimation
- [ ] Language detection
- [ ] Sentiment analysis
---
## 🧪 Testing
Test the extraction on a specific URL:
```python
from crawler_service import extract_article_content
url = "https://www.sueddeutsche.de/muenchen/article-123"
data = extract_article_content(url)
print(f"Title: {data['title']}")
print(f"Author: {data['author']}")
print(f"Date: {data['published_date']}")
print(f"Content length: {len(data['content'])} chars")
print(f"Word count: {data['word_count']}")
```
---
## 📚 Standards Supported
- ✅ HTML5 semantic tags
- ✅ Open Graph Protocol
- ✅ Twitter Cards
- ✅ Schema.org microdata
- ✅ JSON-LD structured data
- ✅ Dublin Core metadata
- ✅ Common CSS class patterns

View File

@@ -0,0 +1,306 @@
# How the News Crawler Works
## 🎯 Overview
The crawler dynamically extracts article metadata from any website using multiple fallback strategies.
## 📊 Flow Diagram
```
RSS Feed URL
Parse RSS Feed
For each article link:
┌─────────────────────────────────────┐
│ 1. Fetch HTML Page │
│ GET https://example.com/article │
└─────────────────────────────────────┘
┌─────────────────────────────────────┐
│ 2. Parse with BeautifulSoup │
│ soup = BeautifulSoup(html) │
└─────────────────────────────────────┘
┌─────────────────────────────────────┐
│ 3. Clean HTML │
│ Remove: scripts, styles, nav, │
│ footer, header, ads │
└─────────────────────────────────────┘
┌─────────────────────────────────────┐
│ 4. Extract Title │
│ Try: H1 → OG meta → Twitter → │
│ Title tag │
└─────────────────────────────────────┘
┌─────────────────────────────────────┐
│ 5. Extract Author │
│ Try: Meta author → rel=author → │
│ Class names → JSON-LD │
└─────────────────────────────────────┘
┌─────────────────────────────────────┐
│ 6. Extract Date │
│ Try: <time> → Meta tags → │
│ Class names → JSON-LD │
└─────────────────────────────────────┘
┌─────────────────────────────────────┐
│ 7. Extract Content │
│ Try: <article> → Class names → │
│ <main> → <body> │
│ Filter short paragraphs │
└─────────────────────────────────────┘
┌─────────────────────────────────────┐
│ 8. Save to MongoDB │
│ { │
│ title, author, date, │
│ content, word_count │
│ } │
└─────────────────────────────────────┘
Wait 1 second (rate limiting)
Next article
```
## 🔍 Detailed Example
### Input: RSS Feed Entry
```xml
<item>
<title>New U-Bahn Line Opens</title>
<link>https://www.sueddeutsche.de/muenchen/article-123</link>
<pubDate>Mon, 10 Nov 2024 10:00:00 +0100</pubDate>
</item>
```
### Step 1: Fetch HTML
```python
url = "https://www.sueddeutsche.de/muenchen/article-123"
response = requests.get(url)
html = response.content
```
### Step 2: Parse HTML
```python
soup = BeautifulSoup(html, 'html.parser')
```
### Step 3: Extract Title
```python
# Try H1
h1 = soup.find('h1')
# Result: "New U-Bahn Line Opens in Munich"
# If no H1, try OG meta
og_title = soup.find('meta', property='og:title')
# Fallback chain continues...
```
### Step 4: Extract Author
```python
# Try meta author
meta_author = soup.find('meta', name='author')
# Result: None
# Try class names
author_elem = soup.select_one('[class*="author"]')
# Result: "Max Mustermann"
```
### Step 5: Extract Date
```python
# Try time tag
time_tag = soup.find('time')
# Result: "2024-11-10T10:00:00Z"
```
### Step 6: Extract Content
```python
# Try article tag
article = soup.find('article')
paragraphs = article.find_all('p')
# Filter paragraphs
content = []
for p in paragraphs:
text = p.get_text().strip()
if len(text) >= 50: # Keep substantial paragraphs
content.append(text)
full_content = '\n\n'.join(content)
# Result: "The new U-Bahn line connecting the city center..."
```
### Step 7: Save to Database
```python
article_doc = {
'title': 'New U-Bahn Line Opens in Munich',
'author': 'Max Mustermann',
'link': 'https://www.sueddeutsche.de/muenchen/article-123',
'summary': 'Short summary from RSS...',
'full_content': 'The new U-Bahn line connecting...',
'word_count': 1250,
'source': 'Süddeutsche Zeitung München',
'published_at': '2024-11-10T10:00:00Z',
'crawled_at': datetime.utcnow(),
'created_at': datetime.utcnow()
}
db.articles.update_one(
{'link': article_url},
{'$set': article_doc},
upsert=True
)
```
## 🎨 What Makes It "Dynamic"?
### Traditional Approach (Hardcoded)
```python
# Only works for one specific site
title = soup.find('h1', class_='article-title').text
author = soup.find('span', class_='author-name').text
```
❌ Breaks when site changes
❌ Doesn't work on other sites
### Our Approach (Dynamic)
```python
# Works on ANY site
title = extract_title(soup) # Tries 4 different methods
author = extract_author(soup) # Tries 5 different methods
```
✅ Adapts to different HTML structures
✅ Falls back to alternatives
✅ Works across multiple sites
## 🛡️ Robustness Features
### 1. Multiple Strategies
Each field has 4-6 extraction strategies
```python
def extract_title(soup):
# Try strategy 1
if h1 := soup.find('h1'):
return h1.text
# Try strategy 2
if og_title := soup.find('meta', property='og:title'):
return og_title['content']
# Try strategy 3...
# Try strategy 4...
```
### 2. Validation
```python
# Title must be reasonable length
if title and len(title) > 10:
return title
# Author must be < 100 chars
if author and len(author) < 100:
return author
```
### 3. Cleaning
```python
# Remove site name from title
if ' | ' in title:
title = title.split(' | ')[0]
# Remove "By" from author
author = author.replace('By ', '').strip()
```
### 4. Error Handling
```python
try:
data = extract_article_content(url)
except Timeout:
print("Timeout - skip")
except RequestException:
print("Network error - skip")
except Exception:
print("Unknown error - skip")
```
## 📈 Success Metrics
After crawling, you'll see:
```
📰 Crawling feed: Süddeutsche Zeitung München
🔍 Crawling: New U-Bahn Line Opens...
✓ Saved (1250 words)
Title: ✓ Found
Author: ✓ Found (Max Mustermann)
Date: ✓ Found (2024-11-10T10:00:00Z)
Content: ✓ Found (1250 words)
```
## 🗄️ Database Result
**Before Crawling:**
```javascript
{
title: "New U-Bahn Line Opens",
link: "https://example.com/article",
summary: "Short RSS summary...",
source: "Süddeutsche Zeitung"
}
```
**After Crawling:**
```javascript
{
title: "New U-Bahn Line Opens in Munich", // ← Enhanced
author: "Max Mustermann", // ← NEW!
link: "https://example.com/article",
summary: "Short RSS summary...",
full_content: "The new U-Bahn line...", // ← NEW! (1250 words)
word_count: 1250, // ← NEW!
source: "Süddeutsche Zeitung",
published_at: "2024-11-10T10:00:00Z", // ← Enhanced
crawled_at: ISODate("2024-11-10T16:30:00Z"), // ← NEW!
created_at: ISODate("2024-11-10T16:00:00Z")
}
```
## 🚀 Running the Crawler
```bash
cd news_crawler
pip install -r requirements.txt
python crawler_service.py 10
```
Output:
```
============================================================
🚀 Starting RSS Feed Crawler
============================================================
Found 3 active feed(s)
📰 Crawling feed: Süddeutsche Zeitung München
🔍 Crawling: New U-Bahn Line Opens...
✓ Saved (1250 words)
🔍 Crawling: Munich Weather Update...
✓ Saved (450 words)
✓ Crawled 2 articles
============================================================
✓ Crawling Complete!
Total feeds processed: 3
Total articles crawled: 15
Duration: 45.23 seconds
============================================================
```
Now you have rich, structured article data ready for AI processing! 🎉

127
news_crawler/QUICKSTART.md Normal file
View File

@@ -0,0 +1,127 @@
# News Crawler - Quick Start
## 1. Install Dependencies
```bash
cd news_crawler
pip install -r requirements.txt
```
## 2. Configure Environment
Make sure MongoDB is running and accessible. The crawler will use the same database as the backend.
Default connection: `mongodb://localhost:27017/`
To use a different MongoDB URI, create a `.env` file:
```env
MONGODB_URI=mongodb://localhost:27017/
```
## 3. Run the Crawler
```bash
# Crawl up to 10 articles per feed
python crawler_service.py
# Crawl up to 20 articles per feed
python crawler_service.py 20
```
## 4. Verify Results
Check your MongoDB database:
```bash
# Using mongosh
mongosh
use munich_news
db.articles.find({full_content: {$exists: true}}).count()
db.articles.findOne({full_content: {$exists: true}})
```
## 5. Schedule Regular Crawling
### Option A: Cron (Linux/Mac)
```bash
# Edit crontab
crontab -e
# Add this line to run every 6 hours
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
```
### Option B: Docker
```bash
# Build and run
docker-compose up
# Or run as a one-off
docker-compose run --rm crawler
```
### Option C: Manual
Just run the script whenever you want to fetch new articles:
```bash
python crawler_service.py
```
## What Gets Crawled?
The crawler:
1. Fetches all active RSS feeds from the database
2. For each feed, gets the latest articles
3. Crawls the full content from each article URL
4. Saves: title, full_content, word_count, crawled_at
5. Skips articles that already have content
## Output Example
```
============================================================
🚀 Starting RSS Feed Crawler
============================================================
Found 3 active feed(s)
📰 Crawling feed: Süddeutsche Zeitung München
URL: https://www.sueddeutsche.de/muenchen/rss
🔍 Crawling: New U-Bahn Line Opens in Munich...
✓ Saved (1250 words)
🔍 Crawling: Munich Weather Update...
✓ Saved (450 words)
✓ Crawled 2 articles from Süddeutsche Zeitung München
============================================================
✓ Crawling Complete!
Total feeds processed: 3
Total articles crawled: 15
Duration: 45.23 seconds
============================================================
```
## Troubleshooting
**No feeds found:**
- Make sure you've added RSS feeds via the backend API
- Check MongoDB connection
**Can't extract content:**
- Some sites block scrapers
- Some sites require JavaScript (not supported yet)
- Check if the URL is accessible
**Timeout errors:**
- Increase timeout in the code
- Check your internet connection
## Next Steps
Once articles are crawled, you can:
- View them in the frontend
- Use Ollama to summarize them
- Generate newsletters with full content
- Perform text analysis

225
news_crawler/README.md Normal file
View File

@@ -0,0 +1,225 @@
# News Crawler Microservice
A standalone microservice that crawls full article content from RSS feeds and stores it in MongoDB.
## Features
- 🔍 Extracts full article content from RSS feed links
- 📊 Calculates word count
- 🔄 Avoids re-crawling already processed articles
- ⏱️ Rate limiting (1 second delay between requests)
- 🎯 Smart content extraction using multiple selectors
- 🧹 Cleans up scripts, styles, and navigation elements
## Installation
1. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Configure environment variables:
Create a `.env` file in the project root (or use the backend's `.env`):
```env
MONGODB_URI=mongodb://localhost:27017/
```
## Usage
### Standalone Execution
Run the crawler directly:
```bash
# Crawl up to 10 articles per feed (default)
python crawler_service.py
# Crawl up to 20 articles per feed
python crawler_service.py 20
```
### As a Module
```python
from crawler_service import crawl_all_feeds, crawl_rss_feed
# Crawl all active feeds
result = crawl_all_feeds(max_articles_per_feed=10)
print(result)
# Crawl a specific feed
crawl_rss_feed(
feed_url='https://example.com/rss',
feed_name='Example News',
max_articles=10
)
```
### Via Backend API
The backend has integrated endpoints:
```bash
# Start crawler
curl -X POST http://localhost:5001/api/crawler/start
# Check status
curl http://localhost:5001/api/crawler/status
# Crawl specific feed
curl -X POST http://localhost:5001/api/crawler/feed/<feed_id>
```
## How It Works
1. **Fetch RSS Feeds**: Gets all active RSS feeds from MongoDB
2. **Parse Feed**: Extracts article links from each feed
3. **Crawl Content**: For each article:
- Fetches HTML page
- Removes scripts, styles, navigation
- Extracts main content using smart selectors
- Calculates word count
4. **Store Data**: Saves to MongoDB with metadata
5. **Skip Duplicates**: Avoids re-crawling articles with existing content
## Content Extraction Strategy
The crawler tries multiple selectors in order:
1. `<article>` tag
2. Elements with class containing "article-content", "article-body"
3. Elements with class containing "post-content", "entry-content"
4. `<main>` tag
5. Fallback to all `<p>` tags in body
## Database Schema
Articles are stored with these fields:
```javascript
{
title: String, // Article title
link: String, // Article URL (unique)
summary: String, // Short summary
full_content: String, // Full article text (max 10,000 chars)
word_count: Number, // Number of words
source: String, // RSS feed name
published_at: String, // Publication date
crawled_at: DateTime, // When content was crawled
created_at: DateTime // When added to database
}
```
## Scheduling
### Using Cron (Linux/Mac)
```bash
# Run every 6 hours
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
```
### Using systemd Timer (Linux)
Create `/etc/systemd/system/news-crawler.service`:
```ini
[Unit]
Description=News Crawler Service
[Service]
Type=oneshot
WorkingDirectory=/path/to/news_crawler
ExecStart=/path/to/venv/bin/python crawler_service.py
User=your-user
```
Create `/etc/systemd/system/news-crawler.timer`:
```ini
[Unit]
Description=Run News Crawler every 6 hours
[Timer]
OnBootSec=5min
OnUnitActiveSec=6h
[Install]
WantedBy=timers.target
```
Enable and start:
```bash
sudo systemctl enable news-crawler.timer
sudo systemctl start news-crawler.timer
```
### Using Docker
Create `Dockerfile`:
```dockerfile
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY crawler_service.py .
CMD ["python", "crawler_service.py"]
```
Build and run:
```bash
docker build -t news-crawler .
docker run --env-file ../.env news-crawler
```
## Configuration
Environment variables:
- `MONGODB_URI` - MongoDB connection string (default: `mongodb://localhost:27017/`)
## Rate Limiting
- 1 second delay between article requests
- Respects server resources
- User-Agent header included
## Troubleshooting
**Issue: Can't extract content**
- Some sites block scrapers
- Try adjusting User-Agent header
- Some sites require JavaScript (consider Selenium)
**Issue: Timeout errors**
- Increase timeout in `extract_article_content()`
- Check network connectivity
**Issue: Memory usage**
- Reduce `max_articles_per_feed`
- Content limited to 10,000 characters per article
## Architecture
This is a standalone microservice that:
- Can run independently of the main backend
- Shares the same MongoDB database
- Can be deployed separately
- Can be scheduled independently
## Next Steps
Once articles are crawled, you can:
- Use Ollama to summarize articles
- Perform sentiment analysis
- Extract keywords and topics
- Generate newsletter content
- Create article recommendations

View File

@@ -0,0 +1,194 @@
# RSS URL Extraction - How It Works
## The Problem
Different RSS feed providers use different fields to store the article URL:
### Example 1: Standard RSS (uses `link`)
```xml
<item>
<title>Article Title</title>
<link>https://example.com/article/123</link>
<guid>internal-id-456</guid>
</item>
```
### Example 2: Some feeds (uses `guid` as URL)
```xml
<item>
<title>Article Title</title>
<guid>https://example.com/article/123</guid>
</item>
```
### Example 3: Atom feeds (uses `id`)
```xml
<entry>
<title>Article Title</title>
<id>https://example.com/article/123</id>
</entry>
```
### Example 4: Complex feeds (guid as object)
```xml
<item>
<title>Article Title</title>
<guid isPermaLink="true">https://example.com/article/123</guid>
</item>
```
### Example 5: Multiple links
```xml
<item>
<title>Article Title</title>
<link rel="alternate" type="text/html" href="https://example.com/article/123"/>
<link rel="enclosure" type="image/jpeg" href="https://example.com/image.jpg"/>
</item>
```
## Our Solution
The `extract_article_url()` function tries multiple strategies in order:
### Strategy 1: Check `link` field (most common)
```python
if entry.get('link') and entry.get('link', '').startswith('http'):
return entry.get('link')
```
✅ Works for: Most RSS 2.0 feeds
### Strategy 2: Check `guid` field
```python
if entry.get('guid'):
guid = entry.get('guid')
# guid can be a string
if isinstance(guid, str) and guid.startswith('http'):
return guid
# or a dict with 'href'
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
return guid.get('href')
```
✅ Works for: Feeds that use GUID as permalink
### Strategy 3: Check `id` field
```python
if entry.get('id') and entry.get('id', '').startswith('http'):
return entry.get('id')
```
✅ Works for: Atom feeds
### Strategy 4: Check `links` array
```python
if entry.get('links'):
for link in entry.get('links', []):
if isinstance(link, dict) and link.get('href', '').startswith('http'):
# Prefer 'alternate' type
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
return link.get('href')
```
✅ Works for: Feeds with multiple links (prefers HTML content)
## Real-World Examples
### Süddeutsche Zeitung
```python
entry = {
'title': 'Munich News',
'link': 'https://www.sueddeutsche.de/muenchen/article-123',
'guid': 'sz-internal-123'
}
# Returns: 'https://www.sueddeutsche.de/muenchen/article-123'
```
### Medium Blog
```python
entry = {
'title': 'Blog Post',
'guid': 'https://medium.com/@user/post-abc123',
'link': None
}
# Returns: 'https://medium.com/@user/post-abc123'
```
### YouTube RSS
```python
entry = {
'title': 'Video Title',
'id': 'https://www.youtube.com/watch?v=abc123',
'link': None
}
# Returns: 'https://www.youtube.com/watch?v=abc123'
```
### Complex Feed
```python
entry = {
'title': 'Article',
'links': [
{'rel': 'alternate', 'type': 'text/html', 'href': 'https://example.com/article'},
{'rel': 'enclosure', 'type': 'image/jpeg', 'href': 'https://example.com/image.jpg'}
]
}
# Returns: 'https://example.com/article' (prefers text/html)
```
## Validation
All extracted URLs must:
1. Start with `http://` or `https://`
2. Be a valid string (not None or empty)
If no valid URL is found:
```python
return None
# Crawler will skip this entry and log a warning
```
## Testing Different Feeds
To test if a feed works with our extractor:
```python
import feedparser
from rss_utils import extract_article_url
# Parse feed
feed = feedparser.parse('https://example.com/rss')
# Test each entry
for entry in feed.entries[:5]:
url = extract_article_url(entry)
if url:
print(f"{entry.get('title', 'No title')[:50]}")
print(f" URL: {url}")
else:
print(f"{entry.get('title', 'No title')[:50]}")
print(f" No valid URL found")
print(f" Available fields: {list(entry.keys())}")
```
## Supported Feed Types
✅ RSS 2.0
✅ RSS 1.0
✅ Atom
✅ Custom RSS variants
✅ Feeds with multiple links
✅ Feeds with GUID as permalink
## Edge Cases Handled
1. **GUID is not a URL**: Checks if it starts with `http`
2. **Multiple links**: Prefers `text/html` type
3. **GUID as dict**: Extracts `href` field
4. **Missing fields**: Returns None instead of crashing
5. **Non-HTTP URLs**: Filters out `mailto:`, `ftp:`, etc.
## Future Improvements
Potential enhancements:
- [ ] Support for `feedburner:origLink`
- [ ] Support for `pheedo:origLink`
- [ ] Resolve shortened URLs (bit.ly, etc.)
- [ ] Handle relative URLs (convert to absolute)
- [ ] Cache URL extraction results

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python
"""
Quick script to check what RSS feeds are in the database
"""
from pymongo import MongoClient
import os
import sys
# Add parent directory to path to import from backend
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'backend'))
try:
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', 'backend', '.env'))
except:
pass
# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'
print(f"Connecting to: {MONGODB_URI}")
print(f"Database: {DB_NAME}\n")
try:
client = MongoClient(MONGODB_URI, serverSelectionTimeoutMS=5000)
# Test connection
client.server_info()
print("✓ Connected to MongoDB\n")
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']
# Get all feeds
feeds = list(rss_feeds_collection.find())
if not feeds:
print("❌ No RSS feeds found in database\n")
print("Add feeds using the API:")
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
print(" -H 'Content-Type: application/json' \\")
print(" -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
sys.exit(1)
print(f"Found {len(feeds)} RSS feed(s):\n")
print("="*80)
for i, feed in enumerate(feeds, 1):
print(f"\n{i}. {feed.get('name', 'Unknown')}")
print(f" URL: {feed.get('url', 'N/A')}")
print(f" Active: {feed.get('active', True)}")
print(f" Created: {feed.get('created_at', 'N/A')}")
print(f" ID: {feed.get('_id', 'N/A')}")
print("\n" + "="*80)
# Check articles
articles_collection = db['articles']
total_articles = articles_collection.count_documents({})
crawled_articles = articles_collection.count_documents({'full_content': {'$exists': True}})
print(f"\nArticles in database:")
print(f" Total: {total_articles}")
print(f" With full content: {crawled_articles}")
print(f" Without full content: {total_articles - crawled_articles}")
if total_articles > 0:
print("\nSample article:")
sample = articles_collection.find_one()
print(f" Title: {sample.get('title', 'N/A')[:60]}")
print(f" Link: {sample.get('link', 'N/A')}")
print(f" Has full_content: {bool(sample.get('full_content'))}")
print(f" Word count: {sample.get('word_count', 'N/A')}")
print("\n✓ Database check complete!")
except Exception as e:
print(f"❌ Error: {e}")
sys.exit(1)

90
news_crawler/config.py Normal file
View File

@@ -0,0 +1,90 @@
"""
Configuration management for news crawler
"""
import os
from dotenv import load_dotenv
from pathlib import Path
# Load environment variables from backend/.env
backend_dir = Path(__file__).parent.parent / 'backend'
env_path = backend_dir / '.env'
if env_path.exists():
load_dotenv(dotenv_path=env_path)
print(f"✓ Loaded configuration from: {env_path}")
else:
print(f"⚠ Warning: .env file not found at {env_path}")
class Config:
"""Centralized configuration for news crawler"""
# MongoDB Configuration
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'
# Ollama Configuration
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))
# Crawler Configuration
RATE_LIMIT_DELAY = 1 # seconds between requests
MAX_CONTENT_LENGTH = 50000 # characters
SUMMARY_MAX_WORDS = 150 # maximum words in AI summary
@classmethod
def print_config(cls):
"""Print current configuration (without sensitive data)"""
print("\n" + "="*60)
print("News Crawler Configuration")
print("="*60)
print(f"MongoDB URI: {cls.MONGODB_URI}")
print(f"Database: {cls.DB_NAME}")
print(f"\nOllama Configuration:")
print(f" Base URL: {cls.OLLAMA_BASE_URL}")
print(f" Model: {cls.OLLAMA_MODEL}")
print(f" Enabled: {cls.OLLAMA_ENABLED}")
print(f" Timeout: {cls.OLLAMA_TIMEOUT}s")
print(f" Has API Key: {bool(cls.OLLAMA_API_KEY)}")
print(f"\nCrawler Settings:")
print(f" Rate Limit: {cls.RATE_LIMIT_DELAY}s between requests")
print(f" Max Content: {cls.MAX_CONTENT_LENGTH} chars")
print(f" Summary Length: {cls.SUMMARY_MAX_WORDS} words")
print("="*60 + "\n")
@classmethod
def validate(cls):
"""Validate configuration and return list of issues"""
issues = []
# Check MongoDB
if not cls.MONGODB_URI:
issues.append("MONGODB_URI is not set")
# Check Ollama if enabled
if cls.OLLAMA_ENABLED:
if not cls.OLLAMA_BASE_URL:
issues.append("OLLAMA_BASE_URL is not set but Ollama is enabled")
if not cls.OLLAMA_MODEL:
issues.append("OLLAMA_MODEL is not set but Ollama is enabled")
if cls.OLLAMA_TIMEOUT < 5:
issues.append(f"OLLAMA_TIMEOUT ({cls.OLLAMA_TIMEOUT}s) is too low, recommend at least 5s")
return issues
if __name__ == '__main__':
# Test configuration
Config.print_config()
# Validate
issues = Config.validate()
if issues:
print("⚠ Configuration Issues:")
for issue in issues:
print(f" - {issue}")
else:
print("✓ Configuration is valid")

View File

@@ -0,0 +1,489 @@
"""
Web crawler service to extract full article content from RSS feed links
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import feedparser
import time
import os
from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
from config import Config
from ollama_client import OllamaClient
# Load environment variables
load_dotenv(dotenv_path='../.env')
# MongoDB setup
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
rss_feeds_collection = db['rss_feeds']
# Initialize Ollama client
ollama_client = OllamaClient(
base_url=Config.OLLAMA_BASE_URL,
model=Config.OLLAMA_MODEL,
api_key=Config.OLLAMA_API_KEY,
enabled=Config.OLLAMA_ENABLED,
timeout=Config.OLLAMA_TIMEOUT
)
# Print configuration on startup
if __name__ != '__main__':
Config.print_config()
if Config.OLLAMA_ENABLED:
print("🤖 Ollama AI summarization: ENABLED")
if ollama_client.is_available():
print("✓ Ollama server is reachable")
else:
print("⚠ Warning: Ollama server is not reachable")
else:
print(" Ollama AI summarization: DISABLED")
def get_active_rss_feeds():
"""Get all active RSS feeds from database"""
feeds = []
cursor = rss_feeds_collection.find({'active': True})
for feed in cursor:
feeds.append({
'id': str(feed['_id']),
'name': feed.get('name', ''),
'url': feed.get('url', '')
})
return feeds
def extract_article_content(url, timeout=10):
"""
Extract main article content from a URL with smart detection
Returns: dict with title, content, author, date, and metadata
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Remove script and style elements
for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
script.decompose()
# === EXTRACT TITLE ===
title = extract_title(soup)
# === EXTRACT AUTHOR ===
author = extract_author(soup)
# === EXTRACT PUBLISHED DATE ===
published_date = extract_date(soup)
# === EXTRACT MAIN CONTENT ===
content_text = extract_main_content(soup)
# === EXTRACT META DESCRIPTION ===
meta_desc = soup.find('meta', attrs={'name': 'description'})
if not meta_desc:
meta_desc = soup.find('meta', attrs={'property': 'og:description'})
description = meta_desc.get('content', '') if meta_desc else ''
return {
'title': title,
'author': author,
'content': content_text, # Full content, no limit
'description': description,
'published_date': published_date,
'word_count': len(content_text.split()) if content_text else 0,
'crawled_at': datetime.utcnow()
}
except requests.exceptions.Timeout:
print(f"Timeout crawling {url}")
return None
except requests.exceptions.RequestException as e:
print(f"Error crawling {url}: {e}")
return None
except Exception as e:
print(f"Unexpected error crawling {url}: {e}")
return None
def extract_title(soup):
"""
Extract article title using multiple strategies
"""
# Strategy 1: Look for h1 tag
h1 = soup.find('h1')
if h1:
title = h1.get_text().strip()
if title and len(title) > 10: # Reasonable title length
return title
# Strategy 2: Look for meta og:title
og_title = soup.find('meta', attrs={'property': 'og:title'})
if og_title and og_title.get('content'):
return og_title.get('content').strip()
# Strategy 3: Look for meta twitter:title
twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
if twitter_title and twitter_title.get('content'):
return twitter_title.get('content').strip()
# Strategy 4: Look for title tag (fallback)
title_tag = soup.find('title')
if title_tag:
title = title_tag.get_text().strip()
# Clean up common patterns like "Site Name | Article Title"
if ' | ' in title:
title = title.split(' | ')[0]
elif ' - ' in title:
title = title.split(' - ')[0]
return title
return None
def extract_author(soup):
"""
Extract article author using multiple strategies
"""
# Strategy 1: Look for meta author
meta_author = soup.find('meta', attrs={'name': 'author'})
if meta_author and meta_author.get('content'):
return meta_author.get('content').strip()
# Strategy 2: Look for rel="author"
rel_author = soup.find('a', attrs={'rel': 'author'})
if rel_author:
return rel_author.get_text().strip()
# Strategy 3: Look for common author class names
author_selectors = [
'[class*="author-name"]',
'[class*="author"]',
'[class*="byline"]',
'[class*="writer"]',
'[rel="author"]',
'[itemprop="author"]'
]
for selector in author_selectors:
author_elem = soup.select_one(selector)
if author_elem:
author = author_elem.get_text().strip()
# Clean up common patterns
author = author.replace('By ', '').replace('by ', '').strip()
if author and len(author) < 100: # Reasonable author name length
return author
# Strategy 4: Look for JSON-LD structured data
json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
if json_ld:
try:
import json
data = json.loads(json_ld.string)
if isinstance(data, dict) and data.get('author'):
author_data = data.get('author')
if isinstance(author_data, dict):
return author_data.get('name', '')
elif isinstance(author_data, str):
return author_data
except:
pass
return None
def extract_date(soup):
"""
Extract published date using multiple strategies
"""
# Strategy 1: Look for time tag with datetime attribute
time_tag = soup.find('time')
if time_tag and time_tag.get('datetime'):
return time_tag.get('datetime')
# Strategy 2: Look for meta article:published_time
meta_published = soup.find('meta', attrs={'property': 'article:published_time'})
if meta_published and meta_published.get('content'):
return meta_published.get('content')
# Strategy 3: Look for meta og:published_time
og_published = soup.find('meta', attrs={'property': 'og:published_time'})
if og_published and og_published.get('content'):
return og_published.get('content')
# Strategy 4: Look for common date class names
date_selectors = [
'[class*="publish-date"]',
'[class*="published"]',
'[class*="date"]',
'[class*="timestamp"]',
'[itemprop="datePublished"]'
]
for selector in date_selectors:
date_elem = soup.select_one(selector)
if date_elem:
# Try datetime attribute first
if date_elem.get('datetime'):
return date_elem.get('datetime')
# Otherwise get text
date_text = date_elem.get_text().strip()
if date_text and len(date_text) < 50:
return date_text
# Strategy 5: Look for JSON-LD structured data
json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
if json_ld:
try:
import json
data = json.loads(json_ld.string)
if isinstance(data, dict):
return data.get('datePublished') or data.get('dateCreated')
except:
pass
return None
def extract_main_content(soup):
"""
Extract main article content using multiple strategies
"""
# Strategy 1: Try common article content selectors
content_selectors = [
'article',
'[class*="article-content"]',
'[class*="article-body"]',
'[class*="post-content"]',
'[class*="entry-content"]',
'[class*="content-body"]',
'[class*="story-body"]',
'[itemprop="articleBody"]',
'main'
]
article_content = None
for selector in content_selectors:
element = soup.select_one(selector)
if element:
article_content = element
break
# Fallback: get body
if not article_content:
article_content = soup.find('body')
if not article_content:
return ''
# Extract text from paragraphs
paragraphs = article_content.find_all('p')
# Filter out short paragraphs (likely navigation/ads)
content_paragraphs = []
for p in paragraphs:
text = p.get_text().strip()
# Keep paragraphs with at least 50 characters
if len(text) >= 50:
content_paragraphs.append(text)
content_text = '\n\n'.join(content_paragraphs)
return content_text
def crawl_rss_feed(feed_url, feed_name, max_articles=10):
"""
Crawl articles from an RSS feed
Returns: dict with statistics
"""
print(f"\n📰 Crawling feed: {feed_name}")
print(f" URL: {feed_url}")
try:
# Parse RSS feed
feed = feedparser.parse(feed_url)
if not feed.entries:
print(f" ⚠ No entries found in feed")
return 0
crawled_count = 0
summarized_count = 0
failed_summaries = 0
for entry in feed.entries[:max_articles]:
# Extract article URL using utility function
article_url = extract_article_url(entry)
if not article_url:
print(f" ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}")
continue
# Check if article already exists and has content
existing = articles_collection.find_one({'link': article_url})
if existing and existing.get('content'):
print(f" ⏭ Skipping (already crawled): {entry.get('title', 'No title')[:50]}")
continue
print(f" 🔍 Crawling: {entry.get('title', 'No title')[:50]}...")
# Extract full content
article_data = extract_article_content(article_url)
if article_data and article_data.get('content'):
# Summarize with Ollama if enabled
summary_result = None
if Config.OLLAMA_ENABLED and article_data.get('content'):
print(f" 🤖 Summarizing with AI...")
summary_result = ollama_client.summarize_article(
article_data['content'],
max_words=Config.SUMMARY_MAX_WORDS
)
if summary_result['success']:
print(f" ✓ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)")
summarized_count += 1
else:
print(f" ⚠ Summarization failed: {summary_result['error']}")
failed_summaries += 1
# Prepare document
article_doc = {
'title': article_data.get('title') or entry.get('title', ''),
'author': article_data.get('author'),
'link': article_url,
'content': article_data.get('content', ''), # Full article content
'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
'word_count': article_data.get('word_count', 0),
'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
'source': feed_name,
'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
'crawled_at': article_data.get('crawled_at'),
'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
'created_at': datetime.utcnow()
}
try:
# Upsert: update if exists, insert if not
articles_collection.update_one(
{'link': article_url},
{'$set': article_doc},
upsert=True
)
crawled_count += 1
print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
except DuplicateKeyError:
print(f" ⚠ Duplicate key error")
except Exception as e:
print(f" ✗ Error saving: {e}")
else:
print(f" ✗ Failed to extract content")
# Be nice to servers - add delay
time.sleep(1)
print(f" ✓ Crawled {crawled_count} articles from {feed_name}")
if Config.OLLAMA_ENABLED:
print(f" 🤖 Summarized: {summarized_count}, Failed: {failed_summaries}")
return {
'crawled': crawled_count,
'summarized': summarized_count,
'failed_summaries': failed_summaries
}
except Exception as e:
print(f" ✗ Error processing feed {feed_name}: {e}")
return 0
def crawl_all_feeds(max_articles_per_feed=10):
"""
Crawl all active RSS feeds
Returns: dict with statistics
"""
print("\n" + "="*60)
print("🚀 Starting RSS Feed Crawler")
print("="*60)
start_time = time.time()
feeds = get_active_rss_feeds()
if not feeds:
print("⚠ No active RSS feeds found")
return {
'total_feeds': 0,
'total_articles_crawled': 0,
'duration_seconds': 0
}
print(f"Found {len(feeds)} active feed(s)")
if Config.OLLAMA_ENABLED:
print(f"🤖 AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)")
total_crawled = 0
total_summarized = 0
total_failed = 0
for feed in feeds:
result = crawl_rss_feed(
feed['url'],
feed['name'],
max_articles=max_articles_per_feed
)
total_crawled += result['crawled']
total_summarized += result['summarized']
total_failed += result['failed_summaries']
duration = time.time() - start_time
print("\n" + "="*60)
print(f"✓ Crawling Complete!")
print(f" Total feeds processed: {len(feeds)}")
print(f" Total articles crawled: {total_crawled}")
if Config.OLLAMA_ENABLED:
print(f" Total articles summarized: {total_summarized}")
print(f" Failed summarizations: {total_failed}")
if total_summarized > 0:
success_rate = (total_summarized / (total_summarized + total_failed)) * 100
print(f" Success rate: {success_rate:.1f}%")
print(f" Duration: {duration:.2f} seconds")
if total_crawled > 0:
print(f" Average time per article: {duration/total_crawled:.1f}s")
print("="*60 + "\n")
return {
'total_feeds': len(feeds),
'total_articles_crawled': total_crawled,
'total_summarized': total_summarized,
'failed_summaries': total_failed,
'duration_seconds': round(duration, 2)
}
if __name__ == '__main__':
# Can be run standalone for testing
import sys
max_articles = 10
if len(sys.argv) > 1:
try:
max_articles = int(sys.argv[1])
except ValueError:
print("Usage: python crawler_service.py [max_articles_per_feed]")
sys.exit(1)
crawl_all_feeds(max_articles_per_feed=max_articles)

View File

@@ -0,0 +1,33 @@
version: '3.8'
services:
crawler:
build: .
container_name: news-crawler
environment:
- MONGODB_URI=mongodb://mongodb:27017/
networks:
- munich-news-network
depends_on:
- mongodb
# Run once and exit
restart: "no"
mongodb:
image: mongo:7.0
container_name: munich-news-mongodb
restart: unless-stopped
ports:
- "27017:27017"
volumes:
- mongodb_data:/data/db
networks:
- munich-news-network
volumes:
mongodb_data:
driver: local
networks:
munich-news-network:
driver: bridge

View File

@@ -0,0 +1,290 @@
"""
Ollama client for AI-powered article summarization
"""
import requests
import time
from datetime import datetime
class OllamaClient:
"""Client for communicating with Ollama server for text summarization"""
def __init__(self, base_url, model, api_key=None, enabled=True, timeout=30):
"""
Initialize Ollama client
Args:
base_url: Ollama server URL (e.g., http://localhost:11434)
model: Model name to use (e.g., phi3:latest)
api_key: Optional API key for authentication
enabled: Whether Ollama is enabled
timeout: Request timeout in seconds (default 30)
"""
self.base_url = base_url.rstrip('/')
self.model = model
self.api_key = api_key
self.enabled = enabled
self.timeout = timeout
def summarize_article(self, content, max_words=150):
"""
Summarize article content using Ollama
Args:
content: Full article text
max_words: Maximum words in summary (default 150)
Returns:
{
'summary': str, # AI-generated summary
'summary_word_count': int, # Summary word count
'original_word_count': int, # Original article word count
'success': bool, # Whether summarization succeeded
'error': str or None, # Error message if failed
'duration': float # Time taken in seconds
}
"""
if not self.enabled:
return {
'summary': None,
'summary_word_count': 0,
'original_word_count': 0,
'success': False,
'error': 'Ollama is not enabled',
'duration': 0
}
if not content or len(content.strip()) == 0:
return {
'summary': None,
'summary_word_count': 0,
'original_word_count': 0,
'success': False,
'error': 'Content is empty',
'duration': 0
}
# Calculate original word count
original_word_count = len(content.split())
start_time = time.time()
try:
# Construct prompt
prompt = self._build_summarization_prompt(content, max_words)
# Prepare request
url = f"{self.base_url}/api/generate"
headers = {'Content-Type': 'application/json'}
if self.api_key:
headers['Authorization'] = f'Bearer {self.api_key}'
payload = {
'model': self.model,
'prompt': prompt,
'stream': False,
'options': {
'temperature': 0.7,
'num_predict': 250 # Limit response length
}
}
# Make request
response = requests.post(
url,
json=payload,
headers=headers,
timeout=self.timeout
)
response.raise_for_status()
# Parse response
result = response.json()
summary = result.get('response', '').strip()
if not summary:
return {
'summary': None,
'summary_word_count': 0,
'original_word_count': original_word_count,
'success': False,
'error': 'Ollama returned empty summary',
'duration': time.time() - start_time
}
summary_word_count = len(summary.split())
return {
'summary': summary,
'summary_word_count': summary_word_count,
'original_word_count': original_word_count,
'success': True,
'error': None,
'duration': time.time() - start_time
}
except requests.exceptions.Timeout:
return {
'summary': None,
'summary_word_count': 0,
'original_word_count': original_word_count,
'success': False,
'error': f'Request timed out after {self.timeout} seconds',
'duration': time.time() - start_time
}
except requests.exceptions.ConnectionError:
return {
'summary': None,
'summary_word_count': 0,
'original_word_count': original_word_count,
'success': False,
'error': f'Cannot connect to Ollama server at {self.base_url}',
'duration': time.time() - start_time
}
except requests.exceptions.HTTPError as e:
return {
'summary': None,
'summary_word_count': 0,
'original_word_count': original_word_count,
'success': False,
'error': f'HTTP error: {e.response.status_code} - {e.response.text[:100]}',
'duration': time.time() - start_time
}
except Exception as e:
return {
'summary': None,
'summary_word_count': 0,
'original_word_count': original_word_count,
'success': False,
'error': f'Unexpected error: {str(e)}',
'duration': time.time() - start_time
}
def _build_summarization_prompt(self, content, max_words):
"""Build prompt for article summarization"""
# Truncate content if too long (keep first 5000 words)
words = content.split()
if len(words) > 5000:
content = ' '.join(words[:5000]) + '...'
prompt = f"""Summarize the following article in English in {max_words} words or less. Even if the article is in German or another language, provide the summary in English. Focus on the key points, main message, and important details. Be concise and clear.
Article:
{content}
English Summary (max {max_words} words):"""
return prompt
def is_available(self):
"""
Check if Ollama server is reachable
Returns:
bool: True if server is reachable, False otherwise
"""
if not self.enabled:
return False
try:
url = f"{self.base_url}/api/tags"
headers = {}
if self.api_key:
headers['Authorization'] = f'Bearer {self.api_key}'
response = requests.get(url, headers=headers, timeout=5)
response.raise_for_status()
return True
except:
return False
def test_connection(self):
"""
Test connection and return server info
Returns:
{
'available': bool,
'models': list,
'current_model': str,
'error': str or None
}
"""
if not self.enabled:
return {
'available': False,
'models': [],
'current_model': self.model,
'error': 'Ollama is not enabled'
}
try:
url = f"{self.base_url}/api/tags"
headers = {}
if self.api_key:
headers['Authorization'] = f'Bearer {self.api_key}'
response = requests.get(url, headers=headers, timeout=5)
response.raise_for_status()
result = response.json()
models = [m.get('name', '') for m in result.get('models', [])]
return {
'available': True,
'models': models,
'current_model': self.model,
'error': None
}
except requests.exceptions.ConnectionError:
return {
'available': False,
'models': [],
'current_model': self.model,
'error': f'Cannot connect to Ollama server at {self.base_url}'
}
except Exception as e:
return {
'available': False,
'models': [],
'current_model': self.model,
'error': str(e)
}
if __name__ == '__main__':
# Quick test
import os
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env')
client = OllamaClient(
base_url=os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434'),
model=os.getenv('OLLAMA_MODEL', 'phi3:latest'),
enabled=True
)
print("Testing Ollama connection...")
result = client.test_connection()
print(f"Available: {result['available']}")
print(f"Models: {result['models']}")
print(f"Current model: {result['current_model']}")
if result['available']:
print("\nTesting summarization...")
test_content = """
The new U-Bahn line connecting Munich's city center with the airport opened today.
Mayor Dieter Reiter attended the opening ceremony along with hundreds of residents.
The line will significantly reduce travel time between the airport and downtown Munich.
Construction took five years and cost approximately 2 billion euros.
The new line includes 10 stations and runs every 10 minutes during peak hours.
"""
summary_result = client.summarize_article(test_content, max_words=50)
print(f"Success: {summary_result['success']}")
print(f"Summary: {summary_result['summary']}")
print(f"Original word count: {summary_result['original_word_count']}")
print(f"Summary word count: {summary_result['summary_word_count']}")
print(f"Compression: {summary_result['original_word_count'] / max(summary_result['summary_word_count'], 1):.1f}x")
print(f"Duration: {summary_result['duration']:.2f}s")

View File

@@ -0,0 +1,6 @@
beautifulsoup4==4.12.2
lxml==4.9.3
requests==2.31.0
feedparser==6.0.10
pymongo==4.6.1
python-dotenv==1.0.0

98
news_crawler/rss_utils.py Normal file
View File

@@ -0,0 +1,98 @@
"""
Utility functions for RSS feed processing
"""
def extract_article_url(entry):
"""
Extract article URL from RSS entry.
Different RSS feeds use different fields for the article URL.
Args:
entry: feedparser entry object
Returns:
str: Article URL or None if not found
Examples:
- Most feeds use 'link'
- Some use 'guid' as the URL
- Some use 'id' as the URL
- Some have guid as a dict with 'href'
"""
# Try 'link' first (most common)
if entry.get('link') and entry.get('link', '').startswith('http'):
return entry.get('link')
# Try 'guid' if it's a valid URL
if entry.get('guid'):
guid = entry.get('guid')
# guid can be a string
if isinstance(guid, str) and guid.startswith('http'):
return guid
# or a dict with 'href'
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
return guid.get('href')
# Try 'id' if it's a valid URL
if entry.get('id') and entry.get('id', '').startswith('http'):
return entry.get('id')
# Try 'links' array (some feeds have multiple links)
if entry.get('links'):
for link in entry.get('links', []):
if isinstance(link, dict) and link.get('href', '').startswith('http'):
# Prefer 'alternate' type, but accept any http link
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
return link.get('href')
# If no alternate found, return first http link
for link in entry.get('links', []):
if isinstance(link, dict) and link.get('href', '').startswith('http'):
return link.get('href')
return None
def extract_article_summary(entry):
"""
Extract article summary/description from RSS entry.
Args:
entry: feedparser entry object
Returns:
str: Article summary or empty string
"""
# Try different fields
if entry.get('summary'):
return entry.get('summary', '')
elif entry.get('description'):
return entry.get('description', '')
elif entry.get('content'):
# content is usually a list of dicts
content = entry.get('content', [])
if content and isinstance(content, list) and len(content) > 0:
return content[0].get('value', '')
return ''
def extract_published_date(entry):
"""
Extract published date from RSS entry.
Args:
entry: feedparser entry object
Returns:
str: Published date or empty string
"""
# Try different fields
if entry.get('published'):
return entry.get('published', '')
elif entry.get('updated'):
return entry.get('updated', '')
elif entry.get('created'):
return entry.get('created', '')
return ''

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python
"""
Test script to verify crawler functionality
"""
from crawler_service import extract_article_content, get_active_rss_feeds
import sys
def test_content_extraction():
"""Test content extraction from a sample URL"""
print("Testing content extraction...")
# Test with a simple news site
test_url = "https://www.bbc.com/news"
print(f"Extracting content from: {test_url}")
result = extract_article_content(test_url, timeout=10)
if result:
print("✓ Content extraction successful!")
print(f" Title: {result.get('title', 'N/A')[:50]}...")
print(f" Content length: {len(result.get('content', ''))} chars")
print(f" Word count: {result.get('word_count', 0)}")
return True
else:
print("✗ Content extraction failed")
return False
def test_database_connection():
"""Test MongoDB connection"""
print("\nTesting database connection...")
try:
feeds = get_active_rss_feeds()
print(f"✓ Database connection successful!")
print(f" Found {len(feeds)} active RSS feed(s)")
if feeds:
print("\n Active feeds:")
for feed in feeds:
print(f" - {feed['name']}: {feed['url']}")
else:
print("\n ⚠ No active feeds found. Add feeds via the backend API:")
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
print(" -H 'Content-Type: application/json' \\")
print(" -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'")
return True
except Exception as e:
print(f"✗ Database connection failed: {e}")
return False
def main():
print("="*60)
print("News Crawler - Test Suite")
print("="*60 + "\n")
# Test database connection
db_ok = test_database_connection()
# Test content extraction
extract_ok = test_content_extraction()
print("\n" + "="*60)
print("Test Results:")
print(f" Database Connection: {'✓ PASS' if db_ok else '✗ FAIL'}")
print(f" Content Extraction: {'✓ PASS' if extract_ok else '✗ FAIL'}")
print("="*60 + "\n")
if db_ok and extract_ok:
print("✓ All tests passed! Crawler is ready to use.")
print("\nRun the crawler with:")
print(" python crawler_service.py")
return 0
else:
print("✗ Some tests failed. Please check the errors above.")
return 1
if __name__ == '__main__':
sys.exit(main())

129
news_crawler/test_ollama.py Normal file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python
"""
Test script for Ollama integration
Tests connection, configuration, and summarization
"""
from config import Config
from ollama_client import OllamaClient
print("\n" + "="*70)
print("Ollama Integration Test")
print("="*70)
# Print configuration
Config.print_config()
# Validate configuration
issues = Config.validate()
if issues:
print("⚠ Configuration Issues:")
for issue in issues:
print(f" - {issue}")
print()
# Initialize client
client = OllamaClient(
base_url=Config.OLLAMA_BASE_URL,
model=Config.OLLAMA_MODEL,
api_key=Config.OLLAMA_API_KEY,
enabled=Config.OLLAMA_ENABLED,
timeout=Config.OLLAMA_TIMEOUT
)
# Test 1: Check if Ollama is enabled
print("Test 1: Configuration Check")
print(f" Ollama Enabled: {Config.OLLAMA_ENABLED}")
if not Config.OLLAMA_ENABLED:
print(" ⚠ Ollama is disabled. Set OLLAMA_ENABLED=true in .env to enable.")
print("\n" + "="*70)
exit(0)
# Test 2: Test connection
print("\nTest 2: Connection Test")
conn_result = client.test_connection()
print(f" Available: {conn_result['available']}")
print(f" Current Model: {conn_result['current_model']}")
if conn_result['available']:
print(f" ✓ Connected to Ollama server")
if conn_result['models']:
print(f" Available models: {', '.join(conn_result['models'][:5])}")
if conn_result['current_model'] not in conn_result['models']:
print(f" ⚠ Warning: Model '{conn_result['current_model']}' not found in available models")
else:
print(f" ✗ Connection failed: {conn_result['error']}")
print("\n" + "="*70)
exit(1)
# Test 3: Test summarization with sample article
print("\nTest 3: Summarization Test")
print(" Testing with sample German article...")
sample_article = """
Die neue U-Bahn-Linie, die das Münchner Stadtzentrum mit dem Flughafen verbindet, wurde heute eröffnet.
Oberbürgermeister Dieter Reiter nahm zusammen mit hunderten Anwohnern an der Eröffnungszeremonie teil.
Die Linie wird die Reisezeit zwischen dem Flughafen und der Münchner Innenstadt erheblich verkürzen.
Der Bau dauerte fünf Jahre und kostete etwa 2 Milliarden Euro. Die neue Linie umfasst 10 Stationen
und verkehrt während der Hauptverkehrszeiten alle 10 Minuten. Experten erwarten, dass die neue Verbindung
den Verkehr in der Stadt deutlich entlasten wird. Die Münchner Verkehrsgesellschaft rechnet mit täglich
über 50.000 Fahrgästen auf der neuen Strecke.
"""
result = client.summarize_article(sample_article, max_words=Config.SUMMARY_MAX_WORDS)
print(f"\n Success: {result['success']}")
if result['success']:
print(f" ✓ Summarization successful!")
print(f"\n Original word count: {result['original_word_count']}")
print(f" Summary word count: {result['summary_word_count']}")
print(f" Compression ratio: {result['original_word_count'] / max(result['summary_word_count'], 1):.1f}x")
print(f" Duration: {result['duration']:.2f}s")
print(f"\n Summary (English):")
print(f" {'-'*70}")
print(f" {result['summary']}")
print(f" {'-'*70}")
else:
print(f" ✗ Summarization failed: {result['error']}")
# Test 4: Test with English article
print("\nTest 4: English Article Test")
print(" Testing with English article...")
english_article = """
The city council approved a new bike lane network spanning 50 kilometers across Munich.
The project aims to promote sustainable transportation and reduce car traffic in the city center.
Construction will begin next month and is expected to be completed within two years.
The bike lanes will connect major residential areas with business districts and public transport hubs.
Environmental groups have praised the initiative as a significant step toward carbon neutrality.
"""
result2 = client.summarize_article(english_article, max_words=50)
print(f"\n Success: {result2['success']}")
if result2['success']:
print(f" ✓ Summarization successful!")
print(f" Original: {result2['original_word_count']} words → Summary: {result2['summary_word_count']} words")
print(f" Duration: {result2['duration']:.2f}s")
print(f"\n Summary:")
print(f" {result2['summary']}")
else:
print(f" ✗ Summarization failed: {result2['error']}")
# Summary
print("\n" + "="*70)
print("Test Summary")
print("="*70)
print(f"✓ Configuration: Valid")
print(f"✓ Connection: {'Success' if conn_result['available'] else 'Failed'}")
print(f"✓ German→English: {'Success' if result['success'] else 'Failed'}")
print(f"✓ English→English: {'Success' if result2['success'] else 'Failed'}")
print("="*70)
if result['success'] and result2['success']:
print("\n🎉 All tests passed! Ollama integration is working correctly.")
print("\nYou can now run the crawler with AI summarization:")
print(" python crawler_service.py 5")
else:
print("\n⚠ Some tests failed. Check the errors above.")
print()

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction
Tests actual feeds from the database
"""
import feedparser
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
# Load environment variables
load_dotenv(dotenv_path='../.env')
# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'
client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']
def test_feed(feed_name, feed_url):
"""Test a single RSS feed"""
print(f"\n{'='*70}")
print(f"Testing: {feed_name}")
print(f"URL: {feed_url}")
print('='*70)
try:
# Parse the feed
print("Fetching RSS feed...")
feed = feedparser.parse(feed_url)
if not feed.entries:
print("❌ No entries found in feed")
return False
print(f"✓ Found {len(feed.entries)} entries\n")
# Test first 5 entries
success_count = 0
fail_count = 0
for i, entry in enumerate(feed.entries[:5], 1):
print(f"\n--- Entry {i} ---")
print(f"Title: {entry.get('title', 'No title')[:60]}")
# Test URL extraction
article_url = extract_article_url(entry)
if article_url:
print(f"✓ URL: {article_url}")
success_count += 1
else:
print(f"❌ No valid URL found")
print(f" Available fields: {list(entry.keys())}")
print(f" link: {entry.get('link', 'N/A')}")
print(f" guid: {entry.get('guid', 'N/A')}")
print(f" id: {entry.get('id', 'N/A')}")
fail_count += 1
# Test summary extraction
summary = extract_article_summary(entry)
if summary:
print(f"✓ Summary: {summary[:80]}...")
else:
print(f"⚠ No summary found")
# Test date extraction
pub_date = extract_published_date(entry)
if pub_date:
print(f"✓ Published: {pub_date}")
else:
print(f"⚠ No published date found")
print(f"\n{'='*70}")
print(f"Results for {feed_name}:")
print(f" ✓ Success: {success_count}/5")
print(f" ❌ Failed: {fail_count}/5")
print('='*70)
return fail_count == 0
except Exception as e:
print(f"❌ Error testing feed: {e}")
return False
def main():
print("\n" + "="*70)
print("RSS Feed URL Extraction Test")
print("="*70)
# Get all RSS feeds from database
print("\nFetching RSS feeds from database...")
feeds = list(rss_feeds_collection.find())
if not feeds:
print("❌ No RSS feeds found in database")
print("\nAdd feeds using:")
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
print(" -H 'Content-Type: application/json' \\")
print(" -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
return 1
print(f"✓ Found {len(feeds)} feed(s) in database\n")
# Test each feed
results = {}
for feed in feeds:
feed_name = feed.get('name', 'Unknown')
feed_url = feed.get('url', '')
active = feed.get('active', True)
if not active:
print(f"\n⏭ Skipping inactive feed: {feed_name}")
continue
if not feed_url:
print(f"\n❌ Feed '{feed_name}' has no URL")
results[feed_name] = False
continue
results[feed_name] = test_feed(feed_name, feed_url)
# Summary
print("\n" + "="*70)
print("FINAL RESULTS")
print("="*70)
for feed_name, success in results.items():
status = "✓ PASS" if success else "❌ FAIL"
print(f"{status} - {feed_name}")
total = len(results)
passed = sum(1 for s in results.values() if s)
print(f"\nTotal: {passed}/{total} feeds passed")
print("="*70 + "\n")
if passed == total:
print("✓ All feeds are working correctly!")
print("\nYou can now run the crawler:")
print(" python crawler_service.py")
return 0
else:
print("⚠ Some feeds have issues. Check the output above.")
return 1
if __name__ == '__main__':
import sys
sys.exit(main())

28
news_sender/.gitignore vendored Normal file
View File

@@ -0,0 +1,28 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
.venv
# Environment variables
.env
.env.local
# Generated files
newsletter_preview.html
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db

303
news_sender/README.md Normal file
View File

@@ -0,0 +1,303 @@
# News Sender Microservice
Standalone service for sending Munich News Daily newsletters to subscribers.
## Features
- 📧 Sends beautiful HTML newsletters
- 🤖 Uses AI-generated article summaries
- 📊 Tracks sending statistics
- 🧪 Test mode for development
- 📝 Preview generation
- 🔄 Fetches data from shared MongoDB
## Installation
```bash
cd news_sender
pip install -r requirements.txt
```
## Configuration
The service uses the same `.env` file as the backend (`../backend/.env`):
```env
# MongoDB
MONGODB_URI=mongodb://localhost:27017/
# Email (Gmail example)
SMTP_SERVER=smtp.gmail.com
SMTP_PORT=587
EMAIL_USER=your-email@gmail.com
EMAIL_PASSWORD=your-app-password
# Newsletter Settings (optional)
NEWSLETTER_MAX_ARTICLES=10
WEBSITE_URL=http://localhost:3000
```
**Gmail Setup:**
1. Enable 2-factor authentication
2. Generate an App Password: https://support.google.com/accounts/answer/185833
3. Use the App Password (not your regular password)
## Usage
### 1. Preview Newsletter
Generate HTML preview without sending:
```bash
python sender_service.py preview
```
This creates `newsletter_preview.html` - open it in your browser to see how the newsletter looks.
### 2. Send Test Email
Send to a single email address for testing:
```bash
python sender_service.py test your-email@example.com
```
### 3. Send to All Subscribers
Send newsletter to all active subscribers:
```bash
# Send with default article count (10)
python sender_service.py send
# Send with custom article count
python sender_service.py send 15
```
### 4. Use as Python Module
```python
from sender_service import send_newsletter, preview_newsletter
# Send newsletter
result = send_newsletter(max_articles=10)
print(f"Sent to {result['sent_count']} subscribers")
# Generate preview
html = preview_newsletter(max_articles=5)
```
## How It Works
```
┌─────────────────────────────────────────────────────────┐
│ 1. Fetch Articles from MongoDB │
│ - Get latest articles with AI summaries │
│ - Sort by creation date (newest first) │
└─────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ 2. Fetch Active Subscribers │
│ - Get all subscribers with status='active' │
└─────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ 3. Render Newsletter HTML │
│ - Load newsletter_template.html │
│ - Populate with articles and metadata │
│ - Generate beautiful HTML email │
└─────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ 4. Send Emails │
│ - Connect to SMTP server │
│ - Send to each subscriber │
│ - Track success/failure │
└─────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ 5. Report Statistics │
│ - Total sent │
│ - Failed sends │
│ - Error details │
└─────────────────────────────────────────────────────────┘
```
## Output Example
```
======================================================================
📧 Munich News Daily - Newsletter Sender
======================================================================
Fetching latest 10 articles with AI summaries...
✓ Found 10 articles
Fetching active subscribers...
✓ Found 150 active subscriber(s)
Rendering newsletter HTML...
✓ Newsletter rendered
Sending newsletter: 'Munich News Daily - November 10, 2024'
----------------------------------------------------------------------
[1/150] Sending to user1@example.com... ✓
[2/150] Sending to user2@example.com... ✓
[3/150] Sending to user3@example.com... ✓
...
======================================================================
📊 Sending Complete
======================================================================
✓ Successfully sent: 148
✗ Failed: 2
📰 Articles included: 10
======================================================================
```
## Scheduling
### Using Cron (Linux/Mac)
Send newsletter daily at 8 AM:
```bash
# Edit crontab
crontab -e
# Add this line
0 8 * * * cd /path/to/news_sender && /path/to/venv/bin/python sender_service.py send
```
### Using systemd Timer (Linux)
Create `/etc/systemd/system/news-sender.service`:
```ini
[Unit]
Description=Munich News Sender
[Service]
Type=oneshot
WorkingDirectory=/path/to/news_sender
ExecStart=/path/to/venv/bin/python sender_service.py send
User=your-user
```
Create `/etc/systemd/system/news-sender.timer`:
```ini
[Unit]
Description=Send Munich News Daily at 8 AM
[Timer]
OnCalendar=daily
OnCalendar=*-*-* 08:00:00
[Install]
WantedBy=timers.target
```
Enable and start:
```bash
sudo systemctl enable news-sender.timer
sudo systemctl start news-sender.timer
```
### Using Docker
Create `Dockerfile`:
```dockerfile
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY sender_service.py newsletter_template.html ./
CMD ["python", "sender_service.py", "send"]
```
Build and run:
```bash
docker build -t news-sender .
docker run --env-file ../backend/.env news-sender
```
## Troubleshooting
### "Email credentials not configured"
- Check that `EMAIL_USER` and `EMAIL_PASSWORD` are set in `.env`
- For Gmail, use an App Password, not your regular password
### "No articles with summaries found"
- Run the crawler first: `cd ../news_crawler && python crawler_service.py 10`
- Make sure Ollama is enabled and working
- Check MongoDB has articles with `summary` field
### "No active subscribers found"
- Add subscribers via the backend API
- Check subscriber status is 'active' in MongoDB
### SMTP Connection Errors
- Verify SMTP server and port are correct
- Check firewall isn't blocking SMTP port
- For Gmail, ensure "Less secure app access" is enabled or use App Password
### Emails Going to Spam
- Set up SPF, DKIM, and DMARC records for your domain
- Use a verified email address
- Avoid spam trigger words in subject/content
- Include unsubscribe link (already included in template)
## Architecture
This is a standalone microservice that:
- Runs independently of the backend
- Shares the same MongoDB database
- Can be deployed separately
- Can be scheduled independently
- Has no dependencies on backend code
## Integration with Other Services
```
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ Backend │ │ Crawler │ │ Sender │
│ (Flask) │ │ (Scraper) │ │ (Email) │
└──────┬───────┘ └──────┬───────┘ └──────┬───────┘
│ │ │
│ │ │
└────────────────────┴─────────────────────┘
┌───────▼────────┐
│ MongoDB │
│ (Shared DB) │
└────────────────┘
```
## Next Steps
1. **Test the newsletter:**
```bash
python sender_service.py test your-email@example.com
```
2. **Schedule daily sending:**
- Set up cron job or systemd timer
- Choose appropriate time (e.g., 8 AM)
3. **Monitor sending:**
- Check logs for errors
- Track open rates (requires email tracking service)
- Monitor spam complaints
4. **Optimize:**
- Add email tracking pixels
- A/B test subject lines
- Personalize content per subscriber

View File

@@ -0,0 +1,162 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Munich News Daily</title>
<!--[if mso]>
<style type="text/css">
body, table, td {font-family: Arial, Helvetica, sans-serif !important;}
</style>
<![endif]-->
</head>
<body style="margin: 0; padding: 0; background-color: #f4f4f4; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;">
<!-- Wrapper Table -->
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f4f4f4;">
<tr>
<td align="center" style="padding: 20px 0;">
<!-- Main Container -->
<table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="background-color: #ffffff; max-width: 600px;">
<!-- Header -->
<tr>
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
<h1 style="margin: 0 0 8px 0; font-size: 28px; font-weight: 700; color: #ffffff; letter-spacing: -0.5px;">
Munich News Daily
</h1>
<p style="margin: 0; font-size: 14px; color: #999999; letter-spacing: 0.5px;">
{{ date }}
</p>
</td>
</tr>
<!-- Greeting -->
<tr>
<td style="padding: 30px 40px 20px 40px;">
<p style="margin: 0; font-size: 16px; line-height: 1.5; color: #333333;">
Good morning ☀️
</p>
<p style="margin: 15px 0 0 0; font-size: 15px; line-height: 1.6; color: #666666;">
Here's what's happening in Munich today. We've summarized {{ article_count }} stories using AI so you can stay informed in under 5 minutes.
</p>
</td>
</tr>
<!-- Divider -->
<tr>
<td style="padding: 0 40px;">
<div style="height: 1px; background-color: #e0e0e0;"></div>
</td>
</tr>
<!-- Articles -->
{% for article in articles %}
<tr>
<td style="padding: 25px 40px;">
<!-- Article Number Badge -->
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
<tr>
<td>
<span style="display: inline-block; background-color: #000000; color: #ffffff; width: 24px; height: 24px; line-height: 24px; text-align: center; border-radius: 50%; font-size: 12px; font-weight: 600;">
{{ loop.index }}
</span>
</td>
</tr>
</table>
<!-- Article Title -->
<h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
{{ article.title }}
</h2>
<!-- Article Meta -->
<p style="margin: 0 0 12px 0; font-size: 13px; color: #999999;">
<span style="color: #000000; font-weight: 600;">{{ article.source }}</span>
{% if article.author %}
<span> • {{ article.author }}</span>
{% endif %}
</p>
<!-- Article Summary -->
<p style="margin: 0 0 15px 0; font-size: 15px; line-height: 1.6; color: #333333;">
{{ article.summary }}
</p>
<!-- Read More Link -->
<a href="{{ article.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 14px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px;">
Read more →
</a>
</td>
</tr>
<!-- Article Divider -->
{% if not loop.last %}
<tr>
<td style="padding: 0 40px;">
<div style="height: 1px; background-color: #f0f0f0;"></div>
</td>
</tr>
{% endif %}
{% endfor %}
<!-- Bottom Divider -->
<tr>
<td style="padding: 25px 40px 0 40px;">
<div style="height: 1px; background-color: #e0e0e0;"></div>
</td>
</tr>
<!-- Summary Box -->
<tr>
<td style="padding: 30px 40px;">
<table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f8f8; border-radius: 8px;">
<tr>
<td style="padding: 25px; text-align: center;">
<p style="margin: 0 0 8px 0; font-size: 13px; color: #666666; text-transform: uppercase; letter-spacing: 1px; font-weight: 600;">
Today's Digest
</p>
<p style="margin: 0; font-size: 36px; font-weight: 700; color: #000000;">
{{ article_count }}
</p>
<p style="margin: 8px 0 0 0; font-size: 14px; color: #666666;">
stories • AI-summarized • 5 min read
</p>
</td>
</tr>
</table>
</td>
</tr>
<!-- Footer -->
<tr>
<td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
<p style="margin: 0 0 15px 0; font-size: 14px; color: #ffffff; font-weight: 600;">
Munich News Daily
</p>
<p style="margin: 0 0 20px 0; font-size: 13px; color: #999999; line-height: 1.5;">
AI-powered news summaries for busy people.<br>
Delivered daily to your inbox.
</p>
<!-- Footer Links -->
<p style="margin: 0; font-size: 12px; color: #666666;">
<a href="{{ website_link }}" style="color: #999999; text-decoration: none;">Visit Website</a>
<span style="color: #444444;"></span>
<a href="{{ unsubscribe_link }}" style="color: #999999; text-decoration: none;">Unsubscribe</a>
</p>
<p style="margin: 20px 0 0 0; font-size: 11px; color: #666666;">
© {{ year }} Munich News Daily. All rights reserved.
</p>
</td>
</tr>
</table>
<!-- End Main Container -->
</td>
</tr>
</table>
<!-- End Wrapper Table -->
</body>
</html>

View File

@@ -0,0 +1,3 @@
pymongo==4.6.1
python-dotenv==1.0.0
Jinja2==3.1.2

View File

@@ -0,0 +1,313 @@
#!/usr/bin/env python
"""
News Sender Service - Standalone microservice for sending newsletters
Fetches articles from MongoDB and sends to subscribers via email
"""
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime
from pathlib import Path
from jinja2 import Template
from pymongo import MongoClient
import os
from dotenv import load_dotenv
# Load environment variables from backend/.env
backend_dir = Path(__file__).parent.parent / 'backend'
env_path = backend_dir / '.env'
if env_path.exists():
load_dotenv(dotenv_path=env_path)
print(f"✓ Loaded configuration from: {env_path}")
else:
print(f"⚠ Warning: .env file not found at {env_path}")
class Config:
"""Configuration for news sender"""
# MongoDB
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'
# Email
SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
EMAIL_USER = os.getenv('EMAIL_USER', '')
EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')
# Newsletter
MAX_ARTICLES = int(os.getenv('NEWSLETTER_MAX_ARTICLES', '10'))
WEBSITE_URL = os.getenv('WEBSITE_URL', 'http://localhost:3000')
# MongoDB connection
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
subscribers_collection = db['subscribers']
def get_latest_articles(max_articles=10):
"""
Get latest articles with AI summaries from database
Returns:
list: Articles with summaries
"""
cursor = articles_collection.find(
{'summary': {'$exists': True, '$ne': None}}
).sort('created_at', -1).limit(max_articles)
articles = []
for doc in cursor:
articles.append({
'title': doc.get('title', ''),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'published_at': doc.get('published_at', '')
})
return articles
def get_active_subscribers():
"""
Get all active subscribers from database
Returns:
list: Email addresses of active subscribers
"""
cursor = subscribers_collection.find({'status': 'active'})
return [doc['email'] for doc in cursor]
def render_newsletter_html(articles):
"""
Render newsletter HTML from template
Args:
articles: List of article dictionaries
Returns:
str: Rendered HTML content
"""
# Load template
template_path = Path(__file__).parent / 'newsletter_template.html'
with open(template_path, 'r', encoding='utf-8') as f:
template_content = f.read()
template = Template(template_content)
# Prepare template data
now = datetime.now()
template_data = {
'date': now.strftime('%A, %B %d, %Y'),
'year': now.year,
'article_count': len(articles),
'articles': articles,
'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe',
'website_link': Config.WEBSITE_URL
}
# Render HTML
return template.render(**template_data)
def send_email(to_email, subject, html_content):
"""
Send email to a single recipient
Args:
to_email: Recipient email address
subject: Email subject
html_content: HTML content of email
Returns:
tuple: (success: bool, error: str or None)
"""
try:
msg = MIMEMultipart('alternative')
msg['Subject'] = subject
msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
msg['To'] = to_email
msg['Date'] = datetime.now().strftime('%a, %d %b %Y %H:%M:%S %z')
msg['Message-ID'] = f'<{datetime.now().timestamp()}.{to_email}@dongho.kim>'
msg['X-Mailer'] = 'Munich News Daily Sender'
# Add plain text version as fallback
plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
# Add HTML version
msg.attach(MIMEText(html_content, 'html', 'utf-8'))
server = smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT)
server.starttls()
server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
server.send_message(msg)
server.quit()
return True, None
except Exception as e:
return False, str(e)
def send_newsletter(max_articles=None, test_email=None):
"""
Send newsletter to all active subscribers
Args:
max_articles: Maximum number of articles to include (default from config)
test_email: If provided, send only to this email (for testing)
Returns:
dict: Statistics about sending
"""
print("\n" + "="*70)
print("📧 Munich News Daily - Newsletter Sender")
print("="*70)
# Validate email configuration
if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
print("❌ Email credentials not configured")
print(" Set EMAIL_USER and EMAIL_PASSWORD in .env file")
return {
'success': False,
'error': 'Email credentials not configured'
}
# Get articles
max_articles = max_articles or Config.MAX_ARTICLES
print(f"\nFetching latest {max_articles} articles with AI summaries...")
articles = get_latest_articles(max_articles)
if not articles:
print("❌ No articles with summaries found")
print(" Run the crawler with Ollama enabled first")
return {
'success': False,
'error': 'No articles with summaries'
}
print(f"✓ Found {len(articles)} articles")
# Get subscribers
if test_email:
subscribers = [test_email]
print(f"\n🧪 Test mode: Sending to {test_email} only")
else:
print("\nFetching active subscribers...")
subscribers = get_active_subscribers()
print(f"✓ Found {len(subscribers)} active subscriber(s)")
if not subscribers:
print("❌ No active subscribers found")
return {
'success': False,
'error': 'No active subscribers'
}
# Render newsletter
print("\nRendering newsletter HTML...")
html_content = render_newsletter_html(articles)
print("✓ Newsletter rendered")
# Send to subscribers
subject = f"Munich News Daily - {datetime.now().strftime('%B %d, %Y')}"
print(f"\nSending newsletter: '{subject}'")
print("-" * 70)
sent_count = 0
failed_count = 0
errors = []
for i, email in enumerate(subscribers, 1):
print(f"[{i}/{len(subscribers)}] Sending to {email}...", end=' ')
success, error = send_email(email, subject, html_content)
if success:
print("")
sent_count += 1
else:
print(f"{error}")
failed_count += 1
errors.append({'email': email, 'error': error})
# Summary
print("\n" + "="*70)
print("📊 Sending Complete")
print("="*70)
print(f"✓ Successfully sent: {sent_count}")
print(f"✗ Failed: {failed_count}")
print(f"📰 Articles included: {len(articles)}")
print("="*70 + "\n")
return {
'success': True,
'sent_count': sent_count,
'failed_count': failed_count,
'total_subscribers': len(subscribers),
'article_count': len(articles),
'errors': errors
}
def preview_newsletter(max_articles=None):
"""
Generate newsletter HTML for preview (doesn't send)
Args:
max_articles: Maximum number of articles to include
Returns:
str: HTML content
"""
max_articles = max_articles or Config.MAX_ARTICLES
articles = get_latest_articles(max_articles)
if not articles:
return "<h1>No articles with summaries found</h1><p>Run the crawler with Ollama enabled first.</p>"
return render_newsletter_html(articles)
if __name__ == '__main__':
import sys
# Parse command line arguments
if len(sys.argv) > 1:
command = sys.argv[1]
if command == 'preview':
# Generate preview HTML
html = preview_newsletter()
output_file = 'newsletter_preview.html'
with open(output_file, 'w', encoding='utf-8') as f:
f.write(html)
print(f"✓ Preview saved to {output_file}")
print(f" Open it in your browser to see the newsletter")
elif command == 'test':
# Send test email
if len(sys.argv) < 3:
print("Usage: python sender_service.py test <email>")
sys.exit(1)
test_email = sys.argv[2]
send_newsletter(test_email=test_email)
elif command == 'send':
# Send to all subscribers
max_articles = int(sys.argv[2]) if len(sys.argv) > 2 else None
send_newsletter(max_articles=max_articles)
else:
print("Unknown command. Usage:")
print(" python sender_service.py preview - Generate HTML preview")
print(" python sender_service.py test <email> - Send test email")
print(" python sender_service.py send [count] - Send to all subscribers")
else:
# Default: send newsletter
send_newsletter()

96
test_feeds_quick.py Normal file
View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python
"""
Quick test script - Run from project root with backend venv activated
Usage:
cd /path/to/munich-news
source backend/venv/bin/activate # or backend/venv/Scripts/activate on Windows
python test_feeds_quick.py
"""
import sys
sys.path.insert(0, 'backend')
from pymongo import MongoClient
from config import Config
import feedparser
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
print("="*80)
print("RSS Feed Test - Checking Database Feeds")
print("="*80)
# Connect to database
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
# Get RSS feeds
feeds = list(db['rss_feeds'].find())
if not feeds:
print("\n❌ No RSS feeds in database!")
print("\nAdd a feed first:")
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
print(" -H 'Content-Type: application/json' \\")
print(" -d '{\"name\": \"Test Feed\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
sys.exit(1)
print(f"\n✓ Found {len(feeds)} feed(s) in database\n")
# Test each feed
for feed_doc in feeds:
name = feed_doc.get('name', 'Unknown')
url = feed_doc.get('url', '')
active = feed_doc.get('active', True)
print(f"\n{'='*80}")
print(f"Feed: {name}")
print(f"URL: {url}")
print(f"Active: {active}")
print('='*80)
if not active:
print("⏭ Skipping (inactive)")
continue
try:
# Parse RSS
print("Fetching RSS feed...")
feed = feedparser.parse(url)
if not feed.entries:
print("❌ No entries found")
continue
print(f"✓ Found {len(feed.entries)} entries\n")
# Test first 3 entries
for i, entry in enumerate(feed.entries[:3], 1):
print(f"\n--- Entry {i} ---")
title = entry.get('title', 'No title')
print(f"Title: {title[:70]}")
# Test URL extraction
article_url = extract_article_url(entry)
if article_url:
print(f"✓ URL extracted: {article_url}")
else:
print(f"❌ Could not extract URL")
print(f" Available fields: {list(entry.keys())[:10]}")
print(f" link: {entry.get('link', 'N/A')}")
print(f" guid: {entry.get('guid', 'N/A')}")
# Test summary
summary = extract_article_summary(entry)
if summary:
print(f"✓ Summary: {summary[:80]}...")
# Test date
pub_date = extract_published_date(entry)
if pub_date:
print(f"✓ Date: {pub_date}")
except Exception as e:
print(f"❌ Error: {e}")
print("\n" + "="*80)
print("Test complete!")
print("="*80)