update

2025-11-10 19:13:33 +01:00
commit ac5738c29d
64 changed files with 9445 additions and 0 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,40 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 env/
 venv/
 ENV/
 .venv
 # Node
 node_modules/
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 # Environment variables
 .env
 .env.local
 # IDE
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # OS
 .DS_Store
 Thumbs.db
 # Git
 .git/
 .gitignore
 # Documentation
 *.md
 !README.md
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,187 @@
 # ===================================
 # Python
 # ===================================
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 # Virtual Environments
 env/
 venv/
 ENV/
 .venv
 .virtualenv
 backend/env/
 backend/venv/
 news_crawler/env/
 news_crawler/venv/
 news_sender/env/
 news_sender/venv/
 # Python Distribution / Packaging
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 *.manifest
 *.spec
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 .hypothesis/
 .pytest_cache/
 # Jupyter Notebook
 .ipynb_checkpoints
 # pyenv
 .python-version
 # ===================================
 # Node.js
 # ===================================
 node_modules/
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 .npm
 .yarn-integrity
 package-lock.json
 yarn.lock
 .pnp
 .pnp.js
 # ===================================
 # Environment Variables & Secrets
 # ===================================
 .env
 .env.local
 .env.development.local
 .env.test.local
 .env.production.local
 *.env
 # ===================================
 # Database
 # ===================================
 *.db
 *.sqlite
 *.sqlite3
 *.db-journal
 # MongoDB
 data/
 mongodb_data/
 # ===================================
 # IDE & Editors
 # ===================================
 # VSCode
 .vscode/
 .vscode-test/
 *.code-workspace
 # PyCharm / IntelliJ
 .idea/
 *.iml
 *.iws
 *.ipr
 out/
 # Sublime Text
 *.sublime-project
 *.sublime-workspace
 # Vim
 *.swp
 *.swo
 *~
 .vim/
 # Emacs
 *~
 \#*\#
 .\#*
 # ===================================
 # OS Files
 # ===================================
 # macOS
 .DS_Store
 .AppleDouble
 .LSOverride
 ._*
 .DocumentRevisions-V100
 .fseventsd
 .Spotlight-V100
 .TemporaryItems
 .Trashes
 .VolumeIcon.icns
 .com.apple.timemachine.donotpresent
 # Windows
 Thumbs.db
 Thumbs.db:encryptable
 ehthumbs.db
 ehthumbs_vista.db
 *.stackdump
 [Dd]esktop.ini
 $RECYCLE.BIN/
 *.cab
 *.msi
 *.msix
 *.msm
 *.msp
 *.lnk
 # Linux
 .directory
 .Trash-*
 # ===================================
 # Project Specific
 # ===================================
 # Generated files
 newsletter_preview.html
 *.log
 # Temporary files
 *.tmp
 *.temp
 *.bak
 *.backup
 # Docker volumes
 mongodb_data/
 ollama_data/
 # Spec artifacts (optional - uncomment if you don't want to track specs)
 # .kiro/specs/
 # Test outputs
 test-results/
 coverage/
--- a/.kiro/specs/ai-article-summarization/design.md
+++ b/.kiro/specs/ai-article-summarization/design.md
@@ -0,0 +1,487 @@
 # Design Document - AI Article Summarization
 ## Overview
 This design integrates Ollama AI into the news crawler workflow to automatically generate concise summaries of articles. The system will extract full article content, send it to Ollama for summarization, and store both the original content and the AI-generated summary in MongoDB.
 ## Architecture
 ### High-Level Flow
 ```
 RSS Feed → Extract Content → Summarize with Ollama → Store in MongoDB
                ↓                      ↓                    ↓
         Full Article Text    AI Summary (≤150 words)   Both Stored
 ```
 ### Component Diagram
 ```
 ┌─────────────────────────────────────────────────────────────┐
 │                    News Crawler Service                      │
 │                                                              │
 │  ┌────────────────┐      ┌──────────────────┐             │
 │  │ RSS Parser     │──────→│ Content Extractor│             │
 │  └────────────────┘      └──────────────────┘             │
 │                                   │                         │
 │                                   ↓                         │
 │                          ┌──────────────────┐              │
 │                          │ Ollama Client    │              │
 │                          │ (New Component)  │              │
 │                          └──────────────────┘              │
 │                                   │                         │
 │                                   ↓                         │
 │                          ┌──────────────────┐              │
 │                          │ Database Writer  │              │
 │                          └──────────────────┘              │
 └─────────────────────────────────────────────────────────────┘
                                   │
                                   ↓
                          ┌──────────────────┐
                          │  Ollama Server   │
                          │  (External)      │
                          └──────────────────┘
                                   │
                                   ↓
                          ┌──────────────────┐
                          │    MongoDB       │
                          └──────────────────┘
 ```
 ## Components and Interfaces
 ### 1. Ollama Client Module
 **File:** `news_crawler/ollama_client.py`
 **Purpose:** Handle communication with Ollama server for summarization
 **Interface:**
 ```python
 class OllamaClient:
    def __init__(self, base_url, model, api_key=None, enabled=True):
        """Initialize Ollama client with configuration"""
    def summarize_article(self, content: str, max_words: int = 150) -> dict:
        """
        Summarize article content using Ollama
        Args:
            content: Full article text
            max_words: Maximum words in summary (default 150)
        Returns:
            {
                'summary': str,           # AI-generated summary
                'word_count': int,        # Summary word count
                'success': bool,          # Whether summarization succeeded
                'error': str or None,     # Error message if failed
                'duration': float         # Time taken in seconds
            }
        """
    def is_available(self) -> bool:
        """Check if Ollama server is reachable"""
    def test_connection(self) -> dict:
        """Test connection and return server info"""
 ```
 **Key Methods:**
 1. **summarize_article()**
   - Constructs prompt for Ollama
   - Sends HTTP POST request
   - Handles timeouts and errors
   - Validates response
   - Returns structured result
 2. **is_available()**
   - Quick health check
   - Returns True/False
   - Used before attempting summarization
 3. **test_connection()**
   - Detailed connection test
   - Returns server info and model list
   - Used for diagnostics
 ### 2. Enhanced Crawler Service
 **File:** `news_crawler/crawler_service.py`
 **Changes:**
 ```python
 # Add Ollama client initialization
 from ollama_client import OllamaClient
 # Initialize at module level
 ollama_client = OllamaClient(
    base_url=os.getenv('OLLAMA_BASE_URL'),
    model=os.getenv('OLLAMA_MODEL'),
    api_key=os.getenv('OLLAMA_API_KEY'),
    enabled=os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
 )
 # Modify crawl_rss_feed() to include summarization
 def crawl_rss_feed(feed_url, feed_name, max_articles=10):
    # ... existing code ...
    # After extracting content
    article_data = extract_article_content(article_url)
    # NEW: Summarize with Ollama
    summary_result = None
    if ollama_client.enabled and article_data.get('content'):
        print(f"   🤖 Summarizing with AI...")
        summary_result = ollama_client.summarize_article(
            article_data['content'],
            max_words=150
        )
        if summary_result['success']:
            print(f"   ✓ Summary generated ({summary_result['word_count']} words)")
        else:
            print(f"   ⚠ Summarization failed: {summary_result['error']}")
    # Build article document with summary
    article_doc = {
        'title': article_data.get('title'),
        'author': article_data.get('author'),
        'link': article_url,
        'content': article_data.get('content'),
        'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
        'word_count': article_data.get('word_count'),
        'summary_word_count': summary_result['word_count'] if summary_result and summary_result['success'] else None,
        'source': feed_name,
        'published_at': extract_published_date(entry),
        'crawled_at': article_data.get('crawled_at'),
        'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
        'created_at': datetime.utcnow()
    }
 ```
 ### 3. Configuration Module
 **File:** `news_crawler/config.py` (new file)
 **Purpose:** Centralize configuration management
 ```python
 import os
 from dotenv import load_dotenv
 load_dotenv(dotenv_path='../.env')
 class Config:
    # MongoDB
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'
    # Ollama
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
    OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
    OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
    OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))
    # Crawler
    RATE_LIMIT_DELAY = 1  # seconds between requests
    MAX_CONTENT_LENGTH = 50000  # characters
 ```
 ## Data Models
 ### Updated Article Schema
 ```javascript
 {
  _id: ObjectId,
  title: String,
  author: String,
  link: String,                    // Unique index
  content: String,                 // Full article content
  summary: String,                 // AI-generated summary (≤150 words)
  word_count: Number,              // Original content word count
  summary_word_count: Number,      // Summary word count
  source: String,
  published_at: String,
  crawled_at: DateTime,
  summarized_at: DateTime,         // When AI summary was generated
  created_at: DateTime
 }
 ```
 ### Ollama Request Format
 ```json
 {
  "model": "phi3:latest",
  "prompt": "Summarize the following article in 150 words or less. Focus on the key points and main message:\n\n[ARTICLE CONTENT]",
  "stream": false,
  "options": {
    "temperature": 0.7,
    "max_tokens": 200
  }
 }
 ```
 ### Ollama Response Format
 ```json
 {
  "model": "phi3:latest",
  "created_at": "2024-11-10T16:30:00Z",
  "response": "The AI-generated summary text here...",
  "done": true,
  "total_duration": 5000000000
 }
 ```
 ## Error Handling
 ### Error Scenarios and Responses
 | Scenario | Handling | User Impact |
 |----------|----------|-------------|
 | Ollama server down | Log warning, store original content | Article saved without summary |
 | Ollama timeout (>30s) | Cancel request, store original | Article saved without summary |
 | Empty summary returned | Log error, store original | Article saved without summary |
 | Invalid response format | Log error, store original | Article saved without summary |
 | Network error | Retry once, then store original | Article saved without summary |
 | Model not found | Log error, disable Ollama | All articles saved without summaries |
 ### Error Logging Format
 ```python
 {
    'timestamp': datetime.utcnow(),
    'article_url': article_url,
    'error_type': 'timeout|connection|invalid_response|empty_summary',
    'error_message': str(error),
    'ollama_config': {
        'base_url': OLLAMA_BASE_URL,
        'model': OLLAMA_MODEL,
        'enabled': OLLAMA_ENABLED
    }
 }
 ```
 ## Testing Strategy
 ### Unit Tests
 1. **test_ollama_client.py**
   - Test summarization with mock responses
   - Test timeout handling
   - Test error scenarios
   - Test connection checking
 2. **test_crawler_with_ollama.py**
   - Test crawler with Ollama enabled
   - Test crawler with Ollama disabled
   - Test fallback when Ollama fails
   - Test rate limiting
 ### Integration Tests
 1. **test_end_to_end.py**
   - Crawl real RSS feed
   - Summarize with real Ollama
   - Verify database storage
   - Check all fields populated
 ### Manual Testing
 1. Test with Ollama enabled and working
 2. Test with Ollama disabled
 3. Test with Ollama unreachable
 4. Test with slow Ollama responses
 5. Test with various article lengths
 ## Performance Considerations
 ### Timing Estimates
 - Article extraction: 2-5 seconds
 - Ollama summarization: 5-15 seconds (depends on article length and model)
 - Database write: <1 second
 - **Total per article: 8-21 seconds**
 ### Optimization Strategies
 1. **Sequential Processing**
   - Process one article at a time
   - Prevents overwhelming Ollama
   - Easier to debug
 2. **Timeout Management**
   - 30-second timeout per request
   - Prevents hanging on slow responses
 3. **Rate Limiting**
   - 1-second delay between articles
   - Respects server resources
 4. **Future: Batch Processing**
   - Queue articles for summarization
   - Process in batches
   - Use Celery for async processing
 ### Resource Usage
 - **Memory**: ~100MB per crawler instance
 - **Network**: ~1-5KB per article (to Ollama)
 - **Storage**: +150 words per article (~1KB)
 - **CPU**: Minimal (Ollama does the heavy lifting)
 ## Security Considerations
 1. **API Key Storage**
   - Store in environment variables
   - Never commit to git
   - Use secrets management in production
 2. **Content Sanitization**
   - Don't log full article content
   - Sanitize URLs in logs
   - Limit error message detail
 3. **Network Security**
   - Support HTTPS for Ollama
   - Validate SSL certificates
   - Use secure connections
 4. **Rate Limiting**
   - Prevent abuse of Ollama server
   - Implement backoff on errors
   - Monitor usage patterns
 ## Deployment Considerations
 ### Environment Variables
 ```bash
 # Required
 OLLAMA_BASE_URL=http://localhost:11434
 OLLAMA_MODEL=phi3:latest
 OLLAMA_ENABLED=true
 # Optional
 OLLAMA_API_KEY=your-api-key
 OLLAMA_TIMEOUT=30
 ```
 ### Docker Deployment
 ```yaml
 # docker-compose.yml
 services:
  crawler:
    build: ./news_crawler
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - OLLAMA_ENABLED=true
    depends_on:
      - ollama
      - mongodb
  ollama:
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
 ```
 ### Monitoring
 1. **Metrics to Track**
   - Summarization success rate
   - Average summarization time
   - Ollama server uptime
   - Error frequency by type
 2. **Logging**
   - Log all summarization attempts
   - Log errors with context
   - Log performance metrics
 3. **Alerts**
   - Alert if Ollama is down >5 minutes
   - Alert if success rate <80%
   - Alert if average time >20 seconds
 ## Migration Plan
 ### Phase 1: Add Ollama Client (Week 1)
 - Create ollama_client.py
 - Add configuration
 - Write unit tests
 - Test with sample articles
 ### Phase 2: Integrate with Crawler (Week 1)
 - Modify crawler_service.py
 - Add summarization step
 - Update database schema
 - Test end-to-end
 ### Phase 3: Update Backend API (Week 2)
 - Update news routes
 - Add summary fields to responses
 - Update frontend to display summaries
 - Deploy to production
 ### Phase 4: Monitor and Optimize (Ongoing)
 - Monitor performance
 - Tune prompts for better summaries
 - Optimize rate limiting
 - Add batch processing if needed
 ## Rollback Plan
 If issues arise:
 1. **Immediate**: Set `OLLAMA_ENABLED=false`
 2. **Short-term**: Revert crawler code changes
 3. **Long-term**: Remove Ollama integration
 System will continue to work with original content if Ollama is disabled.
 ## Success Metrics
 - ✅ 95%+ of articles successfully summarized
 - ✅ Average summarization time <15 seconds
 - ✅ Zero data loss (all articles stored even if summarization fails)
 - ✅ Ollama uptime >99%
 - ✅ Summary quality: readable and accurate (manual review)
 ## Future Enhancements
 1. **Multi-language Support**
   - Detect article language
   - Use appropriate model
   - Translate summaries
 2. **Custom Summary Lengths**
   - Allow configuration per feed
   - Support different lengths for different use cases
 3. **Sentiment Analysis**
   - Add sentiment score
   - Categorize as positive/negative/neutral
 4. **Keyword Extraction**
   - Extract key topics
   - Enable better search
 5. **Batch Processing**
   - Queue articles
   - Process in parallel
   - Use Celery for async
 6. **Caching**
   - Cache summaries
   - Avoid re-processing
   - Use Redis for cache
--- a/.kiro/specs/ai-article-summarization/requirements.md
+++ b/.kiro/specs/ai-article-summarization/requirements.md
@@ -0,0 +1,164 @@
 # Requirements Document
 ## Introduction
 This feature integrates Ollama AI into the news crawler to automatically summarize articles before storing them in the database. Instead of storing full article content, the system will generate concise 150-word summaries using AI, making the content more digestible for newsletter readers and reducing storage requirements.
 ## Glossary
 - **Crawler Service**: The standalone microservice that fetches and processes article content from RSS feeds
 - **Ollama Server**: The AI inference server that provides text summarization capabilities
 - **Article Content**: The full text extracted from a news article webpage
 - **Summary**: A concise AI-generated version of the article content (max 150 words)
 - **MongoDB**: The database where articles and summaries are stored
 ## Requirements
 ### Requirement 1: Ollama Integration in Crawler
 **User Story:** As a system administrator, I want the crawler to use Ollama for summarization, so that articles are automatically condensed before storage.
 #### Acceptance Criteria
 1. WHEN the crawler extracts article content, THE Crawler Service SHALL send the content to the Ollama Server for summarization
 2. WHEN sending content to Ollama, THE Crawler Service SHALL include a prompt requesting a summary of 150 words or less
 3. WHEN Ollama returns a summary, THE Crawler Service SHALL validate that the summary is not empty
 4. IF the Ollama Server is unavailable, THEN THE Crawler Service SHALL store the original content without summarization and log a warning
 5. WHEN summarization fails, THE Crawler Service SHALL continue processing other articles without stopping
 ### Requirement 2: Configuration Management
 **User Story:** As a system administrator, I want to configure Ollama settings, so that I can control the summarization behavior.
 #### Acceptance Criteria
 1. THE Crawler Service SHALL read Ollama configuration from environment variables
 2. THE Crawler Service SHALL support the following configuration options:
   - OLLAMA_BASE_URL (server URL)
   - OLLAMA_MODEL (model name)
   - OLLAMA_ENABLED (enable/disable flag)
   - OLLAMA_API_KEY (optional authentication)
 3. WHERE OLLAMA_ENABLED is false, THE Crawler Service SHALL store original content without summarization
 4. WHERE OLLAMA_ENABLED is true AND Ollama is unreachable, THE Crawler Service SHALL log an error and store original content
 ### Requirement 3: Summary Storage
 **User Story:** As a developer, I want summaries stored in the database, so that the frontend can display concise article previews.
 #### Acceptance Criteria
 1. WHEN a summary is generated, THE Crawler Service SHALL store it in the `summary` field in MongoDB
 2. WHEN storing an article, THE Crawler Service SHALL include both the original content and the AI summary
 3. THE Crawler Service SHALL store the following fields:
   - `content` (original full text)
   - `summary` (AI-generated, max 150 words)
   - `word_count` (original content word count)
   - `summary_word_count` (summary word count)
   - `summarized_at` (timestamp when summarized)
 4. WHEN an article already has a summary, THE Crawler Service SHALL not re-summarize it
 ### Requirement 4: Error Handling and Resilience
 **User Story:** As a system administrator, I want the crawler to handle AI failures gracefully, so that the system remains reliable.
 #### Acceptance Criteria
 1. IF Ollama returns an error, THEN THE Crawler Service SHALL log the error and store the original content
 2. IF Ollama times out (>30 seconds), THEN THE Crawler Service SHALL cancel the request and store the original content
 3. IF the summary is empty or invalid, THEN THE Crawler Service SHALL store the original content
 4. WHEN an error occurs, THE Crawler Service SHALL include an error indicator in the database record
 5. THE Crawler Service SHALL continue processing remaining articles after any summarization failure
 ### Requirement 5: Performance and Rate Limiting
 **User Story:** As a system administrator, I want the crawler to respect rate limits, so that it doesn't overwhelm the Ollama server.
 #### Acceptance Criteria
 1. THE Crawler Service SHALL wait at least 1 second between Ollama API calls
 2. THE Crawler Service SHALL set a timeout of 30 seconds for each Ollama request
 3. WHEN processing multiple articles, THE Crawler Service SHALL process them sequentially to avoid overloading Ollama
 4. THE Crawler Service SHALL log the time taken for each summarization
 5. THE Crawler Service SHALL display progress indicators showing summarization status
 ### Requirement 6: Monitoring and Logging
 **User Story:** As a system administrator, I want detailed logs of summarization activity, so that I can monitor and troubleshoot the system.
 #### Acceptance Criteria
 1. THE Crawler Service SHALL log when summarization starts for each article
 2. THE Crawler Service SHALL log the original word count and summary word count
 3. THE Crawler Service SHALL log any errors or warnings from Ollama
 4. THE Crawler Service SHALL display a summary of total articles summarized at the end
 5. THE Crawler Service SHALL include summarization statistics in the final report
 ### Requirement 7: API Endpoint Updates
 **User Story:** As a frontend developer, I want API endpoints to return summaries, so that I can display them to users.
 #### Acceptance Criteria
 1. WHEN fetching articles via GET /api/news, THE Backend API SHALL include the `summary` field if available
 2. WHEN fetching a single article via GET /api/news/<url>, THE Backend API SHALL include both `content` and `summary`
 3. THE Backend API SHALL include a `has_summary` boolean field indicating if AI summarization was performed
 4. THE Backend API SHALL include `summarized_at` timestamp if available
 5. WHERE no summary exists, THE Backend API SHALL return a preview of the original content (first 200 chars)
 ### Requirement 8: Backward Compatibility
 **User Story:** As a developer, I want the system to work with existing articles, so that no data migration is required.
 #### Acceptance Criteria
 1. THE Crawler Service SHALL work with articles that don't have summaries
 2. THE Backend API SHALL handle articles with or without summaries gracefully
 3. WHERE an article has no summary, THE Backend API SHALL generate a preview from the content field
 4. THE Crawler Service SHALL not re-process articles that already have summaries
 5. THE system SHALL continue to function if Ollama is disabled or unavailable
 ## Non-Functional Requirements
 ### Performance
 - Summarization SHALL complete within 30 seconds per article
 - The crawler SHALL process at least 10 articles per minute (including summarization)
 - Database operations SHALL not be significantly slower with summary storage
 ### Reliability
 - The system SHALL maintain 99% uptime even if Ollama is unavailable
 - Failed summarizations SHALL not prevent article storage
 - The crawler SHALL recover from Ollama errors without manual intervention
 ### Security
 - Ollama API keys SHALL be stored in environment variables, not in code
 - Article content SHALL not be logged to prevent sensitive data exposure
 - API communication with Ollama SHALL support HTTPS
 ### Scalability
 - The system SHALL support multiple Ollama servers for load balancing (future)
 - The crawler SHALL handle articles of any length (up to 50,000 words)
 - The database schema SHALL support future enhancements (tags, categories, etc.)
 ## Dependencies
 - Ollama server must be running and accessible
 - `requests` Python library for HTTP communication
 - Environment variables properly configured
 - MongoDB with sufficient storage for both content and summaries
 ## Assumptions
 - Ollama server is already set up and configured
 - The phi3:latest model (or configured model) supports summarization tasks
 - Network connectivity between crawler and Ollama server is reliable
 - Articles are in English or the configured Ollama model supports the article language
 ## Future Enhancements
 - Support for multiple languages
 - Customizable summary length
 - Sentiment analysis integration
 - Keyword extraction
 - Category classification
 - Batch summarization for improved performance
 - Caching of summaries to avoid re-processing
--- a/.kiro/specs/ai-article-summarization/tasks.md
+++ b/.kiro/specs/ai-article-summarization/tasks.md
@@ -0,0 +1,92 @@
 # Implementation Plan
 - [x] 1. Create Ollama client module
  - Create `news_crawler/ollama_client.py` with OllamaClient class
  - Implement `summarize_article()` method with prompt construction and API call
  - Implement `is_available()` method for health checks
  - Implement `test_connection()` method for diagnostics
  - Add timeout handling (30 seconds)
  - Add error handling for connection, timeout, and invalid responses
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 4.1, 4.2, 4.3, 5.2_
 - [x] 2. Create configuration module for crawler
  - Create `news_crawler/config.py` with Config class
  - Load environment variables (OLLAMA_BASE_URL, OLLAMA_MODEL, OLLAMA_ENABLED, OLLAMA_API_KEY, OLLAMA_TIMEOUT)
  - Add validation for required configuration
  - Add default values for optional configuration
  - _Requirements: 2.1, 2.2, 2.3, 2.4_
 - [x] 3. Integrate Ollama client into crawler service
  - Import OllamaClient in `news_crawler/crawler_service.py`
  - Initialize Ollama client at module level using Config
  - Modify `crawl_rss_feed()` to call summarization after content extraction
  - Add conditional logic to skip summarization if OLLAMA_ENABLED is false
  - Add error handling to continue processing if summarization fails
  - Add logging for summarization start, success, and failure
  - Add rate limiting delay after summarization
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 2.3, 2.4, 4.1, 4.5, 5.1, 5.3, 6.1, 6.2, 6.3_
 - [x] 4. Update database schema and storage
  - Modify article document structure in `crawl_rss_feed()` to include:
    - `summary` field (AI-generated summary)
    - `summary_word_count` field
    - `summarized_at` field (timestamp)
  - Update MongoDB upsert logic to handle new fields
  - Add check to skip re-summarization if article already has summary
  - _Requirements: 3.1, 3.2, 3.3, 3.4, 8.4_
 - [x] 5. Update backend API to return summaries
  - Modify `backend/routes/news_routes.py` GET /api/news endpoint
  - Add `summary`, `summary_word_count`, `summarized_at` fields to response
  - Add `has_summary` boolean field to indicate if AI summarization was performed
  - Modify GET /api/news/<url> endpoint to include summary fields
  - Add fallback to content preview if no summary exists
  - _Requirements: 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3_
 - [x] 6. Update database schema documentation
  - Update `backend/DATABASE_SCHEMA.md` with new summary fields
  - Add example document showing summary fields
  - Document the summarization workflow
  - _Requirements: 3.1, 3.2, 3.3_
 - [x] 7. Add environment variable configuration
  - Update `backend/env.template` with Ollama configuration
  - Add comments explaining each Ollama setting
  - Document default values
  - _Requirements: 2.1, 2.2_
 - [x] 8. Create test script for Ollama integration
  - Create `news_crawler/test_ollama.py` to test Ollama connection
  - Test summarization with sample article
  - Test error handling (timeout, connection failure)
  - Display configuration and connection status
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 4.1, 4.2_
 - [x] 9. Update crawler statistics and logging
  - Add summarization statistics to final report in `crawl_all_feeds()`
  - Track total articles summarized vs failed
  - Log average summarization time
  - Display progress indicators during summarization
  - _Requirements: 5.4, 6.1, 6.2, 6.3, 6.4, 6.5_
 - [x] 10. Create documentation for AI summarization
  - Create `news_crawler/AI_SUMMARIZATION.md` explaining the feature
  - Document configuration options
  - Provide troubleshooting guide
  - Add examples of usage
  - _Requirements: 2.1, 2.2, 2.3, 2.4, 6.1, 6.2, 6.3_
 - [x] 11. Update main README with AI summarization info
  - Add section about AI summarization feature
  - Document Ollama setup requirements
  - Add configuration examples
  - Update API endpoint documentation
  - _Requirements: 2.1, 2.2, 7.1, 7.2_
 - [x] 12. Test end-to-end workflow
  - Run crawler with Ollama enabled
  - Verify articles are summarized correctly
  - Check database contains all expected fields
  - Test API endpoints return summaries
  - Verify error handling when Ollama is disabled/unavailable
  - _Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5, 7.1, 7.2, 7.3, 7.4, 7.5, 8.1, 8.2, 8.3, 8.4, 8.5_
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -0,0 +1,209 @@
 # Munich News Daily - Architecture
 ## System Overview
 ```
 ┌─────────────────────────────────────────────────────────────┐
 │                        Users / Browsers                      │
 └────────────────────────┬────────────────────────────────────┘
                         │
                         ▼
 ┌─────────────────────────────────────────────────────────────┐
 │                    Frontend (Port 3000)                      │
 │                  Node.js + Express + Vanilla JS              │
 │  - Subscription form                                         │
 │  - News display                                              │
 │  - RSS feed management UI (future)                           │
 └────────────────────────┬────────────────────────────────────┘
                         │ HTTP/REST
                         ▼
 ┌─────────────────────────────────────────────────────────────┐
 │                    Backend API (Port 5001)                   │
 │                      Flask + Python                          │
 │                                                              │
 │  ┌──────────────────────────────────────────────────────┐  │
 │  │ Routes (Blueprints)                                  │  │
 │  │  - subscription_routes.py  (subscribe/unsubscribe)   │  │
 │  │  - news_routes.py          (get news, stats)         │  │
 │  │  - rss_routes.py           (manage RSS feeds)        │  │
 │  │  - ollama_routes.py        (AI features)             │  │
 │  └──────────────────────────────────────────────────────┘  │
 │                                                              │
 │  ┌──────────────────────────────────────────────────────┐  │
 │  │ Services (Business Logic)                            │  │
 │  │  - news_service.py         (fetch & save articles)   │  │
 │  │  - email_service.py        (send newsletters)        │  │
 │  │  - ollama_service.py       (AI integration)          │  │
 │  └──────────────────────────────────────────────────────┘  │
 │                                                              │
 │  ┌──────────────────────────────────────────────────────┐  │
 │  │ Core                                                 │  │
 │  │  - config.py               (configuration)           │  │
 │  │  - database.py             (DB connection)           │  │
 │  └──────────────────────────────────────────────────────┘  │
 └────────────────────────┬────────────────────────────────────┘
                         │
                         ▼
 ┌─────────────────────────────────────────────────────────────┐
 │                    MongoDB (Port 27017)                      │
 │                                                              │
 │  Collections:                                                │
 │  - articles         (news articles with full content)        │
 │  - subscribers      (email subscribers)                      │
 │  - rss_feeds        (RSS feed sources)                       │
 └─────────────────────────┬───────────────────────────────────┘
                          │
                          │ Read/Write
                          │
 ┌─────────────────────────┴───────────────────────────────────┐
 │              News Crawler Microservice                       │
 │                    (Standalone)                              │
 │                                                              │
 │  - Fetches RSS feeds from MongoDB                            │
 │  - Crawls full article content                               │
 │  - Extracts text, metadata, word count                       │
 │  - Stores back to MongoDB                                    │
 │  - Can run independently or scheduled                        │
 └──────────────────────────────────────────────────────────────┘
                          │
                          │ (Optional)
                          ▼
 ┌─────────────────────────────────────────────────────────────┐
 │                  Ollama AI Server (Port 11434)               │
 │                    (Optional, External)                      │
 │                                                              │
 │  - Article summarization                                     │
 │  - Content analysis                                          │
 │  - AI-powered features                                       │
 └──────────────────────────────────────────────────────────────┘
 ```
 ## Component Details
 ### Frontend (Port 3000)
 - **Technology**: Node.js, Express, Vanilla JavaScript
 - **Responsibilities**:
  - User interface
  - Subscription management
  - News display
  - API proxy to backend
 - **Communication**: HTTP REST to Backend
 ### Backend API (Port 5001)
 - **Technology**: Python, Flask
 - **Architecture**: Modular with Blueprints
 - **Responsibilities**:
  - REST API endpoints
  - Business logic
  - Database operations
  - Email sending
  - AI integration
 - **Communication**: 
  - HTTP REST from Frontend
  - MongoDB driver to Database
  - HTTP to Ollama (optional)
 ### MongoDB (Port 27017)
 - **Technology**: MongoDB 7.0
 - **Responsibilities**:
  - Persistent data storage
  - Articles, subscribers, RSS feeds
 - **Communication**: MongoDB protocol
 ### News Crawler (Standalone)
 - **Technology**: Python, BeautifulSoup
 - **Architecture**: Microservice (can run independently)
 - **Responsibilities**:
  - Fetch RSS feeds
  - Crawl article content
  - Extract and clean text
  - Store in database
 - **Communication**: MongoDB driver to Database
 - **Execution**: 
  - Manual: `python crawler_service.py`
  - Scheduled: Cron, systemd, Docker
  - On-demand: Via backend API (future)
 ### Ollama AI Server (Optional, External)
 - **Technology**: Ollama
 - **Responsibilities**:
  - AI model inference
  - Text summarization
  - Content analysis
 - **Communication**: HTTP REST API
 ## Data Flow
 ### 1. News Aggregation Flow
 ```
 RSS Feeds → Backend (news_service) → MongoDB (articles)
 ```
 ### 2. Content Crawling Flow
 ```
 MongoDB (rss_feeds) → Crawler → Article URLs → 
 Web Scraping → MongoDB (articles with full_content)
 ```
 ### 3. Subscription Flow
 ```
 User → Frontend → Backend (subscription_routes) → 
 MongoDB (subscribers)
 ```
 ### 4. Newsletter Flow (Future)
 ```
 Scheduler → Backend (email_service) → 
 MongoDB (articles + subscribers) → SMTP → Users
 ```
 ### 5. AI Processing Flow (Optional)
 ```
 MongoDB (articles) → Backend (ollama_service) → 
 Ollama Server → AI Summary → MongoDB (articles)
 ```
 ## Deployment Options
 ### Development
 - All services run locally
 - MongoDB via Docker Compose
 - Manual crawler execution
 ### Production
 - Backend: Cloud VM, Container, or PaaS
 - Frontend: Static hosting or same server
 - MongoDB: MongoDB Atlas or self-hosted
 - Crawler: Scheduled job (cron, systemd timer)
 - Ollama: Separate GPU server (optional)
 ## Scalability Considerations
 ### Current Architecture
 - Monolithic backend (single Flask instance)
 - Standalone crawler (can run multiple instances)
 - Shared MongoDB
 ### Future Improvements
 - Load balancer for backend
 - Message queue for crawler jobs (Celery + Redis)
 - Caching layer (Redis)
 - CDN for frontend
 - Read replicas for MongoDB
 ## Security
 - CORS enabled for frontend-backend communication
 - MongoDB authentication (production)
 - Environment variables for secrets
 - Input validation on all endpoints
 - Rate limiting (future)
 ## Monitoring (Future)
 - Application logs
 - MongoDB metrics
 - Crawler success/failure tracking
 - API response times
 - Error tracking (Sentry)
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,136 @@
 # Changelog
 ## [Unreleased] - 2024-11-10
 ### Added - Major Refactoring
 #### Backend Modularization
 - ✅ Restructured backend into modular architecture
 - ✅ Created separate route blueprints:
  - `subscription_routes.py` - User subscriptions
  - `news_routes.py` - News fetching and stats
  - `rss_routes.py` - RSS feed management (CRUD)
  - `ollama_routes.py` - AI integration
 - ✅ Created service layer:
  - `news_service.py` - News fetching logic
  - `email_service.py` - Newsletter sending
  - `ollama_service.py` - AI communication
 - ✅ Centralized configuration in `config.py`
 - ✅ Separated database logic in `database.py`
 - ✅ Reduced main `app.py` from 700+ lines to 27 lines
 #### RSS Feed Management
 - ✅ Dynamic RSS feed management via API
 - ✅ Add/remove/list/toggle RSS feeds without code changes
 - ✅ Unique index on RSS feed URLs (prevents duplicates)
 - ✅ Default feeds auto-initialized on first run
 - ✅ Created `fix_duplicates.py` utility script
 #### News Crawler Microservice
 - ✅ Created standalone `news_crawler/` microservice
 - ✅ Web scraping with BeautifulSoup
 - ✅ Smart content extraction using multiple selectors
 - ✅ Full article content storage in MongoDB
 - ✅ Word count calculation
 - ✅ Duplicate prevention (skips already-crawled articles)
 - ✅ Rate limiting (1 second between requests)
 - ✅ Can run independently or scheduled
 - ✅ Docker support for crawler
 - ✅ Comprehensive documentation
 #### API Endpoints
 New endpoints added:
 - `GET /api/rss-feeds` - List all RSS feeds
 - `POST /api/rss-feeds` - Add new RSS feed
 - `DELETE /api/rss-feeds/<id>` - Remove RSS feed
 - `PATCH /api/rss-feeds/<id>/toggle` - Toggle feed active status
 #### Documentation
 - ✅ Created `ARCHITECTURE.md` - System architecture overview
 - ✅ Created `backend/STRUCTURE.md` - Backend structure guide
 - ✅ Created `news_crawler/README.md` - Crawler documentation
 - ✅ Created `news_crawler/QUICKSTART.md` - Quick start guide
 - ✅ Created `news_crawler/test_crawler.py` - Test suite
 - ✅ Updated main `README.md` with new features
 - ✅ Updated `DATABASE_SCHEMA.md` with new fields
 #### Configuration
 - ✅ Added `FLASK_PORT` environment variable
 - ✅ Fixed `OLLAMA_MODEL` typo in `.env`
 - ✅ Port 5001 default to avoid macOS AirPlay conflict
 ### Changed
 - Backend structure: Monolithic → Modular
 - RSS feeds: Hardcoded → Database-driven
 - Article storage: Summary only → Full content support
 - Configuration: Scattered → Centralized
 ### Technical Improvements
 - Separation of concerns (routes vs services)
 - Better testability
 - Easier maintenance
 - Scalable architecture
 - Independent microservices
 - Proper error handling
 - Comprehensive logging
 ### Database Schema Updates
 Articles collection now includes:
 - `full_content` - Full article text
 - `word_count` - Number of words
 - `crawled_at` - When content was crawled
 RSS Feeds collection added:
 - `name` - Feed name
 - `url` - Feed URL (unique)
 - `active` - Active status
 - `created_at` - Creation timestamp
 ### Files Added
 ```
 backend/
 ├── config.py
 ├── database.py
 ├── fix_duplicates.py
 ├── STRUCTURE.md
 ├── routes/
 │   ├── __init__.py
 │   ├── subscription_routes.py
 │   ├── news_routes.py
 │   ├── rss_routes.py
 │   └── ollama_routes.py
 └── services/
    ├── __init__.py
    ├── news_service.py
    ├── email_service.py
    └── ollama_service.py
 news_crawler/
 ├── crawler_service.py
 ├── test_crawler.py
 ├── requirements.txt
 ├── .gitignore
 ├── Dockerfile
 ├── docker-compose.yml
 ├── README.md
 └── QUICKSTART.md
 Root:
 ├── ARCHITECTURE.md
 └── CHANGELOG.md
 ```
 ### Files Removed
 - Old monolithic `backend/app.py` (replaced with modular version)
 ### Next Steps (Future Enhancements)
 - [ ] Frontend UI for RSS feed management
 - [ ] Automatic article summarization with Ollama
 - [ ] Scheduled newsletter sending
 - [ ] Article categorization and tagging
 - [ ] Search functionality
 - [ ] User preferences (categories, frequency)
 - [ ] Analytics dashboard
 - [ ] API rate limiting
 - [ ] Caching layer (Redis)
 - [ ] Message queue for crawler (Celery)
--- a/QUICK_REFERENCE.md
+++ b/QUICK_REFERENCE.md
@@ -0,0 +1,206 @@
 # Quick Reference Guide
 ## Starting the Application
 ### 1. Start MongoDB
 ```bash
 docker-compose up -d
 ```
 ### 2. Start Backend (Port 5001)
 ```bash
 cd backend
 source venv/bin/activate  # or: venv\Scripts\activate on Windows
 python app.py
 ```
 ### 3. Start Frontend (Port 3000)
 ```bash
 cd frontend
 npm start
 ```
 ### 4. Run Crawler (Optional)
 ```bash
 cd news_crawler
 pip install -r requirements.txt
 python crawler_service.py 10
 ```
 ## Common Commands
 ### RSS Feed Management
 **List all feeds:**
 ```bash
 curl http://localhost:5001/api/rss-feeds
 ```
 **Add a feed:**
 ```bash
 curl -X POST http://localhost:5001/api/rss-feeds \
  -H "Content-Type: application/json" \
  -d '{"name": "Feed Name", "url": "https://example.com/rss"}'
 ```
 **Remove a feed:**
 ```bash
 curl -X DELETE http://localhost:5001/api/rss-feeds/<feed_id>
 ```
 **Toggle feed status:**
 ```bash
 curl -X PATCH http://localhost:5001/api/rss-feeds/<feed_id>/toggle
 ```
 ### News & Subscriptions
 **Get latest news:**
 ```bash
 curl http://localhost:5001/api/news
 ```
 **Subscribe:**
 ```bash
 curl -X POST http://localhost:5001/api/subscribe \
  -H "Content-Type: application/json" \
  -d '{"email": "user@example.com"}'
 ```
 **Get stats:**
 ```bash
 curl http://localhost:5001/api/stats
 ```
 ### Ollama (AI)
 **Test connection:**
 ```bash
 curl http://localhost:5001/api/ollama/ping
 ```
 **List models:**
 ```bash
 curl http://localhost:5001/api/ollama/models
 ```
 ### Database
 **Connect to MongoDB:**
 ```bash
 mongosh
 use munich_news
 ```
 **Check articles:**
 ```javascript
 db.articles.find().limit(5)
 db.articles.countDocuments()
 db.articles.countDocuments({full_content: {$exists: true}})
 ```
 **Check subscribers:**
 ```javascript
 db.subscribers.find()
 db.subscribers.countDocuments({status: "active"})
 ```
 **Check RSS feeds:**
 ```javascript
 db.rss_feeds.find()
 ```
 ## File Locations
 ### Configuration
 - Backend: `backend/.env`
 - Frontend: `frontend/package.json`
 - Crawler: Uses backend's `.env` or own `.env`
 ### Logs
 - Backend: Terminal output
 - Frontend: Terminal output
 - Crawler: Terminal output
 ### Database
 - MongoDB data: Docker volume `mongodb_data`
 - Database name: `munich_news`
 ## Ports
 | Service | Port | URL |
 |---------|------|-----|
 | Frontend | 3000 | http://localhost:3000 |
 | Backend | 5001 | http://localhost:5001 |
 | MongoDB | 27017 | mongodb://localhost:27017 |
 | Ollama | 11434 | http://localhost:11434 |
 ## Troubleshooting
 ### Backend won't start
 - Check if port 5001 is available
 - Verify MongoDB is running
 - Check `.env` file exists
 ### Frontend can't connect
 - Verify backend is running on port 5001
 - Check CORS settings
 - Check API_URL in frontend
 ### Crawler fails
 - Install dependencies: `pip install -r requirements.txt`
 - Check MongoDB connection
 - Verify RSS feeds exist in database
 ### MongoDB connection error
 - Start MongoDB: `docker-compose up -d`
 - Check connection string in `.env`
 - Verify port 27017 is not blocked
 ### Port 5000 conflict (macOS)
 - AirPlay uses port 5000
 - Use port 5001 instead (set in `.env`)
 - Or disable AirPlay Receiver in System Preferences
 ## Project Structure
 ```
 munich-news/
 ├── backend/          # Main API (Flask)
 ├── frontend/         # Web UI (Express + JS)
 ├── news_crawler/     # Crawler microservice
 ├── .env             # Environment variables
 └── docker-compose.yml  # MongoDB setup
 ```
 ## Environment Variables
 ### Backend (.env)
 ```env
 MONGODB_URI=mongodb://localhost:27017/
 FLASK_PORT=5001
 SMTP_SERVER=smtp.gmail.com
 SMTP_PORT=587
 EMAIL_USER=your-email@gmail.com
 EMAIL_PASSWORD=your-app-password
 OLLAMA_BASE_URL=http://127.0.0.1:11434
 OLLAMA_MODEL=phi3:latest
 OLLAMA_ENABLED=true
 ```
 ## Development Workflow
 1. **Add RSS Feed** → Backend API
 2. **Run Crawler** → Fetches full content
 3. **View News** → Frontend displays articles
 4. **Users Subscribe** → Via frontend form
 5. **Send Newsletter** → Manual or scheduled
 ## Useful Links
 - Frontend: http://localhost:3000
 - Backend API: http://localhost:5001
 - MongoDB: mongodb://localhost:27017
 - Architecture: See `ARCHITECTURE.md`
 - Backend Structure: See `backend/STRUCTURE.md`
 - Crawler Guide: See `news_crawler/README.md`
--- a/README.md
+++ b/README.md
@@ -0,0 +1,327 @@
 # Munich News Daily 📰
 A TLDR/Morning Brew-style news email platform specifically for Munich. Get the latest Munich news delivered to your inbox every morning.
 ## Features
 - 📧 Email newsletter subscription system
 - 📰 Aggregated news from multiple Munich news sources
 - 🎨 Beautiful, modern web interface
 - 📊 Subscription statistics
 - 🔄 Real-time news updates
 ## Tech Stack
 - **Backend**: Python (Flask) - Modular architecture with blueprints
 - **Frontend**: Node.js (Express + Vanilla JavaScript)
 - **Database**: MongoDB
 - **News Crawler**: Standalone Python microservice
 - **News Sources**: RSS feeds from major Munich news outlets
 ## Setup Instructions
 ### Prerequisites
 - Python 3.8+
 - Node.js 14+
 - npm or yarn
 - Docker and Docker Compose (recommended for MongoDB) OR MongoDB (local installation or MongoDB Atlas account)
 ### Backend Setup
 1. Navigate to the backend directory:
 ```bash
 cd backend
 ```
 2. Create a virtual environment (recommended):
 ```bash
 python3 -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate
 ```
 3. Install dependencies:
 ```bash
 pip install -r requirements.txt
 ```
 4. Set up MongoDB using Docker Compose (recommended):
   ```bash
   # From the project root directory
   docker-compose up -d
   ```
   This will start MongoDB in a Docker container. The database will be available at `mongodb://localhost:27017/`
   **Useful Docker commands:**
   ```bash
   # Start MongoDB
   docker-compose up -d
   # Stop MongoDB
   docker-compose down
   # View MongoDB logs
   docker-compose logs -f mongodb
   # Restart MongoDB
   docker-compose restart mongodb
   # Remove MongoDB and all data (WARNING: deletes all data)
   docker-compose down -v
   ```
   **Alternative options:**
   - **Local MongoDB**: Install MongoDB locally and make sure it's running
   - **MongoDB Atlas** (Cloud): Create a free account at [mongodb.com/cloud/atlas](https://www.mongodb.com/cloud/atlas) and get your connection string
 5. Create a `.env` file in the backend directory:
   ```bash
   # Copy the template file
   cp env.template .env
   ```
   Then edit `.env` with your configuration:
   ```env
   # MongoDB connection (default: mongodb://localhost:27017/)
   # For Docker Compose (no authentication):
   MONGODB_URI=mongodb://localhost:27017/
   # For Docker Compose with authentication (if you modify docker-compose.yml):
   # MONGODB_URI=mongodb://admin:password@localhost:27017/
   # Or for MongoDB Atlas:
   # MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/
   # Email configuration (optional for testing)
   SMTP_SERVER=smtp.gmail.com
   SMTP_PORT=587
   EMAIL_USER=your-email@gmail.com
   EMAIL_PASSWORD=your-app-password
   # Ollama Configuration (for AI-powered features)
   # Remote Ollama server URL
   OLLAMA_BASE_URL=http://your-remote-server-ip:11434
   # Optional: API key if your Ollama server requires authentication
   # OLLAMA_API_KEY=your-api-key-here
   # Model name to use (e.g., llama2, mistral, codellama, llama3)
   OLLAMA_MODEL=llama2
   # Enable/disable Ollama features (true/false)
   OLLAMA_ENABLED=false
   ```
 **Notes:**
 - For Gmail, you'll need to use an [App Password](https://support.google.com/accounts/answer/185833) instead of your regular password.
 - For Ollama, replace `your-remote-server-ip` with your actual server IP or domain. Set `OLLAMA_ENABLED=true` to enable AI features.
 6. Run the backend server:
 ```bash
 python app.py
 ```
 The backend will run on `http://localhost:5001` (port 5001 to avoid conflict with AirPlay on macOS)
 ### Frontend Setup
 1. Navigate to the frontend directory:
 ```bash
 cd frontend
 ```
 2. Install dependencies:
 ```bash
 npm install
 ```
 3. Run the frontend server:
 ```bash
 npm start
 ```
 The frontend will run on `http://localhost:3000`
 ## Usage
 1. Open your browser and go to `http://localhost:3000`
 2. Enter your email address to subscribe to the newsletter
 3. View the latest Munich news on the homepage
 4. The backend will aggregate news from multiple Munich news sources
 ## Sending Newsletters
 To send newsletters to all subscribers, you can add a scheduled task or manually trigger the `send_newsletter()` function in `app.py`. For production, consider using:
 - **Cron jobs** (Linux/Mac)
 - **Task Scheduler** (Windows)
 - **Celery** with Redis/RabbitMQ for more advanced scheduling
 - **Cloud functions** (AWS Lambda, Google Cloud Functions)
 Example cron job to send daily at 8 AM:
 ```
 0 8 * * * cd /path/to/munich-news/backend && python -c "from app import send_newsletter; send_newsletter()"
 ```
 ## Project Structure
 ```
 munich-news/
 ├── backend/                # Main API server
 │   ├── app.py              # Flask application entry point
 │   ├── config.py           # Configuration management
 │   ├── database.py         # Database connection
 │   ├── routes/             # API endpoints (blueprints)
 │   ├── services/           # Business logic
 │   ├── templates/          # Email templates
 │   └── requirements.txt    # Python dependencies
 ├── news_crawler/           # Crawler microservice
 │   ├── crawler_service.py  # Standalone crawler
 │   ├── ollama_client.py    # AI summarization client
 │   ├── requirements.txt    # Crawler dependencies
 │   └── README.md           # Crawler documentation
 ├── news_sender/            # Newsletter sender microservice
 │   ├── sender_service.py   # Standalone email sender
 │   ├── newsletter_template.html  # Email template
 │   ├── requirements.txt    # Sender dependencies
 │   └── README.md           # Sender documentation
 ├── frontend/               # Web interface
 │   ├── server.js           # Express server
 │   ├── package.json        # Node.js dependencies
 │   └── public/
 │       ├── index.html      # Main page
 │       ├── styles.css      # Styling
 │       └── app.js          # Frontend JavaScript
 ├── docker-compose.yml      # Docker Compose for MongoDB (development)
 ├── docker-compose.prod.yml # Docker Compose with authentication (production)
 └── README.md
 ```
 ## API Endpoints
 ### `POST /api/subscribe`
 Subscribe to the newsletter
 - Body: `{ "email": "user@example.com" }`
 ### `POST /api/unsubscribe`
 Unsubscribe from the newsletter
 - Body: `{ "email": "user@example.com" }`
 ### `GET /api/news`
 Get latest Munich news articles
 ### `GET /api/stats`
 Get subscription statistics
 - Returns: `{ "subscribers": number, "articles": number, "crawled_articles": number }`
 ### `GET /api/news/<article_url>`
 Get full article content by URL
 - Returns: Full article with content, author, word count, etc.
 ### `GET /api/ollama/ping`
 Test connection to Ollama server
 - Returns: Connection status and Ollama configuration
 - Response examples:
  - Success: `{ "status": "success", "message": "...", "response": "...", "ollama_config": {...} }`
  - Disabled: `{ "status": "disabled", "message": "...", "ollama_config": {...} }`
  - Error: `{ "status": "error", "message": "...", "error_details": "...", "troubleshooting": {...}, "ollama_config": {...} }`
 ### `GET /api/ollama/models`
 List available models on Ollama server
 - Returns: List of available models and current configuration
 - Response: `{ "status": "success", "models": [...], "current_model": "...", "ollama_config": {...} }`
 ### `GET /api/rss-feeds`
 Get all RSS feeds
 - Returns: `{ "feeds": [...] }`
 ### `POST /api/rss-feeds`
 Add a new RSS feed
 - Body: `{ "name": "Feed Name", "url": "https://example.com/rss" }`
 - Returns: `{ "message": "...", "id": "..." }`
 ### `DELETE /api/rss-feeds/<feed_id>`
 Remove an RSS feed
 - Returns: `{ "message": "..." }`
 ### `PATCH /api/rss-feeds/<feed_id>/toggle`
 Toggle RSS feed active status
 - Returns: `{ "message": "...", "active": boolean }`
 ## Database Schema
 ### Articles Collection
 ```javascript
 {
  _id: ObjectId,
  title: String,
  link: String (unique),
  summary: String,
  source: String,
  published_at: String,
  created_at: DateTime
 }
 ```
 ### Subscribers Collection
 ```javascript
 {
  _id: ObjectId,
  email: String (unique, lowercase),
  subscribed_at: DateTime,
  status: String ('active' | 'inactive')
 }
 ```
 **Indexes:**
 - `articles.link` - Unique index to prevent duplicate articles
 - `articles.created_at` - For efficient sorting
 - `subscribers.email` - Unique index for email lookups
 - `subscribers.subscribed_at` - For analytics
 ## News Crawler Microservice
 The project includes a standalone crawler microservice that fetches full article content from RSS feeds.
 ### Running the Crawler
 ```bash
 cd news_crawler
 # Install dependencies
 pip install -r requirements.txt
 # Run crawler
 python crawler_service.py 10
 ```
 See `news_crawler/README.md` for detailed documentation.
 ### What It Does
 - Crawls full article content from RSS feed links
 - Extracts text, word count, and metadata
 - Stores in MongoDB for AI processing
 - Skips already-crawled articles
 - Rate-limited (1 second between requests)
 ## Customization
 ### Adding News Sources
 Use the API to add RSS feeds dynamically:
 ```bash
 curl -X POST http://localhost:5001/api/rss-feeds \
  -H "Content-Type: application/json" \
  -d '{"name": "Your Source Name", "url": "https://example.com/rss"}'
 ```
 ### Styling
 Modify `frontend/public/styles.css` to customize the appearance.
 ## License
 MIT
 ## Contributing
 Feel free to submit issues and enhancement requests!
--- a/TEST_INSTRUCTIONS.md
+++ b/TEST_INSTRUCTIONS.md
@@ -0,0 +1,132 @@
 # Testing RSS Feed URL Extraction
 ## Quick Test (Recommended)
 Run this from the project root with backend virtual environment activated:
 ```bash
 # 1. Activate backend virtual environment
 cd backend
 source venv/bin/activate  # On Windows: venv\Scripts\activate
 # 2. Go back to project root
 cd ..
 # 3. Run the test
 python test_feeds_quick.py
 ```
 This will:
 - ✓ Check what RSS feeds are in your database
 - ✓ Fetch each feed
 - ✓ Test URL extraction on first 3 articles
 - ✓ Show what fields are available
 - ✓ Verify summary and date extraction
 ## Expected Output
 ```
 ================================================================================
 RSS Feed Test - Checking Database Feeds
 ================================================================================
 ✓ Found 3 feed(s) in database
 ================================================================================
 Feed: Süddeutsche Zeitung München
 URL: https://www.sueddeutsche.de/muenchen/rss
 Active: True
 ================================================================================
 Fetching RSS feed...
 ✓ Found 20 entries
 --- Entry 1 ---
 Title: New U-Bahn Line Opens in Munich
 ✓ URL extracted: https://www.sueddeutsche.de/muenchen/article-123
 ✓ Summary: The new U-Bahn line connecting the city center...
 ✓ Date: Mon, 10 Nov 2024 10:00:00 +0100
 --- Entry 2 ---
 Title: Munich Weather Update
 ✓ URL extracted: https://www.sueddeutsche.de/muenchen/article-124
 ✓ Summary: Weather forecast for the week...
 ✓ Date: Mon, 10 Nov 2024 09:30:00 +0100
 ...
 ```
 ## If No Feeds Found
 Add a feed first:
 ```bash
 curl -X POST http://localhost:5001/api/rss-feeds \
  -H "Content-Type: application/json" \
  -d '{"name": "Süddeutsche Politik", "url": "https://rss.sueddeutsche.de/rss/Politik"}'
 ```
 ## Testing News Crawler
 Once feeds are verified, test the crawler:
 ```bash
 # 1. Install crawler dependencies
 cd news_crawler
 pip install -r requirements.txt
 # 2. Run the test
 python test_rss_feeds.py
 # 3. Or run the actual crawler
 python crawler_service.py 5
 ```
 ## Troubleshooting
 ### "No module named 'pymongo'"
 - Activate the backend virtual environment first
 - Or install dependencies: `pip install -r backend/requirements.txt`
 ### "No RSS feeds in database"
 - Make sure backend is running
 - Add feeds via API (see above)
 - Or check if MongoDB is running: `docker-compose ps`
 ### "Could not extract URL"
 - The test will show available fields
 - Check if the feed uses `guid`, `id`, or `links` instead of `link`
 - Our utility should handle most cases automatically
 ### "No entries found"
 - The RSS feed URL might be invalid
 - Try opening the URL in a browser
 - Check if it returns valid XML
 ## Manual Database Check
 Using mongosh:
 ```bash
 mongosh
 use munich_news
 db.rss_feeds.find()
 db.articles.find().limit(3)
 ```
 ## What to Look For
 ✅ **Good signs:**
 - URLs are extracted successfully
 - URLs start with `http://` or `https://`
 - Summaries are present
 - Dates are extracted
 ⚠️ **Warning signs:**
 - "Could not extract URL" messages
 - Empty summaries (not critical)
 - Missing dates (not critical)
 ❌ **Problems:**
 - No entries found in feed
 - All URL extractions fail
 - Feed parsing errors
--- a/backend/DATABASE_SCHEMA.md
+++ b/backend/DATABASE_SCHEMA.md
@@ -0,0 +1,143 @@
 # MongoDB Database Schema
 This document describes the MongoDB collections and their structure for Munich News Daily.
 ## Collections
 ### 1. Articles Collection (`articles`)
 Stores all news articles aggregated from Munich news sources.
 **Document Structure:**
 ```javascript
 {
  _id: ObjectId,                    // Auto-generated MongoDB ID
  title: String,                    // Article title (required)
  author: String,                   // Article author (optional, extracted during crawl)
  link: String,                     // Article URL (required, unique)
  content: String,                  // Full article content (no length limit)
  summary: String,                  // AI-generated English summary (≤150 words)
  word_count: Number,               // Word count of full content
  summary_word_count: Number,       // Word count of AI summary
  source: String,                   // News source name (e.g., "Süddeutsche Zeitung München")
  published_at: String,             // Original publication date from RSS feed or crawled
  crawled_at: DateTime,             // When article content was crawled (UTC)
  summarized_at: DateTime,          // When AI summary was generated (UTC)
  created_at: DateTime              // When article was added to database (UTC)
 }
 ```
 **Indexes:**
 - `link` - Unique index to prevent duplicate articles
 - `created_at` - Index for efficient sorting by date
 **Example Document:**
 ```javascript
 {
  _id: ObjectId("507f1f77bcf86cd799439011"),
  title: "New U-Bahn Line Opens in Munich",
  author: "Max Mustermann",
  link: "https://www.sueddeutsche.de/muenchen/ubahn-1.123456",
  content: "The new U-Bahn line connecting the city center with the airport opened today. Mayor Dieter Reiter attended the opening ceremony... [full article text continues]",
  summary: "Munich's new U-Bahn line connecting the city center to the airport opened today with Mayor Dieter Reiter in attendance. The line features 10 stations and runs every 10 minutes during peak hours, significantly reducing travel time. Construction took five years and cost approximately 2 billion euros.",
  word_count: 1250,
  summary_word_count: 48,
  source: "Süddeutsche Zeitung München",
  published_at: "Mon, 15 Jan 2024 10:00:00 +0100",
  crawled_at: ISODate("2024-01-15T09:30:00.000Z"),
  summarized_at: ISODate("2024-01-15T09:30:15.000Z"),
  created_at: ISODate("2024-01-15T09:00:00.000Z")
 }
 ```
 ### 2. Subscribers Collection (`subscribers`)
 Stores all newsletter subscribers.
 **Document Structure:**
 ```javascript
 {
  _id: ObjectId,                    // Auto-generated MongoDB ID
  email: String,                    // Subscriber email (required, unique, lowercase)
  subscribed_at: DateTime,          // When user subscribed (UTC)
  status: String                    // Subscription status: 'active' or 'inactive'
 }
 ```
 **Indexes:**
 - `email` - Unique index for email lookups and preventing duplicates
 - `subscribed_at` - Index for analytics and sorting
 **Example Document:**
 ```javascript
 {
  _id: ObjectId("507f1f77bcf86cd799439012"),
  email: "user@example.com",
  subscribed_at: ISODate("2024-01-15T08:30:00.000Z"),
  status: "active"
 }
 ```
 ## Design Decisions
 ### Why MongoDB?
 1. **Flexibility**: Easy to add new fields without schema migrations
 2. **Scalability**: Handles large volumes of articles and subscribers efficiently
 3. **Performance**: Indexes on frequently queried fields (link, email, created_at)
 4. **Document Model**: Natural fit for news articles and subscriber data
 ### Schema Choices
 1. **Unique Link Index**: Prevents duplicate articles from being stored, even if fetched multiple times
 2. **Status Field**: Soft delete for subscribers (set to 'inactive' instead of deleting) - allows for analytics and easy re-subscription
 3. **UTC Timestamps**: All dates stored in UTC for consistency across timezones
 4. **Lowercase Emails**: Emails stored in lowercase to prevent case-sensitivity issues
 ### Future Enhancements
 Potential fields to add in the future:
 **Articles:**
 - `category`: String (e.g., "politics", "sports", "culture")
 - `tags`: Array of Strings
 - `image_url`: String
 - `sent_in_newsletter`: Boolean (track if article was sent)
 - `sent_at`: DateTime (when article was included in newsletter)
 **Subscribers:**
 - `preferences`: Object (newsletter frequency, categories, etc.)
 - `last_sent_at`: DateTime (last newsletter sent date)
 - `unsubscribed_at`: DateTime (when user unsubscribed)
 - `verification_token`: String (for email verification)
 ## AI Summarization Workflow
 When the crawler processes an article:
 1. **Extract Content**: Full article text is extracted from the webpage
 2. **Summarize with Ollama**: If `OLLAMA_ENABLED=true`, the content is sent to Ollama for summarization
 3. **Store Both**: Both the original `content` and AI-generated `summary` are stored
 4. **Fallback**: If Ollama is unavailable or fails, only the original content is stored
 ### Summary Field Details
 - **Language**: Always in English, regardless of source article language
 - **Length**: Maximum 150 words
 - **Format**: Plain text, concise and clear
 - **Purpose**: Quick preview for newsletters and frontend display
 ### Querying Articles
 ```javascript
 // Get articles with AI summaries
 db.articles.find({ summary: { $exists: true, $ne: null } })
 // Get articles without summaries
 db.articles.find({ summary: { $exists: false } })
 // Count summarized articles
 db.articles.countDocuments({ summary: { $exists: true, $ne: null } })
 ```
--- a/backend/STRUCTURE.md
+++ b/backend/STRUCTURE.md
@@ -0,0 +1,98 @@
 # Backend Structure
 The backend has been modularized for better maintainability and scalability.
 ## Directory Structure
 ```
 backend/
 ├── app.py                      # Main Flask application entry point
 ├── config.py                   # Configuration management
 ├── database.py                 # Database connection and initialization
 ├── requirements.txt            # Python dependencies
 ├── .env                        # Environment variables
 │
 ├── routes/                     # API route handlers (blueprints)
 │   ├── __init__.py
 │   ├── subscription_routes.py # /api/subscribe, /api/unsubscribe
 │   ├── news_routes.py          # /api/news, /api/stats
 │   ├── rss_routes.py           # /api/rss-feeds (CRUD operations)
 │   └── ollama_routes.py        # /api/ollama/* (AI features)
 │
 └── services/                   # Business logic layer
    ├── __init__.py
    ├── news_service.py         # News fetching and storage logic
    ├── email_service.py        # Newsletter email sending
    └── ollama_service.py       # Ollama AI integration
 ```
 ## Key Components
 ### app.py
 - Main Flask application
 - Registers all blueprints
 - Minimal code, just wiring things together
 ### config.py
 - Centralized configuration
 - Loads environment variables
 - Single source of truth for all settings
 ### database.py
 - MongoDB connection setup
 - Collection definitions
 - Database initialization with indexes
 ### routes/
 Each route file is a Flask Blueprint handling specific API endpoints:
 - **subscription_routes.py**: User subscription management
 - **news_routes.py**: News fetching and statistics
 - **rss_routes.py**: RSS feed management (add/remove/list/toggle)
 - **ollama_routes.py**: AI/Ollama integration endpoints
 ### services/
 Business logic separated from route handlers:
 - **news_service.py**: Fetches news from RSS feeds, saves to database
 - **email_service.py**: Sends newsletter emails to subscribers
 - **ollama_service.py**: Communicates with Ollama AI server
 ## Benefits of This Structure
 1. **Separation of Concerns**: Routes handle HTTP, services handle business logic
 2. **Testability**: Each module can be tested independently
 3. **Maintainability**: Easy to find and modify specific functionality
 4. **Scalability**: Easy to add new routes or services
 5. **Reusability**: Services can be used by multiple routes
 ## Adding New Features
 ### To add a new API endpoint:
 1. Create a new route file in `routes/` or add to existing one
 2. Create a Blueprint and define routes
 3. Register the blueprint in `app.py`
 ### To add new business logic:
 1. Create a new service file in `services/`
 2. Import and use in your route handlers
 ### Example:
 ```python
 # services/my_service.py
 def my_business_logic():
    return "Hello"
 # routes/my_routes.py
 from flask import Blueprint
 from services.my_service import my_business_logic
 my_bp = Blueprint('my', __name__)
@my_bp.route('/api/my-endpoint')
 def my_endpoint():
    result = my_business_logic()
    return {'message': result}
 # app.py
 from routes.my_routes import my_bp
 app.register_blueprint(my_bp)
 ```
--- a/backend/app.py
+++ b/backend/app.py
@@ -0,0 +1,29 @@
 from flask import Flask
 from flask_cors import CORS
 from config import Config
 from database import init_db
 from routes.subscription_routes import subscription_bp
 from routes.news_routes import news_bp
 from routes.rss_routes import rss_bp
 from routes.ollama_routes import ollama_bp
 from routes.newsletter_routes import newsletter_bp
 # Initialize Flask app
 app = Flask(__name__)
 CORS(app)
 # Initialize database
 init_db()
 # Register blueprints
 app.register_blueprint(subscription_bp)
 app.register_blueprint(news_bp)
 app.register_blueprint(rss_bp)
 app.register_blueprint(ollama_bp)
 app.register_blueprint(newsletter_bp)
 # Print configuration
 Config.print_config()
 if __name__ == '__main__':
    app.run(debug=True, port=Config.FLASK_PORT, host='127.0.0.1')
--- a/backend/config.py
+++ b/backend/config.py
@@ -0,0 +1,52 @@
 import os
 from dotenv import load_dotenv
 from pathlib import Path
 # Get the directory where this script is located
 backend_dir = Path(__file__).parent
 env_path = backend_dir / '.env'
 # Load .env file
 load_dotenv(dotenv_path=env_path)
 # Debug: Print if .env file exists (for troubleshooting)
 if env_path.exists():
    print(f"✓ Loading .env file from: {env_path}")
 else:
    print(f"⚠ Warning: .env file not found at {env_path}")
    print(f"  Current working directory: {os.getcwd()}")
    print(f"  Looking for .env in: {env_path}")
 class Config:
    """Application configuration"""
    # MongoDB
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'
    # Email
    SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
    SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
    EMAIL_USER = os.getenv('EMAIL_USER', '')
    EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')
    # Ollama
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
    OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
    OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
    # Flask
    FLASK_PORT = int(os.getenv('FLASK_PORT', '5000'))
    @classmethod
    def print_config(cls):
        """Print configuration (without sensitive data)"""
        print("\nApplication Configuration:")
        print(f"  MongoDB URI: {cls.MONGODB_URI}")
        print(f"  Database: {cls.DB_NAME}")
        print(f"  Flask Port: {cls.FLASK_PORT}")
        print(f"  Ollama Base URL: {cls.OLLAMA_BASE_URL}")
        print(f"  Ollama Model: {cls.OLLAMA_MODEL}")
        print(f"  Ollama Enabled: {cls.OLLAMA_ENABLED}")
--- a/backend/database.py
+++ b/backend/database.py
@@ -0,0 +1,53 @@
 from pymongo import MongoClient
 from datetime import datetime
 from config import Config
 # MongoDB setup
 client = MongoClient(Config.MONGODB_URI)
 db = client[Config.DB_NAME]
 # Collections
 articles_collection = db['articles']
 subscribers_collection = db['subscribers']
 rss_feeds_collection = db['rss_feeds']
 def init_db():
    """Initialize database with indexes"""
    # Create unique index on article links to prevent duplicates
    articles_collection.create_index('link', unique=True)
    # Create index on created_at for faster sorting
    articles_collection.create_index('created_at')
    # Create unique index on subscriber emails
    subscribers_collection.create_index('email', unique=True)
    # Create index on subscribed_at
    subscribers_collection.create_index('subscribed_at')
    # Create unique index on RSS feed URLs
    rss_feeds_collection.create_index('url', unique=True)
    # Initialize default RSS feeds if collection is empty
    if rss_feeds_collection.count_documents({}) == 0:
        default_feeds = [
            {
                'name': 'Süddeutsche Zeitung München',
                'url': 'https://www.sueddeutsche.de/muenchen/rss',
                'active': True,
                'created_at': datetime.utcnow()
            },
            {
                'name': 'Münchner Merkur',
                'url': 'https://www.merkur.de/muenchen/rss',
                'active': True,
                'created_at': datetime.utcnow()
            },
            {
                'name': 'Abendzeitung München',
                'url': 'https://www.abendzeitung-muenchen.de/rss',
                'active': True,
                'created_at': datetime.utcnow()
            }
        ]
        rss_feeds_collection.insert_many(default_feeds)
        print(f"Initialized {len(default_feeds)} default RSS feeds")
    print("Database initialized with indexes")
--- a/backend/env.template
+++ b/backend/env.template
@@ -0,0 +1,32 @@
 # MongoDB Configuration
 # For Docker Compose (no authentication):
 MONGODB_URI=mongodb://localhost:27017/
 # For Docker Compose with authentication:
 # MONGODB_URI=mongodb://admin:password@localhost:27017/
 # For MongoDB Atlas (cloud):
 # MONGODB_URI=mongodb+srv://username:password@cluster.mongodb.net/
 # Email Configuration (for sending newsletters)
 SMTP_SERVER=smtp.gmail.com
 SMTP_PORT=587
 EMAIL_USER=your-email@gmail.com
 EMAIL_PASSWORD=your-app-password
 # Note: For Gmail, use an App Password: https://support.google.com/accounts/answer/185833
 # Ollama Configuration (for AI-powered features)
 # Remote Ollama server URL (e.g., http://your-server-ip:11434 or https://your-domain.com)
 OLLAMA_BASE_URL=http://localhost:11434
 # Optional: API key if your Ollama server requires authentication
 # OLLAMA_API_KEY=your-api-key-here
 # Model name to use (e.g., llama2, mistral, codellama, llama3, phi3:latest)
 OLLAMA_MODEL=phi3:latest
 # Enable/disable Ollama features (true/false)
 # When enabled, the crawler will automatically summarize articles in English (≤150 words)
 OLLAMA_ENABLED=true
 # Timeout for Ollama requests in seconds (default: 30)
 OLLAMA_TIMEOUT=30
 # Flask Server Configuration
 # Port for Flask server (default: 5001 to avoid AirPlay conflict on macOS)
 FLASK_PORT=5001
--- a/backend/fix_duplicates.py
+++ b/backend/fix_duplicates.py
@@ -0,0 +1,61 @@
 """
 Script to fix duplicate RSS feeds and create unique index
 Run this once: python fix_duplicates.py
 """
 from pymongo import MongoClient
 from config import Config
 # Connect to MongoDB
 client = MongoClient(Config.MONGODB_URI)
 db = client[Config.DB_NAME]
 rss_feeds_collection = db['rss_feeds']
 print("Fixing duplicate RSS feeds...")
 # Get all feeds
 all_feeds = list(rss_feeds_collection.find())
 print(f"Total feeds found: {len(all_feeds)}")
 # Find duplicates by URL
 seen_urls = {}
 duplicates_to_remove = []
 for feed in all_feeds:
    url = feed.get('url')
    if url in seen_urls:
        # This is a duplicate, mark for removal
        duplicates_to_remove.append(feed['_id'])
        print(f"  Duplicate found: {feed['name']} - {url}")
    else:
        # First occurrence, keep it
        seen_urls[url] = feed['_id']
 # Remove duplicates
 if duplicates_to_remove:
    result = rss_feeds_collection.delete_many({'_id': {'$in': duplicates_to_remove}})
    print(f"Removed {result.deleted_count} duplicate feeds")
 else:
    print("No duplicates found")
 # Drop existing indexes (if any)
 print("\nDropping existing indexes...")
 try:
    rss_feeds_collection.drop_indexes()
    print("Indexes dropped")
 except Exception as e:
    print(f"Note: {e}")
 # Create unique index on URL
 print("\nCreating unique index on 'url' field...")
 rss_feeds_collection.create_index('url', unique=True)
 print("✓ Unique index created successfully")
 # Verify
 remaining_feeds = list(rss_feeds_collection.find())
 print(f"\nFinal feed count: {len(remaining_feeds)}")
 print("\nRemaining feeds:")
 for feed in remaining_feeds:
    print(f"  - {feed['name']}: {feed['url']}")
 print("\n✓ Done! Duplicates removed and unique index created.")
 print("You can now restart your Flask app.")
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -0,0 +1,8 @@
 Flask==3.0.0
 flask-cors==4.0.0
 feedparser==6.0.10
 python-dotenv==1.0.0
 pymongo==4.6.1
 requests==2.31.0
 Jinja2==3.1.2
--- a/backend/routes/init.py
+++ b/backend/routes/init.py
@@ -0,0 +1 @@
 # Routes package
--- a/backend/routes/news_routes.py
+++ b/backend/routes/news_routes.py
@@ -0,0 +1,123 @@
 from flask import Blueprint, jsonify
 from database import articles_collection
 from services.news_service import fetch_munich_news, save_articles_to_db
 news_bp = Blueprint('news', __name__)
@news_bp.route('/api/news', methods=['GET'])
 def get_news():
    """Get latest Munich news"""
    try:
        # Fetch fresh news and save to database
        articles = fetch_munich_news()
        save_articles_to_db(articles)
        # Get articles from MongoDB, sorted by created_at (newest first)
        cursor = articles_collection.find().sort('created_at', -1).limit(20)
        db_articles = []
        for doc in cursor:
            article = {
                'title': doc.get('title', ''),
                'author': doc.get('author'),
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
                'word_count': doc.get('word_count'),
                'has_full_content': bool(doc.get('content')),
                'has_summary': bool(doc.get('summary'))
            }
            # Include AI summary if available
            if doc.get('summary'):
                article['summary'] = doc.get('summary', '')
                article['summary_word_count'] = doc.get('summary_word_count')
                article['summarized_at'] = doc.get('summarized_at', '').isoformat() if doc.get('summarized_at') else None
            # Fallback: Include preview of content if no summary (first 200 chars)
            elif doc.get('content'):
                article['preview'] = doc.get('content', '')[:200] + '...'
            db_articles.append(article)
        # Combine fresh articles with database articles and deduplicate
        seen_links = set()
        combined = []
        # Add fresh articles first (they're more recent)
        for article in articles:
            link = article.get('link', '')
            if link and link not in seen_links:
                seen_links.add(link)
                combined.append(article)
        # Add database articles
        for article in db_articles:
            link = article.get('link', '')
            if link and link not in seen_links:
                seen_links.add(link)
                combined.append(article)
        return jsonify({'articles': combined[:20]}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
 def get_article_by_url(article_url):
    """Get full article content by URL"""
    try:
        # Decode URL
        from urllib.parse import unquote
        decoded_url = unquote(article_url)
        # Find article by link
        article = articles_collection.find_one({'link': decoded_url})
        if not article:
            return jsonify({'error': 'Article not found'}), 404
        return jsonify({
            'title': article.get('title', ''),
            'author': article.get('author'),
            'link': article.get('link', ''),
            'content': article.get('content', ''),
            'summary': article.get('summary'),
            'word_count': article.get('word_count', 0),
            'summary_word_count': article.get('summary_word_count'),
            'source': article.get('source', ''),
            'published_at': article.get('published_at', ''),
            'crawled_at': article.get('crawled_at', '').isoformat() if article.get('crawled_at') else None,
            'summarized_at': article.get('summarized_at', '').isoformat() if article.get('summarized_at') else None,
            'created_at': article.get('created_at', '').isoformat() if article.get('created_at') else None
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/stats', methods=['GET'])
 def get_stats():
    """Get subscription statistics"""
    try:
        from database import subscribers_collection
        # Count only active subscribers
        subscriber_count = subscribers_collection.count_documents({'status': 'active'})
        # Also get total article count
        article_count = articles_collection.count_documents({})
        # Count crawled articles
        crawled_count = articles_collection.count_documents({'content': {'$exists': True, '$ne': ''}})
        # Count summarized articles
        summarized_count = articles_collection.count_documents({'summary': {'$exists': True, '$ne': ''}})
        return jsonify({
            'subscribers': subscriber_count,
            'articles': article_count,
            'crawled_articles': crawled_count,
            'summarized_articles': summarized_count
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
--- a/backend/routes/newsletter_routes.py
+++ b/backend/routes/newsletter_routes.py
@@ -0,0 +1,62 @@
 from flask import Blueprint, Response
 from pathlib import Path
 from jinja2 import Template
 from datetime import datetime
 from database import articles_collection
 newsletter_bp = Blueprint('newsletter', __name__)
@newsletter_bp.route('/api/newsletter/preview', methods=['GET'])
 def preview_newsletter():
    """Preview the newsletter HTML (for testing)"""
    try:
        # Get latest articles with AI summaries
        cursor = articles_collection.find(
            {'summary': {'$exists': True, '$ne': None}}
        ).sort('created_at', -1).limit(10)
        articles = []
        for doc in cursor:
            articles.append({
                'title': doc.get('title', ''),
                'author': doc.get('author'),
                'link': doc.get('link', ''),
                'summary': doc.get('summary', ''),
                'source': doc.get('source', ''),
                'published_at': doc.get('published_at', '')
            })
        if not articles:
            return Response(
                "<h1>No articles with summaries found</h1><p>Run the crawler with Ollama enabled first.</p>",
                mimetype='text/html'
            )
        # Load template
        template_path = Path(__file__).parent.parent / 'templates' / 'newsletter_template.html'
        with open(template_path, 'r', encoding='utf-8') as f:
            template_content = f.read()
        template = Template(template_content)
        # Prepare data
        now = datetime.now()
        template_data = {
            'date': now.strftime('%A, %B %d, %Y'),
            'year': now.year,
            'article_count': len(articles),
            'articles': articles,
            'unsubscribe_link': 'http://localhost:3000/unsubscribe',
            'website_link': 'http://localhost:3000'
        }
        # Render and return HTML
        html_content = template.render(**template_data)
        return Response(html_content, mimetype='text/html')
    except Exception as e:
        return Response(
            f"<h1>Error</h1><p>{str(e)}</p>",
            mimetype='text/html'
        ), 500
--- a/backend/routes/ollama_routes.py
+++ b/backend/routes/ollama_routes.py
@@ -0,0 +1,158 @@
 from flask import Blueprint, jsonify
 from config import Config
 from services.ollama_service import call_ollama, list_ollama_models
 import os
 ollama_bp = Blueprint('ollama', __name__)
@ollama_bp.route('/api/ollama/ping', methods=['GET', 'POST'])
 def ping_ollama():
    """Test connection to Ollama server"""
    try:
        # Check if Ollama is enabled
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': False
                }
            }), 200
        # Send a simple test prompt
        test_prompt = "Say 'Hello! I am connected and working.' in one sentence."
        system_prompt = "You are a helpful assistant. Respond briefly and concisely."
        response_text, error_message = call_ollama(test_prompt, system_prompt)
        if response_text:
            return jsonify({
                'status': 'success',
                'message': 'Successfully connected to Ollama',
                'response': response_text,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 200
        else:
            # Try to get available models for better error message
            available_models, _ = list_ollama_models()
            troubleshooting = {
                'check_server': f'Verify Ollama is running at {Config.OLLAMA_BASE_URL}',
                'check_model': f'Verify model "{Config.OLLAMA_MODEL}" is available (run: ollama list)',
                'test_connection': f'Test manually: curl {Config.OLLAMA_BASE_URL}/api/generate -d \'{{"model":"{Config.OLLAMA_MODEL}","prompt":"test"}}\''
            }
            if available_models:
                troubleshooting['available_models'] = available_models
                troubleshooting['suggestion'] = f'Try setting OLLAMA_MODEL to one of: {", ".join(available_models[:5])}'
            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to get response from Ollama',
                'error_details': error_message,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                },
                'troubleshooting': troubleshooting
            }), 500
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error connecting to Ollama: {str(e)}',
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500
@ollama_bp.route('/api/ollama/config', methods=['GET'])
 def get_ollama_config():
    """Get current Ollama configuration (for debugging)"""
    try:
        from pathlib import Path
        backend_dir = Path(__file__).parent.parent
        env_path = backend_dir / '.env'
        return jsonify({
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED,
                'has_api_key': bool(Config.OLLAMA_API_KEY)
            },
            'env_file_path': str(env_path),
            'env_file_exists': env_path.exists(),
            'current_working_directory': os.getcwd()
        }), 200
    except Exception as e:
        return jsonify({
            'error': str(e),
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500
@ollama_bp.route('/api/ollama/models', methods=['GET'])
 def get_ollama_models():
    """List available models on Ollama server"""
    try:
        if not Config.OLLAMA_ENABLED:
            return jsonify({
                'status': 'disabled',
                'message': 'Ollama is not enabled. Set OLLAMA_ENABLED=true in your .env file.',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': False
                }
            }), 200
        models, error_message = list_ollama_models()
        if models is not None:
            return jsonify({
                'status': 'success',
                'models': models,
                'current_model': Config.OLLAMA_MODEL,
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 200
        else:
            return jsonify({
                'status': 'error',
                'message': error_message or 'Failed to list models',
                'ollama_config': {
                    'base_url': Config.OLLAMA_BASE_URL,
                    'model': Config.OLLAMA_MODEL,
                    'enabled': True
                }
            }), 500
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': f'Error listing models: {str(e)}',
            'ollama_config': {
                'base_url': Config.OLLAMA_BASE_URL,
                'model': Config.OLLAMA_MODEL,
                'enabled': Config.OLLAMA_ENABLED
            }
        }), 500
--- a/backend/routes/rss_routes.py
+++ b/backend/routes/rss_routes.py
@@ -0,0 +1,124 @@
 from flask import Blueprint, request, jsonify
 from datetime import datetime
 from pymongo.errors import DuplicateKeyError
 from bson.objectid import ObjectId
 import feedparser
 from database import rss_feeds_collection
 rss_bp = Blueprint('rss', __name__)
@rss_bp.route('/api/rss-feeds', methods=['GET'])
 def get_rss_feeds():
    """Get all RSS feeds"""
    try:
        cursor = rss_feeds_collection.find().sort('created_at', -1)
        feeds = []
        for feed in cursor:
            feeds.append({
                'id': str(feed['_id']),
                'name': feed.get('name', ''),
                'url': feed.get('url', ''),
                'active': feed.get('active', True),
                'created_at': feed.get('created_at', '').isoformat() if feed.get('created_at') else ''
            })
        return jsonify({'feeds': feeds}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@rss_bp.route('/api/rss-feeds', methods=['POST'])
 def add_rss_feed():
    """Add a new RSS feed"""
    data = request.json
    name = data.get('name', '').strip()
    url = data.get('url', '').strip()
    if not name or not url:
        return jsonify({'error': 'Name and URL are required'}), 400
    if not url.startswith('http://') and not url.startswith('https://'):
        return jsonify({'error': 'URL must start with http:// or https://'}), 400
    try:
        # Test if the RSS feed is valid
        try:
            feed = feedparser.parse(url)
            if not feed.entries:
                return jsonify({'error': 'Invalid RSS feed or no entries found'}), 400
        except Exception as e:
            return jsonify({'error': f'Failed to parse RSS feed: {str(e)}'}), 400
        feed_doc = {
            'name': name,
            'url': url,
            'active': True,
            'created_at': datetime.utcnow()
        }
        try:
            result = rss_feeds_collection.insert_one(feed_doc)
            return jsonify({
                'message': 'RSS feed added successfully',
                'id': str(result.inserted_id)
            }), 201
        except DuplicateKeyError:
            return jsonify({'error': 'RSS feed URL already exists'}), 409
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@rss_bp.route('/api/rss-feeds/<feed_id>', methods=['DELETE'])
 def remove_rss_feed(feed_id):
    """Remove an RSS feed"""
    try:
        # Validate ObjectId
        try:
            obj_id = ObjectId(feed_id)
        except Exception:
            return jsonify({'error': 'Invalid feed ID'}), 400
        result = rss_feeds_collection.delete_one({'_id': obj_id})
        if result.deleted_count > 0:
            return jsonify({'message': 'RSS feed removed successfully'}), 200
        else:
            return jsonify({'error': 'RSS feed not found'}), 404
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@rss_bp.route('/api/rss-feeds/<feed_id>/toggle', methods=['PATCH'])
 def toggle_rss_feed(feed_id):
    """Toggle RSS feed active status"""
    try:
        # Validate ObjectId
        try:
            obj_id = ObjectId(feed_id)
        except Exception:
            return jsonify({'error': 'Invalid feed ID'}), 400
        # Get current status
        feed = rss_feeds_collection.find_one({'_id': obj_id})
        if not feed:
            return jsonify({'error': 'RSS feed not found'}), 404
        # Toggle status
        new_status = not feed.get('active', True)
        result = rss_feeds_collection.update_one(
            {'_id': obj_id},
            {'$set': {'active': new_status}}
        )
        if result.modified_count > 0:
            return jsonify({
                'message': f'RSS feed {"activated" if new_status else "deactivated"} successfully',
                'active': new_status
            }), 200
        else:
            return jsonify({'error': 'Failed to update RSS feed'}), 500
    except Exception as e:
        return jsonify({'error': str(e)}), 500
--- a/backend/routes/subscription_routes.py
+++ b/backend/routes/subscription_routes.py
@@ -0,0 +1,63 @@
 from flask import Blueprint, request, jsonify
 from datetime import datetime
 from pymongo.errors import DuplicateKeyError
 from database import subscribers_collection
 subscription_bp = Blueprint('subscription', __name__)
@subscription_bp.route('/api/subscribe', methods=['POST'])
 def subscribe():
    """Subscribe a user to the newsletter"""
    data = request.json
    email = data.get('email', '').strip().lower()
    if not email or '@' not in email:
        return jsonify({'error': 'Invalid email address'}), 400
    try:
        subscriber_doc = {
            'email': email,
            'subscribed_at': datetime.utcnow(),
            'status': 'active'
        }
        # Try to insert, if duplicate key error, subscriber already exists
        try:
            subscribers_collection.insert_one(subscriber_doc)
            return jsonify({'message': 'Successfully subscribed!'}), 201
        except DuplicateKeyError:
            # Check if subscriber is active
            existing = subscribers_collection.find_one({'email': email})
            if existing and existing.get('status') == 'active':
                return jsonify({'message': 'Email already subscribed'}), 200
            else:
                # Reactivate if previously unsubscribed
                subscribers_collection.update_one(
                    {'email': email},
                    {'$set': {'status': 'active', 'subscribed_at': datetime.utcnow()}}
                )
                return jsonify({'message': 'Successfully re-subscribed!'}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@subscription_bp.route('/api/unsubscribe', methods=['POST'])
 def unsubscribe():
    """Unsubscribe a user from the newsletter"""
    data = request.json
    email = data.get('email', '').strip().lower()
    try:
        result = subscribers_collection.update_one(
            {'email': email},
            {'$set': {'status': 'inactive'}}
        )
        if result.matched_count > 0:
            return jsonify({'message': 'Successfully unsubscribed'}), 200
        else:
            return jsonify({'error': 'Email not found in subscribers'}), 404
    except Exception as e:
        return jsonify({'error': str(e)}), 500
--- a/backend/services/init.py
+++ b/backend/services/init.py
@@ -0,0 +1 @@
 # Services package
--- a/backend/services/email_service.py
+++ b/backend/services/email_service.py
@@ -0,0 +1,88 @@
 import smtplib
 from email.mime.text import MIMEText
 from email.mime.multipart import MIMEMultipart
 from datetime import datetime
 from pathlib import Path
 from jinja2 import Template
 from config import Config
 from database import subscribers_collection, articles_collection
 def send_newsletter(max_articles=10):
    """Send newsletter to all subscribers with AI-summarized articles"""
    if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
        print("Email credentials not configured")
        return
    # Get latest articles with AI summaries from database
    cursor = articles_collection.find(
        {'summary': {'$exists': True, '$ne': None}}
    ).sort('created_at', -1).limit(max_articles)
    articles = []
    for doc in cursor:
        articles.append({
            'title': doc.get('title', ''),
            'author': doc.get('author'),
            'link': doc.get('link', ''),
            'summary': doc.get('summary', ''),
            'source': doc.get('source', ''),
            'published_at': doc.get('published_at', '')
        })
    if not articles:
        print("No articles with summaries to send")
        return
    # Load email template
    template_path = Path(__file__).parent.parent / 'templates' / 'newsletter_template.html'
    with open(template_path, 'r', encoding='utf-8') as f:
        template_content = f.read()
    template = Template(template_content)
    # Prepare template data
    now = datetime.now()
    template_data = {
        'date': now.strftime('%A, %B %d, %Y'),
        'year': now.year,
        'article_count': len(articles),
        'articles': articles,
        'unsubscribe_link': 'http://localhost:3000',  # Update with actual unsubscribe link
        'website_link': 'http://localhost:3000'
    }
    # Render HTML
    html_content = template.render(**template_data)
    # Get all active subscribers
    subscribers_cursor = subscribers_collection.find({'status': 'active'})
    subscribers = [doc['email'] for doc in subscribers_cursor]
    # Send emails
    for subscriber in subscribers:
        try:
            msg = MIMEMultipart('alternative')
            msg['Subject'] = f'Munich News Daily - {datetime.now().strftime("%B %d, %Y")}'
            msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
            msg['To'] = subscriber
            msg['Date'] = datetime.now().strftime('%a, %d %b %Y %H:%M:%S %z')
            msg['Message-ID'] = f'<{datetime.now().timestamp()}.{subscriber}@dongho.kim>'
            msg['X-Mailer'] = 'Munich News Daily'
            # Add plain text version as fallback
            plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
            msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
            # Add HTML version
            msg.attach(MIMEText(html_content, 'html', 'utf-8'))
            server = smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT)
            server.starttls()
            server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
            server.send_message(msg)
            server.quit()
            print(f"Newsletter sent to {subscriber}")
        except Exception as e:
            print(f"Error sending to {subscriber}: {e}")
--- a/backend/services/news_service.py
+++ b/backend/services/news_service.py
@@ -0,0 +1,90 @@
 import feedparser
 from datetime import datetime
 from pymongo.errors import DuplicateKeyError
 from database import articles_collection, rss_feeds_collection
 from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
 def get_active_rss_feeds():
    """Get all active RSS feeds from database"""
    feeds = []
    cursor = rss_feeds_collection.find({'active': True})
    for feed in cursor:
        feeds.append({
            'name': feed.get('name', ''),
            'url': feed.get('url', '')
        })
    return feeds
 def fetch_munich_news():
    """Fetch news from Munich news sources"""
    articles = []
    # Get RSS feeds from database instead of hardcoded list
    sources = get_active_rss_feeds()
    for source in sources:
        try:
            feed = feedparser.parse(source['url'])
            for entry in feed.entries[:5]:  # Get top 5 from each source
                # Extract article URL using utility function
                article_url = extract_article_url(entry)
                if not article_url:
                    print(f"  ⚠ No valid URL for: {entry.get('title', 'Unknown')[:50]}")
                    continue  # Skip entries without valid URL
                # Extract summary
                summary = extract_article_summary(entry)
                if summary:
                    summary = summary[:200] + '...' if len(summary) > 200 else summary
                articles.append({
                    'title': entry.get('title', ''),
                    'link': article_url,
                    'summary': summary,
                    'source': source['name'],
                    'published': extract_published_date(entry)
                })
        except Exception as e:
            print(f"Error fetching from {source['name']}: {e}")
    return articles
 def save_articles_to_db(articles):
    """Save articles to MongoDB, avoiding duplicates"""
    saved_count = 0
    for article in articles:
        try:
            # Prepare article document
            article_doc = {
                'title': article.get('title', ''),
                'link': article.get('link', ''),
                'summary': article.get('summary', ''),
                'source': article.get('source', ''),
                'published_at': article.get('published', ''),
                'created_at': datetime.utcnow()
            }
            # Use update_one with upsert to handle duplicates
            # This will insert if link doesn't exist, or update if it does
            result = articles_collection.update_one(
                {'link': article_doc['link']},
                {'$setOnInsert': article_doc},  # Only set on insert, don't update existing
                upsert=True
            )
            if result.upserted_id:
                saved_count += 1
        except DuplicateKeyError:
            # Link already exists, skip
            pass
        except Exception as e:
            print(f"Error saving article {article.get('link', 'unknown')}: {e}")
    if saved_count > 0:
        print(f"Saved {saved_count} new articles to database")
--- a/backend/services/ollama_service.py
+++ b/backend/services/ollama_service.py
@@ -0,0 +1,96 @@
 import requests
 from config import Config
 def list_ollama_models():
    """List available models on Ollama server"""
    if not Config.OLLAMA_ENABLED:
        return None, "Ollama is not enabled"
    try:
        url = f"{Config.OLLAMA_BASE_URL}/api/tags"
        headers = {}
        if Config.OLLAMA_API_KEY:
            headers["Authorization"] = f"Bearer {Config.OLLAMA_API_KEY}"
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        result = response.json()
        models = result.get('models', [])
        model_names = [model.get('name', '') for model in models]
        return model_names, None
    except requests.exceptions.RequestException as e:
        return None, f"Error listing models: {str(e)}"
    except Exception as e:
        return None, f"Unexpected error: {str(e)}"
 def call_ollama(prompt, system_prompt=None):
    """Call Ollama API to generate text"""
    if not Config.OLLAMA_ENABLED:
        return None, "Ollama is not enabled"
    try:
        url = f"{Config.OLLAMA_BASE_URL}/api/generate"
        payload = {
            "model": Config.OLLAMA_MODEL,
            "prompt": prompt,
            "stream": False
        }
        if system_prompt:
            payload["system"] = system_prompt
        headers = {}
        if Config.OLLAMA_API_KEY:
            headers["Authorization"] = f"Bearer {Config.OLLAMA_API_KEY}"
        print(f"Calling Ollama at {url} with model {Config.OLLAMA_MODEL}")
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        result = response.json()
        response_text = result.get('response', '').strip()
        if not response_text:
            return None, "Ollama returned empty response"
        return response_text, None
    except requests.exceptions.ConnectionError as e:
        error_msg = f"Cannot connect to Ollama server at {Config.OLLAMA_BASE_URL}. Is Ollama running?"
        print(f"Connection error: {error_msg}")
        return None, error_msg
    except requests.exceptions.Timeout:
        error_msg = "Request to Ollama timed out after 30 seconds"
        print(f"Timeout error: {error_msg}")
        return None, error_msg
    except requests.exceptions.HTTPError as e:
        # Check if it's a model not found error
        if e.response.status_code == 404:
            try:
                error_data = e.response.json()
                if 'model' in error_data.get('error', '').lower() and 'not found' in error_data.get('error', '').lower():
                    # Try to get available models
                    available_models, _ = list_ollama_models()
                    if available_models:
                        error_msg = f"Model '{Config.OLLAMA_MODEL}' not found. Available models: {', '.join(available_models)}"
                    else:
                        error_msg = f"Model '{Config.OLLAMA_MODEL}' not found. Use 'ollama list' on the server to see available models."
                else:
                    error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
            except (ValueError, KeyError):
                error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
        else:
            error_msg = f"HTTP error from Ollama: {e.response.status_code} - {e.response.text}"
        print(f"HTTP error: {error_msg}")
        return None, error_msg
    except requests.exceptions.RequestException as e:
        error_msg = f"Request error: {str(e)}"
        print(f"Request error: {error_msg}")
        return None, error_msg
    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        print(f"Unexpected error: {error_msg}")
        return None, error_msg
--- a/backend/templates/newsletter_template.html
+++ b/backend/templates/newsletter_template.html
@@ -0,0 +1,162 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <title>Munich News Daily</title>
    <!--[if mso]>
    <style type="text/css">
        body, table, td {font-family: Arial, Helvetica, sans-serif !important;}
    </style>
    <![endif]-->
 </head>
 <body style="margin: 0; padding: 0; background-color: #f4f4f4; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;">
    <!-- Wrapper Table -->
    <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f4f4f4;">
        <tr>
            <td align="center" style="padding: 20px 0;">
                <!-- Main Container -->
                <table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="background-color: #ffffff; max-width: 600px;">
                    <!-- Header -->
                    <tr>
                        <td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
                            <h1 style="margin: 0 0 8px 0; font-size: 28px; font-weight: 700; color: #ffffff; letter-spacing: -0.5px;">
                                Munich News Daily
                            </h1>
                            <p style="margin: 0; font-size: 14px; color: #999999; letter-spacing: 0.5px;">
                                {{ date }}
                            </p>
                        </td>
                    </tr>
                    <!-- Greeting -->
                    <tr>
                        <td style="padding: 30px 40px 20px 40px;">
                            <p style="margin: 0; font-size: 16px; line-height: 1.5; color: #333333;">
                                Good morning ☀️
                            </p>
                            <p style="margin: 15px 0 0 0; font-size: 15px; line-height: 1.6; color: #666666;">
                                Here's what's happening in Munich today. We've summarized {{ article_count }} stories using AI so you can stay informed in under 5 minutes.
                            </p>
                        </td>
                    </tr>
                    <!-- Divider -->
                    <tr>
                        <td style="padding: 0 40px;">
                            <div style="height: 1px; background-color: #e0e0e0;"></div>
                        </td>
                    </tr>
                    <!-- Articles -->
                    {% for article in articles %}
                    <tr>
                        <td style="padding: 25px 40px;">
                            <!-- Article Number Badge -->
                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
                                <tr>
                                    <td>
                                        <span style="display: inline-block; background-color: #000000; color: #ffffff; width: 24px; height: 24px; line-height: 24px; text-align: center; border-radius: 50%; font-size: 12px; font-weight: 600;">
                                            {{ loop.index }}
                                        </span>
                                    </td>
                                </tr>
                            </table>
                            <!-- Article Title -->
                            <h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
                                {{ article.title }}
                            </h2>
                            <!-- Article Meta -->
                            <p style="margin: 0 0 12px 0; font-size: 13px; color: #999999;">
                                <span style="color: #000000; font-weight: 600;">{{ article.source }}</span>
                                {% if article.author %}
                                <span> • {{ article.author }}</span>
                                {% endif %}
                            </p>
                            <!-- Article Summary -->
                            <p style="margin: 0 0 15px 0; font-size: 15px; line-height: 1.6; color: #333333;">
                                {{ article.summary }}
                            </p>
                            <!-- Read More Link -->
                            <a href="{{ article.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 14px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px;">
                                Read more →
                            </a>
                        </td>
                    </tr>
                    <!-- Article Divider -->
                    {% if not loop.last %}
                    <tr>
                        <td style="padding: 0 40px;">
                            <div style="height: 1px; background-color: #f0f0f0;"></div>
                        </td>
                    </tr>
                    {% endif %}
                    {% endfor %}
                    <!-- Bottom Divider -->
                    <tr>
                        <td style="padding: 25px 40px 0 40px;">
                            <div style="height: 1px; background-color: #e0e0e0;"></div>
                        </td>
                    </tr>
                    <!-- Summary Box -->
                    <tr>
                        <td style="padding: 30px 40px;">
                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f8f8; border-radius: 8px;">
                                <tr>
                                    <td style="padding: 25px; text-align: center;">
                                        <p style="margin: 0 0 8px 0; font-size: 13px; color: #666666; text-transform: uppercase; letter-spacing: 1px; font-weight: 600;">
                                            Today's Digest
                                        </p>
                                        <p style="margin: 0; font-size: 36px; font-weight: 700; color: #000000;">
                                            {{ article_count }}
                                        </p>
                                        <p style="margin: 8px 0 0 0; font-size: 14px; color: #666666;">
                                            stories • AI-summarized • 5 min read
                                        </p>
                                    </td>
                                </tr>
                            </table>
                        </td>
                    </tr>
                    <!-- Footer -->
                    <tr>
                        <td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
                            <p style="margin: 0 0 15px 0; font-size: 14px; color: #ffffff; font-weight: 600;">
                                Munich News Daily
                            </p>
                            <p style="margin: 0 0 20px 0; font-size: 13px; color: #999999; line-height: 1.5;">
                                AI-powered news summaries for busy people.<br>
                                Delivered daily to your inbox.
                            </p>
                            <!-- Footer Links -->
                            <p style="margin: 0; font-size: 12px; color: #666666;">
                                <a href="{{ website_link }}" style="color: #999999; text-decoration: none;">Visit Website</a>
                                <span style="color: #444444;"> • </span>
                                <a href="{{ unsubscribe_link }}" style="color: #999999; text-decoration: none;">Unsubscribe</a>
                            </p>
                            <p style="margin: 20px 0 0 0; font-size: 11px; color: #666666;">
                                © {{ year }} Munich News Daily. All rights reserved.
                            </p>
                        </td>
                    </tr>
                </table>
                <!-- End Main Container -->
            </td>
        </tr>
    </table>
    <!-- End Wrapper Table -->
 </body>
 </html>
--- a/backend/test_rss_extraction.py
+++ b/backend/test_rss_extraction.py
@@ -0,0 +1,128 @@
 #!/usr/bin/env python
 """
 Test RSS feed URL extraction
 Run from backend directory with venv activated:
  cd backend
  source venv/bin/activate  # or venv\Scripts\activate on Windows
  python test_rss_extraction.py
 """
 from pymongo import MongoClient
 from config import Config
 import feedparser
 from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
 print("\n" + "="*80)
 print("RSS Feed URL Extraction Test")
 print("="*80)
 # Connect to database
 print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
 client = MongoClient(Config.MONGODB_URI)
 db = client[Config.DB_NAME]
 # Get RSS feeds
 print("Fetching RSS feeds from database...")
 feeds = list(db['rss_feeds'].find())
 if not feeds:
    print("\n❌ No RSS feeds in database!")
    print("\nAdd a feed first:")
    print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
    print("    -H 'Content-Type: application/json' \\")
    print("    -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
    exit(1)
 print(f"✓ Found {len(feeds)} feed(s)\n")
 # Test each feed
 total_success = 0
 total_fail = 0
 for feed_doc in feeds:
    name = feed_doc.get('name', 'Unknown')
    url = feed_doc.get('url', '')
    active = feed_doc.get('active', True)
    print("\n" + "="*80)
    print(f"Feed: {name}")
    print(f"URL: {url}")
    print(f"Active: {'Yes' if active else 'No'}")
    print("="*80)
    if not active:
        print("⏭  Skipping (inactive)")
        continue
    try:
        # Parse RSS
        print("\nFetching RSS feed...")
        feed = feedparser.parse(url)
        if not feed.entries:
            print("❌ No entries found in feed")
            continue
        print(f"✓ Found {len(feed.entries)} entries")
        # Test first 3 entries
        print(f"\nTesting first 3 entries:")
        print("-" * 80)
        for i, entry in enumerate(feed.entries[:3], 1):
            print(f"\n📰 Entry {i}:")
            # Title
            title = entry.get('title', 'No title')
            print(f"   Title: {title[:65]}")
            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"   ✓ URL: {article_url}")
                total_success += 1
            else:
                print(f"   ❌ Could not extract URL")
                print(f"      Available fields: {list(entry.keys())[:10]}")
                print(f"      link: {entry.get('link', 'N/A')}")
                print(f"      guid: {entry.get('guid', 'N/A')}")
                print(f"      id: {entry.get('id', 'N/A')}")
                total_fail += 1
            # Test summary
            summary = extract_article_summary(entry)
            if summary:
                print(f"   ✓ Summary: {summary[:70]}...")
            else:
                print(f"   ⚠  No summary")
            # Test date
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"   ✓ Date: {pub_date}")
            else:
                print(f"   ⚠  No date")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
 # Summary
 print("\n" + "="*80)
 print("SUMMARY")
 print("="*80)
 print(f"Total URLs tested: {total_success + total_fail}")
 print(f"✓ Successfully extracted: {total_success}")
 print(f"❌ Failed to extract: {total_fail}")
 if total_fail == 0:
    print("\n🎉 All URLs extracted successfully!")
    print("\nYou can now run the crawler:")
    print("  cd ../news_crawler")
    print("  pip install -r requirements.txt")
    print("  python crawler_service.py 5")
 else:
    print(f"\n⚠  {total_fail} URL(s) could not be extracted")
    print("Check the output above for details")
 print("="*80 + "\n")
--- a/backend/utils/init.py
+++ b/backend/utils/init.py
@@ -0,0 +1 @@
 # Utils package
--- a/backend/utils/rss_utils.py
+++ b/backend/utils/rss_utils.py
@@ -0,0 +1,98 @@
 """
 Utility functions for RSS feed processing
 """
 def extract_article_url(entry):
    """
    Extract article URL from RSS entry.
    Different RSS feeds use different fields for the article URL.
    Args:
        entry: feedparser entry object
    Returns:
        str: Article URL or None if not found
    Examples:
        - Most feeds use 'link'
        - Some use 'guid' as the URL
        - Some use 'id' as the URL
        - Some have guid as a dict with 'href'
    """
    # Try 'link' first (most common)
    if entry.get('link') and entry.get('link', '').startswith('http'):
        return entry.get('link')
    # Try 'guid' if it's a valid URL
    if entry.get('guid'):
        guid = entry.get('guid')
        # guid can be a string
        if isinstance(guid, str) and guid.startswith('http'):
            return guid
        # or a dict with 'href'
        elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
            return guid.get('href')
    # Try 'id' if it's a valid URL
    if entry.get('id') and entry.get('id', '').startswith('http'):
        return entry.get('id')
    # Try 'links' array (some feeds have multiple links)
    if entry.get('links'):
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                # Prefer 'alternate' type, but accept any http link
                if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
                    return link.get('href')
        # If no alternate found, return first http link
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                return link.get('href')
    return None
 def extract_article_summary(entry):
    """
    Extract article summary/description from RSS entry.
    Args:
        entry: feedparser entry object
    Returns:
        str: Article summary or empty string
    """
    # Try different fields
    if entry.get('summary'):
        return entry.get('summary', '')
    elif entry.get('description'):
        return entry.get('description', '')
    elif entry.get('content'):
        # content is usually a list of dicts
        content = entry.get('content', [])
        if content and isinstance(content, list) and len(content) > 0:
            return content[0].get('value', '')
    return ''
 def extract_published_date(entry):
    """
    Extract published date from RSS entry.
    Args:
        entry: feedparser entry object
    Returns:
        str: Published date or empty string
    """
    # Try different fields
    if entry.get('published'):
        return entry.get('published', '')
    elif entry.get('updated'):
        return entry.get('updated', '')
    elif entry.get('created'):
        return entry.get('created', '')
    return ''
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -0,0 +1,33 @@
 version: '3.8'
 # Production version with authentication enabled
 # Usage: docker-compose -f docker-compose.prod.yml up -d
 services:
  mongodb:
    image: mongo:7.0
    container_name: munich-news-mongodb
    restart: unless-stopped
    ports:
      - "27017:27017"
    environment:
      MONGO_INITDB_ROOT_USERNAME: admin
      MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASSWORD:-changeme}
      MONGO_INITDB_DATABASE: munich_news
    volumes:
      - mongodb_data:/data/db
      - mongodb_config:/data/configdb
    networks:
      - munich-news-network
    command: mongod --bind_ip_all --auth
 volumes:
  mongodb_data:
    driver: local
  mongodb_config:
    driver: local
 networks:
  munich-news-network:
    driver: bridge
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,32 @@
 version: '3.8'
 services:
  mongodb:
    image: mongo:7.0
    container_name: munich-news-mongodb
    restart: unless-stopped
    ports:
      - "27017:27017"
    # For development: MongoDB runs without authentication
    # For production: Uncomment the environment variables below and update MONGODB_URI
    # environment:
    #   MONGO_INITDB_ROOT_USERNAME: admin
    #   MONGO_INITDB_ROOT_PASSWORD: password
    #   MONGO_INITDB_DATABASE: munich_news
    volumes:
      - mongodb_data:/data/db
      - mongodb_config:/data/configdb
    networks:
      - munich-news-network
    command: mongod --bind_ip_all
 volumes:
  mongodb_data:
    driver: local
  mongodb_config:
    driver: local
 networks:
  munich-news-network:
    driver: bridge
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -0,0 +1,21 @@
 {
  "name": "munich-news-frontend",
  "version": "1.0.0",
  "description": "Munich News Email Platform Frontend",
  "main": "server.js",
  "scripts": {
    "start": "node server.js",
    "dev": "nodemon server.js"
  },
  "keywords": ["news", "munich", "email"],
  "author": "",
  "license": "MIT",
  "dependencies": {
    "express": "^4.18.2",
    "axios": "^1.6.2"
  },
  "devDependencies": {
    "nodemon": "^3.0.2"
  }
 }
--- a/frontend/public/app.js
+++ b/frontend/public/app.js
@@ -0,0 +1,170 @@
 // Load news on page load
 document.addEventListener('DOMContentLoaded', () => {
    loadNews();
    loadStats();
 });
 async function loadNews() {
    const newsGrid = document.getElementById('newsGrid');
    newsGrid.innerHTML = '<div class="loading">Loading news...</div>';
    try {
        const response = await fetch('/api/news');
        const data = await response.json();
        if (data.articles && data.articles.length > 0) {
            displayNews(data.articles);
        } else {
            newsGrid.innerHTML = '<div class="loading">No news available at the moment. Check back later!</div>';
        }
    } catch (error) {
        console.error('Error loading news:', error);
        newsGrid.innerHTML = '<div class="loading">Failed to load news. Please try again later.</div>';
    }
 }
 function displayNews(articles) {
    const newsGrid = document.getElementById('newsGrid');
    newsGrid.innerHTML = '';
    articles.forEach(article => {
        const card = document.createElement('div');
        card.className = 'news-card';
        card.onclick = () => window.open(article.link, '_blank');
        card.innerHTML = `
            <div class="source">${article.source || 'Munich News'}</div>
            <h3>${article.title}</h3>
            <p>${article.summary || 'No summary available.'}</p>
            <a href="${article.link}" target="_blank" class="read-more" onclick="event.stopPropagation()">Read more →</a>
        `;
        newsGrid.appendChild(card);
    });
 }
 async function loadStats() {
    try {
        const response = await fetch('/api/stats');
        const data = await response.json();
        if (data.subscribers !== undefined) {
            document.getElementById('subscriberCount').textContent = data.subscribers.toLocaleString();
        }
    } catch (error) {
        console.error('Error loading stats:', error);
    }
 }
 async function subscribe() {
    const emailInput = document.getElementById('emailInput');
    const subscribeBtn = document.getElementById('subscribeBtn');
    const formMessage = document.getElementById('formMessage');
    const email = emailInput.value.trim();
    if (!email || !email.includes('@')) {
        formMessage.textContent = 'Please enter a valid email address';
        formMessage.className = 'form-message error';
        return;
    }
    subscribeBtn.disabled = true;
    subscribeBtn.textContent = 'Subscribing...';
    formMessage.textContent = '';
    try {
        const response = await fetch('/api/subscribe', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({ email: email })
        });
        const data = await response.json();
        if (response.ok) {
            formMessage.textContent = data.message || 'Successfully subscribed! Check your email for confirmation.';
            formMessage.className = 'form-message success';
            emailInput.value = '';
            loadStats(); // Refresh stats
        } else {
            formMessage.textContent = data.error || 'Failed to subscribe. Please try again.';
            formMessage.className = 'form-message error';
        }
    } catch (error) {
        formMessage.textContent = 'Network error. Please try again later.';
        formMessage.className = 'form-message error';
    } finally {
        subscribeBtn.disabled = false;
        subscribeBtn.textContent = 'Subscribe Free';
    }
 }
 // Allow Enter key to submit
 document.getElementById('emailInput').addEventListener('keypress', (e) => {
    if (e.key === 'Enter') {
        subscribe();
    }
 });
 function showUnsubscribe() {
    document.getElementById('unsubscribeModal').style.display = 'block';
 }
 function closeUnsubscribe() {
    document.getElementById('unsubscribeModal').style.display = 'none';
    document.getElementById('unsubscribeEmail').value = '';
    document.getElementById('unsubscribeMessage').textContent = '';
 }
 async function unsubscribe() {
    const emailInput = document.getElementById('unsubscribeEmail');
    const unsubscribeMessage = document.getElementById('unsubscribeMessage');
    const email = emailInput.value.trim();
    if (!email || !email.includes('@')) {
        unsubscribeMessage.textContent = 'Please enter a valid email address';
        unsubscribeMessage.className = 'form-message error';
        return;
    }
    try {
        const response = await fetch('/api/unsubscribe', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({ email: email })
        });
        const data = await response.json();
        if (response.ok) {
            unsubscribeMessage.textContent = data.message || 'Successfully unsubscribed.';
            unsubscribeMessage.className = 'form-message success';
            emailInput.value = '';
            setTimeout(() => {
                closeUnsubscribe();
                loadStats();
            }, 2000);
        } else {
            unsubscribeMessage.textContent = data.error || 'Failed to unsubscribe. Please try again.';
            unsubscribeMessage.className = 'form-message error';
        }
    } catch (error) {
        unsubscribeMessage.textContent = 'Network error. Please try again later.';
        unsubscribeMessage.className = 'form-message error';
    }
 }
 // Close modal when clicking outside
 window.onclick = function(event) {
    const modal = document.getElementById('unsubscribeModal');
    if (event.target === modal) {
        closeUnsubscribe();
    }
 }
--- a/frontend/public/index.html
+++ b/frontend/public/index.html
@@ -0,0 +1,65 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Munich News Daily - Your Daily Dose of Munich News</title>
    <link rel="stylesheet" href="styles.css">
 </head>
 <body>
    <div class="container">
        <header class="hero">
            <div class="hero-content">
                <h1>📰 Munich News Daily</h1>
                <p class="tagline">Get the latest Munich news delivered to your inbox every morning</p>
                <p class="description">Stay informed about what's happening in Munich with our curated daily newsletter. No fluff, just the news that matters.</p>
                <div class="subscription-form" id="subscriptionForm">
                    <input 
                        type="email" 
                        id="emailInput" 
                        placeholder="Enter your email address" 
                        required
                    >
                    <button id="subscribeBtn" onclick="subscribe()">Subscribe Free</button>
                    <p class="form-message" id="formMessage"></p>
                </div>
                <div class="stats">
                    <div class="stat-item">
                        <span class="stat-number" id="subscriberCount">-</span>
                        <span class="stat-label">Subscribers</span>
                    </div>
                </div>
            </div>
        </header>
        <section class="news-section">
            <h2>Latest Munich News</h2>
            <div class="news-grid" id="newsGrid">
                <div class="loading">Loading news...</div>
            </div>
        </section>
        <footer>
            <p>&copy; 2024 Munich News Daily. Made with ❤️ for Munich.</p>
            <p><a href="#" onclick="showUnsubscribe()">Unsubscribe</a></p>
        </footer>
    </div>
    <!-- Unsubscribe Modal -->
    <div class="modal" id="unsubscribeModal">
        <div class="modal-content">
            <span class="close" onclick="closeUnsubscribe()">&times;</span>
            <h2>Unsubscribe</h2>
            <p>Enter your email to unsubscribe from Munich News Daily:</p>
            <input type="email" id="unsubscribeEmail" placeholder="Enter your email">
            <button onclick="unsubscribe()">Unsubscribe</button>
            <p class="form-message" id="unsubscribeMessage"></p>
        </div>
    </div>
    <script src="app.js"></script>
 </body>
 </html>
--- a/frontend/public/styles.css
+++ b/frontend/public/styles.css
@@ -0,0 +1,306 @@
 * {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
 }
 body {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
    line-height: 1.6;
    color: #333;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    min-height: 100vh;
 }
 .container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
 }
 .hero {
    text-align: center;
    padding: 60px 20px;
    color: white;
 }
 .hero-content {
    max-width: 700px;
    margin: 0 auto;
 }
 .hero h1 {
    font-size: 3.5rem;
    margin-bottom: 20px;
    font-weight: 700;
 }
 .tagline {
    font-size: 1.5rem;
    margin-bottom: 15px;
    font-weight: 300;
 }
 .description {
    font-size: 1.1rem;
    margin-bottom: 40px;
    opacity: 0.9;
 }
 .subscription-form {
    display: flex;
    flex-direction: column;
    gap: 15px;
    max-width: 500px;
    margin: 0 auto 40px;
 }
 .subscription-form input {
    padding: 15px 20px;
    font-size: 1rem;
    border: none;
    border-radius: 8px;
    outline: none;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
 }
 .subscription-form button {
    padding: 15px 30px;
    font-size: 1.1rem;
    font-weight: 600;
    background: #ff6b6b;
    color: white;
    border: none;
    border-radius: 8px;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
 }
 .subscription-form button:hover {
    background: #ff5252;
    transform: translateY(-2px);
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
 }
 .subscription-form button:active {
    transform: translateY(0);
 }
 .form-message {
    margin-top: 10px;
    font-size: 0.9rem;
    min-height: 20px;
 }
 .form-message.success {
    color: #4caf50;
 }
 .form-message.error {
    color: #f44336;
 }
 .stats {
    display: flex;
    justify-content: center;
    gap: 40px;
    margin-top: 40px;
 }
 .stat-item {
    text-align: center;
 }
 .stat-number {
    display: block;
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 5px;
 }
 .stat-label {
    font-size: 0.9rem;
    opacity: 0.8;
 }
 .news-section {
    background: white;
    border-radius: 20px;
    padding: 40px;
    margin: 40px 0;
    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
 }
 .news-section h2 {
    font-size: 2rem;
    margin-bottom: 30px;
    color: #333;
    text-align: center;
 }
 .news-grid {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
    gap: 25px;
 }
 .news-card {
    background: #f8f9fa;
    border-radius: 12px;
    padding: 20px;
    transition: all 0.3s ease;
    border-left: 4px solid #667eea;
    cursor: pointer;
 }
 .news-card:hover {
    transform: translateY(-5px);
    box-shadow: 0 8px 20px rgba(0, 0, 0, 0.1);
    background: white;
 }
 .news-card h3 {
    font-size: 1.2rem;
    margin-bottom: 10px;
    color: #333;
    line-height: 1.4;
 }
 .news-card p {
    color: #666;
    font-size: 0.95rem;
    margin-bottom: 15px;
    line-height: 1.5;
 }
 .news-card .source {
    font-size: 0.85rem;
    color: #667eea;
    font-weight: 600;
    margin-bottom: 10px;
 }
 .news-card .read-more {
    color: #667eea;
    text-decoration: none;
    font-weight: 600;
    font-size: 0.9rem;
    display: inline-block;
    margin-top: 10px;
 }
 .news-card .read-more:hover {
    text-decoration: underline;
 }
 .loading {
    text-align: center;
    padding: 40px;
    color: #666;
    font-size: 1.1rem;
    grid-column: 1 / -1;
 }
 footer {
    text-align: center;
    padding: 40px 20px;
    color: white;
 }
 footer a {
    color: white;
    text-decoration: underline;
    cursor: pointer;
 }
 footer a:hover {
    opacity: 0.8;
 }
 /* Modal Styles */
 .modal {
    display: none;
    position: fixed;
    z-index: 1000;
    left: 0;
    top: 0;
    width: 100%;
    height: 100%;
    background-color: rgba(0, 0, 0, 0.5);
    backdrop-filter: blur(5px);
 }
 .modal-content {
    background-color: white;
    margin: 15% auto;
    padding: 30px;
    border-radius: 12px;
    width: 90%;
    max-width: 500px;
    box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
 }
 .close {
    color: #aaa;
    float: right;
    font-size: 28px;
    font-weight: bold;
    cursor: pointer;
 }
 .close:hover {
    color: #000;
 }
 .modal-content h2 {
    margin-bottom: 20px;
    color: #333;
 }
 .modal-content input {
    width: 100%;
    padding: 12px;
    margin: 15px 0;
    border: 2px solid #ddd;
    border-radius: 8px;
    font-size: 1rem;
 }
 .modal-content button {
    width: 100%;
    padding: 12px;
    background: #ff6b6b;
    color: white;
    border: none;
    border-radius: 8px;
    font-size: 1rem;
    font-weight: 600;
    cursor: pointer;
    margin-top: 10px;
 }
 .modal-content button:hover {
    background: #ff5252;
 }
 /* Responsive Design */
@media (max-width: 768px) {
    .hero h1 {
        font-size: 2.5rem;
    }
    .tagline {
        font-size: 1.2rem;
    }
    .news-grid {
        grid-template-columns: 1fr;
    }
    .stats {
        flex-direction: column;
        gap: 20px;
    }
 }
--- a/frontend/server.js
+++ b/frontend/server.js
@@ -0,0 +1,57 @@
 const express = require('express');
 const path = require('path');
 const axios = require('axios');
 const app = express();
 const PORT = process.env.PORT || 3000;
 const API_URL = process.env.API_URL || 'http://localhost:5001';
 // Serve static files
 app.use(express.static('public'));
 app.use(express.json());
 // API proxy
 app.get('/api/news', async (req, res) => {
  try {
    const response = await axios.get(`${API_URL}/api/news`);
    res.json(response.data);
  } catch (error) {
    res.status(500).json({ error: 'Failed to fetch news' });
  }
 });
 app.get('/api/stats', async (req, res) => {
  try {
    const response = await axios.get(`${API_URL}/api/stats`);
    res.json(response.data);
  } catch (error) {
    res.status(500).json({ error: 'Failed to fetch stats' });
  }
 });
 app.post('/api/subscribe', async (req, res) => {
  try {
    const response = await axios.post(`${API_URL}/api/subscribe`, req.body);
    res.json(response.data);
  } catch (error) {
    res.status(error.response?.status || 500).json(
      error.response?.data || { error: 'Failed to subscribe' }
    );
  }
 });
 app.post('/api/unsubscribe', async (req, res) => {
  try {
    const response = await axios.post(`${API_URL}/api/unsubscribe`, req.body);
    res.json(response.data);
  } catch (error) {
    res.status(error.response?.status || 500).json(
      error.response?.data || { error: 'Failed to unsubscribe' }
    );
  }
 });
 app.listen(PORT, () => {
  console.log(`Frontend server running on http://localhost:${PORT}`);
 });
--- a/news_crawler/.gitignore
+++ b/news_crawler/.gitignore
@@ -0,0 +1,25 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 env/
 venv/
 ENV/
 .venv
 # Environment variables
 .env
 .env.local
 # IDE
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # OS
 .DS_Store
 Thumbs.db
--- a/news_crawler/CHANGES.md
+++ b/news_crawler/CHANGES.md
@@ -0,0 +1,191 @@
 # Recent Changes - Full Content Storage
 ## ✅ What Changed
 ### 1. Removed Content Length Limit
 **Before:**
 ```python
 'content': content_text[:10000]  # Limited to 10k chars
 ```
 **After:**
 ```python
 'content': content_text  # Full content, no limit
 ```
 ### 2. Simplified Database Schema
 **Before:**
 ```javascript
 {
  summary: String,      // Short summary
  full_content: String  // Limited content
 }
 ```
 **After:**
 ```javascript
 {
  content: String  // Full article content, no limit
 }
 ```
 ### 3. Enhanced API Response
 **Before:**
 ```javascript
 {
  title: "...",
  link: "...",
  summary: "..."
 }
 ```
 **After:**
 ```javascript
 {
  title: "...",
  author: "...",        // NEW!
  link: "...",
  preview: "...",       // First 200 chars
  word_count: 1250,     // NEW!
  has_full_content: true // NEW!
 }
 ```
 ## 📊 Database Structure
 ### Articles Collection
 ```javascript
 {
  _id: ObjectId,
  title: String,           // Article title
  author: String,          // Article author (extracted)
  link: String,            // Article URL (unique)
  content: String,         // FULL article content (no limit)
  word_count: Number,      // Word count
  source: String,          // RSS feed name
  published_at: String,    // Publication date
  crawled_at: DateTime,    // When crawled
  created_at: DateTime     // When added
 }
 ```
 ## 🆕 New API Endpoint
 ### GET /api/news/<article_url>
 Get full article content by URL.
 **Example:**
 ```bash
 # URL encode the article URL
 curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
 ```
 **Response:**
 ```json
 {
  "title": "New U-Bahn Line Opens in Munich",
  "author": "Max Mustermann",
  "link": "https://example.com/article",
  "content": "The full article text here... (complete, no truncation)",
  "word_count": 1250,
  "source": "Süddeutsche Zeitung München",
  "published_at": "2024-11-10T10:00:00Z",
  "crawled_at": "2024-11-10T16:30:00Z",
  "created_at": "2024-11-10T16:00:00Z"
 }
 ```
 ## 📈 Enhanced Stats
 ### GET /api/stats
 Now includes crawled article count:
 ```json
 {
  "subscribers": 150,
  "articles": 500,
  "crawled_articles": 350  // NEW!
 }
 ```
 ## 🎯 Benefits
 1. **Complete Content** - No truncation, full articles stored
 2. **Better for AI** - Full context for summarization/analysis
 3. **Cleaner Schema** - Single `content` field instead of `summary` + `full_content`
 4. **More Metadata** - Author, word count, crawl timestamp
 5. **Better API** - Preview in list, full content on demand
 ## 🔄 Migration
 If you have existing articles with `full_content` field, they will continue to work. New articles will use the `content` field.
 To migrate old articles:
 ```javascript
 // MongoDB shell
 db.articles.updateMany(
  { full_content: { $exists: true } },
  [
    {
      $set: {
        content: "$full_content"
      }
    },
    {
      $unset: ["full_content", "summary"]
    }
  ]
 )
 ```
 ## 🚀 Usage
 ### Crawl Articles
 ```bash
 cd news_crawler
 python crawler_service.py 10
 ```
 ### Get Article List (with previews)
 ```bash
 curl http://localhost:5001/api/news
 ```
 ### Get Full Article Content
 ```bash
 # Get the article URL from the list, then:
 curl "http://localhost:5001/api/news/<encoded_url>"
 ```
 ### Check Stats
 ```bash
 curl http://localhost:5001/api/stats
 ```
 ## 📝 Example Workflow
 1. **Add RSS Feed**
 ```bash
 curl -X POST http://localhost:5001/api/rss-feeds \
  -H "Content-Type: application/json" \
  -d '{"name": "News Source", "url": "https://example.com/rss"}'
 ```
 2. **Crawl Articles**
 ```bash
 cd news_crawler
 python crawler_service.py 10
 ```
 3. **View Articles**
 ```bash
 curl http://localhost:5001/api/news
 ```
 4. **Get Full Content**
 ```bash
 # Copy article link from above, URL encode it
 curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
 ```
 Now you have complete article content ready for AI processing! 🎉
--- a/news_crawler/Dockerfile
+++ b/news_crawler/Dockerfile
@@ -0,0 +1,13 @@
 FROM python:3.11-slim
 WORKDIR /app
 # Install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy crawler service
 COPY crawler_service.py .
 # Run crawler
 CMD ["python", "crawler_service.py"]
--- a/news_crawler/EXTRACTION_STRATEGIES.md
+++ b/news_crawler/EXTRACTION_STRATEGIES.md
@@ -0,0 +1,353 @@
 # Content Extraction Strategies
 The crawler uses multiple strategies to dynamically extract article metadata from any website.
 ## 🎯 What Gets Extracted
 1. **Title** - Article headline
 2. **Author** - Article writer/journalist
 3. **Published Date** - When article was published
 4. **Content** - Main article text
 5. **Description** - Meta description/summary
 ## 📋 Extraction Strategies
 ### 1. Title Extraction
 Tries multiple methods in order of reliability:
 #### Strategy 1: H1 Tag
 ```html
 <h1>Article Title Here</h1>
 ```
 ✅ Most reliable - usually the main headline
 #### Strategy 2: Open Graph Meta Tag
 ```html
 <meta property="og:title" content="Article Title Here" />
 ```
 ✅ Used by Facebook, very reliable
 #### Strategy 3: Twitter Card Meta Tag
 ```html
 <meta name="twitter:title" content="Article Title Here" />
 ```
 ✅ Used by Twitter, reliable
 #### Strategy 4: Title Tag (Fallback)
 ```html
 <title>Article Title | Site Name</title>
 ```
 ⚠️ Often includes site name, needs cleaning
 **Cleaning:**
 - Removes " | Site Name"
 - Removes " - Site Name"
 ---
 ### 2. Author Extraction
 Tries multiple methods:
 #### Strategy 1: Meta Author Tag
 ```html
 <meta name="author" content="John Doe" />
 ```
 ✅ Standard HTML meta tag
 #### Strategy 2: Rel="author" Link
 ```html
 <a rel="author" href="/author/john-doe">John Doe</a>
 ```
 ✅ Semantic HTML
 #### Strategy 3: Common Class Names
 ```html
 <div class="author-name">John Doe</div>
 <span class="byline">By John Doe</span>
 <p class="writer">John Doe</p>
 ```
 ✅ Searches for: author-name, author, byline, writer
 #### Strategy 4: Schema.org Markup
 ```html
 <span itemprop="author">John Doe</span>
 ```
 ✅ Structured data
 #### Strategy 5: JSON-LD Structured Data
 ```html
 <script type="application/ld+json">
 {
  "@type": "NewsArticle",
  "author": {
    "@type": "Person",
    "name": "John Doe"
  }
 }
 </script>
 ```
 ✅ Most structured, very reliable
 **Cleaning:**
 - Removes "By " prefix
 - Validates length (< 100 chars)
 ---
 ### 3. Date Extraction
 Tries multiple methods:
 #### Strategy 1: Time Tag with Datetime
 ```html
 <time datetime="2024-11-10T10:00:00Z">November 10, 2024</time>
 ```
 ✅ Most reliable - ISO format
 #### Strategy 2: Article Published Time Meta
 ```html
 <meta property="article:published_time" content="2024-11-10T10:00:00Z" />
 ```
 ✅ Open Graph standard
 #### Strategy 3: OG Published Time
 ```html
 <meta property="og:published_time" content="2024-11-10T10:00:00Z" />
 ```
 ✅ Facebook standard
 #### Strategy 4: Common Class Names
 ```html
 <span class="publish-date">November 10, 2024</span>
 <time class="published">2024-11-10</time>
 <div class="timestamp">10:00 AM, Nov 10</div>
 ```
 ✅ Searches for: publish-date, published, date, timestamp
 #### Strategy 5: Schema.org Markup
 ```html
 <meta itemprop="datePublished" content="2024-11-10T10:00:00Z" />
 ```
 ✅ Structured data
 #### Strategy 6: JSON-LD Structured Data
 ```html
 <script type="application/ld+json">
 {
  "@type": "NewsArticle",
  "datePublished": "2024-11-10T10:00:00Z"
 }
 </script>
 ```
 ✅ Most structured
 ---
 ### 4. Content Extraction
 Tries multiple methods:
 #### Strategy 1: Semantic HTML Tags
 ```html
 <article>
  <p>Article content here...</p>
 </article>
 ```
 ✅ Best practice HTML5
 #### Strategy 2: Common Class Names
 ```html
 <div class="article-content">...</div>
 <div class="article-body">...</div>
 <div class="post-content">...</div>
 <div class="entry-content">...</div>
 <div class="story-body">...</div>
 ```
 ✅ Searches for common patterns
 #### Strategy 3: Schema.org Markup
 ```html
 <div itemprop="articleBody">
  <p>Content here...</p>
 </div>
 ```
 ✅ Structured data
 #### Strategy 4: Main Tag
 ```html
 <main>
  <p>Content here...</p>
 </main>
 ```
 ✅ Semantic HTML5
 #### Strategy 5: Body Tag (Fallback)
 ```html
 <body>
  <p>Content here...</p>
 </body>
 ```
 ⚠️ Last resort, may include navigation
 **Content Filtering:**
 - Removes `<script>`, `<style>`, `<nav>`, `<footer>`, `<header>`, `<aside>`
 - Filters out short paragraphs (< 50 chars) - likely ads/navigation
 - Keeps only substantial paragraphs
 - **No length limit** - stores full article content
 ---
 ## 🔍 How It Works
 ### Example: Crawling a News Article
 ```python
 # 1. Fetch HTML
 response = requests.get(article_url)
 soup = BeautifulSoup(response.content, 'html.parser')
 # 2. Extract title (tries 4 strategies)
 title = extract_title(soup)
 # Result: "New U-Bahn Line Opens in Munich"
 # 3. Extract author (tries 5 strategies)
 author = extract_author(soup)
 # Result: "Max Mustermann"
 # 4. Extract date (tries 6 strategies)
 published_date = extract_date(soup)
 # Result: "2024-11-10T10:00:00Z"
 # 5. Extract content (tries 5 strategies)
 content = extract_main_content(soup)
 # Result: "The new U-Bahn line connecting..."
 # 6. Save to database
 article_doc = {
    'title': title,
    'author': author,
    'published_at': published_date,
    'full_content': content,
    'word_count': len(content.split())
 }
 ```
 ---
 ## 📊 Success Rates by Strategy
 Based on common news sites:
 | Strategy | Success Rate | Notes |
 |----------|-------------|-------|
 | H1 for title | 95% | Almost universal |
 | OG meta tags | 90% | Most modern sites |
 | Time tag for date | 85% | HTML5 sites |
 | JSON-LD | 70% | Growing adoption |
 | Class name patterns | 60% | Varies by site |
 | Schema.org | 50% | Not widely adopted |
 ---
 ## 🎨 Real-World Examples
 ### Example 1: Süddeutsche Zeitung
 ```html
 <article>
  <h1>New U-Bahn Line Opens</h1>
  <span class="author">Max Mustermann</span>
  <time datetime="2024-11-10T10:00:00Z">10. November 2024</time>
  <div class="article-body">
    <p>The new U-Bahn line...</p>
  </div>
 </article>
 ```
 ✅ Extracts: Title (H1), Author (class), Date (time), Content (article-body)
 ### Example 2: Medium Blog
 ```html
 <article>
  <h1>How to Build a News Crawler</h1>
  <meta property="og:title" content="How to Build a News Crawler" />
  <meta property="article:published_time" content="2024-11-10T10:00:00Z" />
  <a rel="author" href="/author">Jane Smith</a>
  <section>
    <p>In this article...</p>
  </section>
 </article>
 ```
 ✅ Extracts: Title (OG meta), Author (rel), Date (article meta), Content (section)
 ### Example 3: WordPress Blog
 ```html
 <div class="post">
  <h1 class="entry-title">My Blog Post</h1>
  <span class="byline">By John Doe</span>
  <time class="published">November 10, 2024</time>
  <div class="entry-content">
    <p>Blog content here...</p>
  </div>
 </div>
 ```
 ✅ Extracts: Title (H1), Author (byline), Date (published), Content (entry-content)
 ---
 ## ⚠️ Edge Cases Handled
 1. **Missing Fields**: Returns `None` instead of crashing
 2. **Multiple Authors**: Takes first one found
 3. **Relative Dates**: Stores as-is ("2 hours ago")
 4. **Paywalls**: Extracts what's available
 5. **JavaScript-rendered**: Only gets server-side HTML
 6. **Ads/Navigation**: Filtered out by paragraph length
 7. **Site Name in Title**: Cleaned automatically
 ---
 ## 🚀 Future Improvements
 Potential enhancements:
 - [ ] JavaScript rendering (Selenium/Playwright)
 - [ ] Paywall bypass (where legal)
 - [ ] Image extraction
 - [ ] Video detection
 - [ ] Related articles
 - [ ] Tags/categories
 - [ ] Reading time estimation
 - [ ] Language detection
 - [ ] Sentiment analysis
 ---
 ## 🧪 Testing
 Test the extraction on a specific URL:
 ```python
 from crawler_service import extract_article_content
 url = "https://www.sueddeutsche.de/muenchen/article-123"
 data = extract_article_content(url)
 print(f"Title: {data['title']}")
 print(f"Author: {data['author']}")
 print(f"Date: {data['published_date']}")
 print(f"Content length: {len(data['content'])} chars")
 print(f"Word count: {data['word_count']}")
 ```
 ---
 ## 📚 Standards Supported
 - ✅ HTML5 semantic tags
 - ✅ Open Graph Protocol
 - ✅ Twitter Cards
 - ✅ Schema.org microdata
 - ✅ JSON-LD structured data
 - ✅ Dublin Core metadata
 - ✅ Common CSS class patterns
--- a/news_crawler/HOW_IT_WORKS.md
+++ b/news_crawler/HOW_IT_WORKS.md
@@ -0,0 +1,306 @@
 # How the News Crawler Works
 ## 🎯 Overview
 The crawler dynamically extracts article metadata from any website using multiple fallback strategies.
 ## 📊 Flow Diagram
 ```
 RSS Feed URL
    ↓
 Parse RSS Feed
    ↓
 For each article link:
    ↓
 ┌─────────────────────────────────────┐
 │  1. Fetch HTML Page                 │
 │     GET https://example.com/article │
 └─────────────────────────────────────┘
    ↓
 ┌─────────────────────────────────────┐
 │  2. Parse with BeautifulSoup        │
 │     soup = BeautifulSoup(html)      │
 └─────────────────────────────────────┘
    ↓
 ┌─────────────────────────────────────┐
 │  3. Clean HTML                      │
 │     Remove: scripts, styles, nav,   │
 │     footer, header, ads             │
 └─────────────────────────────────────┘
    ↓
 ┌─────────────────────────────────────┐
 │  4. Extract Title                   │
 │     Try: H1 → OG meta → Twitter →   │
 │     Title tag                       │
 └─────────────────────────────────────┘
    ↓
 ┌─────────────────────────────────────┐
 │  5. Extract Author                  │
 │     Try: Meta author → rel=author → │
 │     Class names → JSON-LD           │
 └─────────────────────────────────────┘
    ↓
 ┌─────────────────────────────────────┐
 │  6. Extract Date                    │
 │     Try: <time> → Meta tags →       │
 │     Class names → JSON-LD           │
 └─────────────────────────────────────┘
    ↓
 ┌─────────────────────────────────────┐
 │  7. Extract Content                 │
 │     Try: <article> → Class names →  │
 │     <main> → <body>                 │
 │     Filter short paragraphs         │
 └─────────────────────────────────────┘
    ↓
 ┌─────────────────────────────────────┐
 │  8. Save to MongoDB                 │
 │     {                               │
 │       title, author, date,          │
 │       content, word_count           │
 │     }                               │
 └─────────────────────────────────────┘
    ↓
 Wait 1 second (rate limiting)
    ↓
 Next article
 ```
 ## 🔍 Detailed Example
 ### Input: RSS Feed Entry
 ```xml
 <item>
  <title>New U-Bahn Line Opens</title>
  <link>https://www.sueddeutsche.de/muenchen/article-123</link>
  <pubDate>Mon, 10 Nov 2024 10:00:00 +0100</pubDate>
 </item>
 ```
 ### Step 1: Fetch HTML
 ```python
 url = "https://www.sueddeutsche.de/muenchen/article-123"
 response = requests.get(url)
 html = response.content
 ```
 ### Step 2: Parse HTML
 ```python
 soup = BeautifulSoup(html, 'html.parser')
 ```
 ### Step 3: Extract Title
 ```python
 # Try H1
 h1 = soup.find('h1')
 # Result: "New U-Bahn Line Opens in Munich"
 # If no H1, try OG meta
 og_title = soup.find('meta', property='og:title')
 # Fallback chain continues...
 ```
 ### Step 4: Extract Author
 ```python
 # Try meta author
 meta_author = soup.find('meta', name='author')
 # Result: None
 # Try class names
 author_elem = soup.select_one('[class*="author"]')
 # Result: "Max Mustermann"
 ```
 ### Step 5: Extract Date
 ```python
 # Try time tag
 time_tag = soup.find('time')
 # Result: "2024-11-10T10:00:00Z"
 ```
 ### Step 6: Extract Content
 ```python
 # Try article tag
 article = soup.find('article')
 paragraphs = article.find_all('p')
 # Filter paragraphs
 content = []
 for p in paragraphs:
    text = p.get_text().strip()
    if len(text) >= 50:  # Keep substantial paragraphs
        content.append(text)
 full_content = '\n\n'.join(content)
 # Result: "The new U-Bahn line connecting the city center..."
 ```
 ### Step 7: Save to Database
 ```python
 article_doc = {
    'title': 'New U-Bahn Line Opens in Munich',
    'author': 'Max Mustermann',
    'link': 'https://www.sueddeutsche.de/muenchen/article-123',
    'summary': 'Short summary from RSS...',
    'full_content': 'The new U-Bahn line connecting...',
    'word_count': 1250,
    'source': 'Süddeutsche Zeitung München',
    'published_at': '2024-11-10T10:00:00Z',
    'crawled_at': datetime.utcnow(),
    'created_at': datetime.utcnow()
 }
 db.articles.update_one(
    {'link': article_url},
    {'$set': article_doc},
    upsert=True
 )
 ```
 ## 🎨 What Makes It "Dynamic"?
 ### Traditional Approach (Hardcoded)
 ```python
 # Only works for one specific site
 title = soup.find('h1', class_='article-title').text
 author = soup.find('span', class_='author-name').text
 ```
 ❌ Breaks when site changes
 ❌ Doesn't work on other sites
 ### Our Approach (Dynamic)
 ```python
 # Works on ANY site
 title = extract_title(soup)  # Tries 4 different methods
 author = extract_author(soup)  # Tries 5 different methods
 ```
 ✅ Adapts to different HTML structures
 ✅ Falls back to alternatives
 ✅ Works across multiple sites
 ## 🛡️ Robustness Features
 ### 1. Multiple Strategies
 Each field has 4-6 extraction strategies
 ```python
 def extract_title(soup):
    # Try strategy 1
    if h1 := soup.find('h1'):
        return h1.text
    # Try strategy 2
    if og_title := soup.find('meta', property='og:title'):
        return og_title['content']
    # Try strategy 3...
    # Try strategy 4...
 ```
 ### 2. Validation
 ```python
 # Title must be reasonable length
 if title and len(title) > 10:
    return title
 # Author must be < 100 chars
 if author and len(author) < 100:
    return author
 ```
 ### 3. Cleaning
 ```python
 # Remove site name from title
 if ' | ' in title:
    title = title.split(' | ')[0]
 # Remove "By" from author
 author = author.replace('By ', '').strip()
 ```
 ### 4. Error Handling
 ```python
 try:
    data = extract_article_content(url)
 except Timeout:
    print("Timeout - skip")
 except RequestException:
    print("Network error - skip")
 except Exception:
    print("Unknown error - skip")
 ```
 ## 📈 Success Metrics
 After crawling, you'll see:
 ```
 📰 Crawling feed: Süddeutsche Zeitung München
   🔍 Crawling: New U-Bahn Line Opens...
   ✓ Saved (1250 words)
   Title: ✓ Found
   Author: ✓ Found (Max Mustermann)
   Date: ✓ Found (2024-11-10T10:00:00Z)
   Content: ✓ Found (1250 words)
 ```
 ## 🗄️ Database Result
 **Before Crawling:**
 ```javascript
 {
  title: "New U-Bahn Line Opens",
  link: "https://example.com/article",
  summary: "Short RSS summary...",
  source: "Süddeutsche Zeitung"
 }
 ```
 **After Crawling:**
 ```javascript
 {
  title: "New U-Bahn Line Opens in Munich",  // ← Enhanced
  author: "Max Mustermann",                   // ← NEW!
  link: "https://example.com/article",
  summary: "Short RSS summary...",
  full_content: "The new U-Bahn line...",    // ← NEW! (1250 words)
  word_count: 1250,                           // ← NEW!
  source: "Süddeutsche Zeitung",
  published_at: "2024-11-10T10:00:00Z",      // ← Enhanced
  crawled_at: ISODate("2024-11-10T16:30:00Z"), // ← NEW!
  created_at: ISODate("2024-11-10T16:00:00Z")
 }
 ```
 ## 🚀 Running the Crawler
 ```bash
 cd news_crawler
 pip install -r requirements.txt
 python crawler_service.py 10
 ```
 Output:
 ```
 ============================================================
 🚀 Starting RSS Feed Crawler
 ============================================================
 Found 3 active feed(s)
 📰 Crawling feed: Süddeutsche Zeitung München
   🔍 Crawling: New U-Bahn Line Opens...
   ✓ Saved (1250 words)
   🔍 Crawling: Munich Weather Update...
   ✓ Saved (450 words)
   ✓ Crawled 2 articles
 ============================================================
 ✓ Crawling Complete!
  Total feeds processed: 3
  Total articles crawled: 15
  Duration: 45.23 seconds
 ============================================================
 ```
 Now you have rich, structured article data ready for AI processing! 🎉
--- a/news_crawler/QUICKSTART.md
+++ b/news_crawler/QUICKSTART.md
@@ -0,0 +1,127 @@
 # News Crawler - Quick Start
 ## 1. Install Dependencies
 ```bash
 cd news_crawler
 pip install -r requirements.txt
 ```
 ## 2. Configure Environment
 Make sure MongoDB is running and accessible. The crawler will use the same database as the backend.
 Default connection: `mongodb://localhost:27017/`
 To use a different MongoDB URI, create a `.env` file:
 ```env
 MONGODB_URI=mongodb://localhost:27017/
 ```
 ## 3. Run the Crawler
 ```bash
 # Crawl up to 10 articles per feed
 python crawler_service.py
 # Crawl up to 20 articles per feed
 python crawler_service.py 20
 ```
 ## 4. Verify Results
 Check your MongoDB database:
 ```bash
 # Using mongosh
 mongosh
 use munich_news
 db.articles.find({full_content: {$exists: true}}).count()
 db.articles.findOne({full_content: {$exists: true}})
 ```
 ## 5. Schedule Regular Crawling
 ### Option A: Cron (Linux/Mac)
 ```bash
 # Edit crontab
 crontab -e
 # Add this line to run every 6 hours
 0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
 ```
 ### Option B: Docker
 ```bash
 # Build and run
 docker-compose up
 # Or run as a one-off
 docker-compose run --rm crawler
 ```
 ### Option C: Manual
 Just run the script whenever you want to fetch new articles:
 ```bash
 python crawler_service.py
 ```
 ## What Gets Crawled?
 The crawler:
 1. Fetches all active RSS feeds from the database
 2. For each feed, gets the latest articles
 3. Crawls the full content from each article URL
 4. Saves: title, full_content, word_count, crawled_at
 5. Skips articles that already have content
 ## Output Example
 ```
 ============================================================
 🚀 Starting RSS Feed Crawler
 ============================================================
 Found 3 active feed(s)
 📰 Crawling feed: Süddeutsche Zeitung München
   URL: https://www.sueddeutsche.de/muenchen/rss
   🔍 Crawling: New U-Bahn Line Opens in Munich...
   ✓ Saved (1250 words)
   🔍 Crawling: Munich Weather Update...
   ✓ Saved (450 words)
   ✓ Crawled 2 articles from Süddeutsche Zeitung München
 ============================================================
 ✓ Crawling Complete!
  Total feeds processed: 3
  Total articles crawled: 15
  Duration: 45.23 seconds
 ============================================================
 ```
 ## Troubleshooting
 **No feeds found:**
 - Make sure you've added RSS feeds via the backend API
 - Check MongoDB connection
 **Can't extract content:**
 - Some sites block scrapers
 - Some sites require JavaScript (not supported yet)
 - Check if the URL is accessible
 **Timeout errors:**
 - Increase timeout in the code
 - Check your internet connection
 ## Next Steps
 Once articles are crawled, you can:
 - View them in the frontend
 - Use Ollama to summarize them
 - Generate newsletters with full content
 - Perform text analysis
--- a/news_crawler/README.md
+++ b/news_crawler/README.md
@@ -0,0 +1,225 @@
 # News Crawler Microservice
 A standalone microservice that crawls full article content from RSS feeds and stores it in MongoDB.
 ## Features
 - 🔍 Extracts full article content from RSS feed links
 - 📊 Calculates word count
 - 🔄 Avoids re-crawling already processed articles
 - ⏱️ Rate limiting (1 second delay between requests)
 - 🎯 Smart content extraction using multiple selectors
 - 🧹 Cleans up scripts, styles, and navigation elements
 ## Installation
 1. Create a virtual environment:
 ```bash
 python -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate
 ```
 2. Install dependencies:
 ```bash
 pip install -r requirements.txt
 ```
 3. Configure environment variables:
 Create a `.env` file in the project root (or use the backend's `.env`):
 ```env
 MONGODB_URI=mongodb://localhost:27017/
 ```
 ## Usage
 ### Standalone Execution
 Run the crawler directly:
 ```bash
 # Crawl up to 10 articles per feed (default)
 python crawler_service.py
 # Crawl up to 20 articles per feed
 python crawler_service.py 20
 ```
 ### As a Module
 ```python
 from crawler_service import crawl_all_feeds, crawl_rss_feed
 # Crawl all active feeds
 result = crawl_all_feeds(max_articles_per_feed=10)
 print(result)
 # Crawl a specific feed
 crawl_rss_feed(
    feed_url='https://example.com/rss',
    feed_name='Example News',
    max_articles=10
 )
 ```
 ### Via Backend API
 The backend has integrated endpoints:
 ```bash
 # Start crawler
 curl -X POST http://localhost:5001/api/crawler/start
 # Check status
 curl http://localhost:5001/api/crawler/status
 # Crawl specific feed
 curl -X POST http://localhost:5001/api/crawler/feed/<feed_id>
 ```
 ## How It Works
 1. **Fetch RSS Feeds**: Gets all active RSS feeds from MongoDB
 2. **Parse Feed**: Extracts article links from each feed
 3. **Crawl Content**: For each article:
   - Fetches HTML page
   - Removes scripts, styles, navigation
   - Extracts main content using smart selectors
   - Calculates word count
 4. **Store Data**: Saves to MongoDB with metadata
 5. **Skip Duplicates**: Avoids re-crawling articles with existing content
 ## Content Extraction Strategy
 The crawler tries multiple selectors in order:
 1. `<article>` tag
 2. Elements with class containing "article-content", "article-body"
 3. Elements with class containing "post-content", "entry-content"
 4. `<main>` tag
 5. Fallback to all `<p>` tags in body
 ## Database Schema
 Articles are stored with these fields:
 ```javascript
 {
  title: String,           // Article title
  link: String,            // Article URL (unique)
  summary: String,         // Short summary
  full_content: String,    // Full article text (max 10,000 chars)
  word_count: Number,      // Number of words
  source: String,          // RSS feed name
  published_at: String,    // Publication date
  crawled_at: DateTime,    // When content was crawled
  created_at: DateTime     // When added to database
 }
 ```
 ## Scheduling
 ### Using Cron (Linux/Mac)
 ```bash
 # Run every 6 hours
 0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
 ```
 ### Using systemd Timer (Linux)
 Create `/etc/systemd/system/news-crawler.service`:
 ```ini
 [Unit]
 Description=News Crawler Service
 [Service]
 Type=oneshot
 WorkingDirectory=/path/to/news_crawler
 ExecStart=/path/to/venv/bin/python crawler_service.py
 User=your-user
 ```
 Create `/etc/systemd/system/news-crawler.timer`:
 ```ini
 [Unit]
 Description=Run News Crawler every 6 hours
 [Timer]
 OnBootSec=5min
 OnUnitActiveSec=6h
 [Install]
 WantedBy=timers.target
 ```
 Enable and start:
 ```bash
 sudo systemctl enable news-crawler.timer
 sudo systemctl start news-crawler.timer
 ```
 ### Using Docker
 Create `Dockerfile`:
 ```dockerfile
 FROM python:3.11-slim
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY crawler_service.py .
 CMD ["python", "crawler_service.py"]
 ```
 Build and run:
 ```bash
 docker build -t news-crawler .
 docker run --env-file ../.env news-crawler
 ```
 ## Configuration
 Environment variables:
 - `MONGODB_URI` - MongoDB connection string (default: `mongodb://localhost:27017/`)
 ## Rate Limiting
 - 1 second delay between article requests
 - Respects server resources
 - User-Agent header included
 ## Troubleshooting
 **Issue: Can't extract content**
 - Some sites block scrapers
 - Try adjusting User-Agent header
 - Some sites require JavaScript (consider Selenium)
 **Issue: Timeout errors**
 - Increase timeout in `extract_article_content()`
 - Check network connectivity
 **Issue: Memory usage**
 - Reduce `max_articles_per_feed`
 - Content limited to 10,000 characters per article
 ## Architecture
 This is a standalone microservice that:
 - Can run independently of the main backend
 - Shares the same MongoDB database
 - Can be deployed separately
 - Can be scheduled independently
 ## Next Steps
 Once articles are crawled, you can:
 - Use Ollama to summarize articles
 - Perform sentiment analysis
 - Extract keywords and topics
 - Generate newsletter content
 - Create article recommendations
--- a/news_crawler/RSS_URL_EXTRACTION.md
+++ b/news_crawler/RSS_URL_EXTRACTION.md
@@ -0,0 +1,194 @@
 # RSS URL Extraction - How It Works
 ## The Problem
 Different RSS feed providers use different fields to store the article URL:
 ### Example 1: Standard RSS (uses `link`)
 ```xml
 <item>
  <title>Article Title</title>
  <link>https://example.com/article/123</link>
  <guid>internal-id-456</guid>
 </item>
 ```
 ### Example 2: Some feeds (uses `guid` as URL)
 ```xml
 <item>
  <title>Article Title</title>
  <guid>https://example.com/article/123</guid>
 </item>
 ```
 ### Example 3: Atom feeds (uses `id`)
 ```xml
 <entry>
  <title>Article Title</title>
  <id>https://example.com/article/123</id>
 </entry>
 ```
 ### Example 4: Complex feeds (guid as object)
 ```xml
 <item>
  <title>Article Title</title>
  <guid isPermaLink="true">https://example.com/article/123</guid>
 </item>
 ```
 ### Example 5: Multiple links
 ```xml
 <item>
  <title>Article Title</title>
  <link rel="alternate" type="text/html" href="https://example.com/article/123"/>
  <link rel="enclosure" type="image/jpeg" href="https://example.com/image.jpg"/>
 </item>
 ```
 ## Our Solution
 The `extract_article_url()` function tries multiple strategies in order:
 ### Strategy 1: Check `link` field (most common)
 ```python
 if entry.get('link') and entry.get('link', '').startswith('http'):
    return entry.get('link')
 ```
 ✅ Works for: Most RSS 2.0 feeds
 ### Strategy 2: Check `guid` field
 ```python
 if entry.get('guid'):
    guid = entry.get('guid')
    # guid can be a string
    if isinstance(guid, str) and guid.startswith('http'):
        return guid
    # or a dict with 'href'
    elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
        return guid.get('href')
 ```
 ✅ Works for: Feeds that use GUID as permalink
 ### Strategy 3: Check `id` field
 ```python
 if entry.get('id') and entry.get('id', '').startswith('http'):
    return entry.get('id')
 ```
 ✅ Works for: Atom feeds
 ### Strategy 4: Check `links` array
 ```python
 if entry.get('links'):
    for link in entry.get('links', []):
        if isinstance(link, dict) and link.get('href', '').startswith('http'):
            # Prefer 'alternate' type
            if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
                return link.get('href')
 ```
 ✅ Works for: Feeds with multiple links (prefers HTML content)
 ## Real-World Examples
 ### Süddeutsche Zeitung
 ```python
 entry = {
    'title': 'Munich News',
    'link': 'https://www.sueddeutsche.de/muenchen/article-123',
    'guid': 'sz-internal-123'
 }
 # Returns: 'https://www.sueddeutsche.de/muenchen/article-123'
 ```
 ### Medium Blog
 ```python
 entry = {
    'title': 'Blog Post',
    'guid': 'https://medium.com/@user/post-abc123',
    'link': None
 }
 # Returns: 'https://medium.com/@user/post-abc123'
 ```
 ### YouTube RSS
 ```python
 entry = {
    'title': 'Video Title',
    'id': 'https://www.youtube.com/watch?v=abc123',
    'link': None
 }
 # Returns: 'https://www.youtube.com/watch?v=abc123'
 ```
 ### Complex Feed
 ```python
 entry = {
    'title': 'Article',
    'links': [
        {'rel': 'alternate', 'type': 'text/html', 'href': 'https://example.com/article'},
        {'rel': 'enclosure', 'type': 'image/jpeg', 'href': 'https://example.com/image.jpg'}
    ]
 }
 # Returns: 'https://example.com/article' (prefers text/html)
 ```
 ## Validation
 All extracted URLs must:
 1. Start with `http://` or `https://`
 2. Be a valid string (not None or empty)
 If no valid URL is found:
 ```python
 return None
 # Crawler will skip this entry and log a warning
 ```
 ## Testing Different Feeds
 To test if a feed works with our extractor:
 ```python
 import feedparser
 from rss_utils import extract_article_url
 # Parse feed
 feed = feedparser.parse('https://example.com/rss')
 # Test each entry
 for entry in feed.entries[:5]:
    url = extract_article_url(entry)
    if url:
        print(f"✓ {entry.get('title', 'No title')[:50]}")
        print(f"  URL: {url}")
    else:
        print(f"✗ {entry.get('title', 'No title')[:50]}")
        print(f"  No valid URL found")
        print(f"  Available fields: {list(entry.keys())}")
 ```
 ## Supported Feed Types
 ✅ RSS 2.0  
 ✅ RSS 1.0  
 ✅ Atom  
 ✅ Custom RSS variants  
 ✅ Feeds with multiple links  
 ✅ Feeds with GUID as permalink  
 ## Edge Cases Handled
 1. **GUID is not a URL**: Checks if it starts with `http`
 2. **Multiple links**: Prefers `text/html` type
 3. **GUID as dict**: Extracts `href` field
 4. **Missing fields**: Returns None instead of crashing
 5. **Non-HTTP URLs**: Filters out `mailto:`, `ftp:`, etc.
 ## Future Improvements
 Potential enhancements:
 - [ ] Support for `feedburner:origLink`
 - [ ] Support for `pheedo:origLink`
 - [ ] Resolve shortened URLs (bit.ly, etc.)
 - [ ] Handle relative URLs (convert to absolute)
 - [ ] Cache URL extraction results
--- a/news_crawler/check_database.py
+++ b/news_crawler/check_database.py
@@ -0,0 +1,79 @@
 #!/usr/bin/env python
 """
 Quick script to check what RSS feeds are in the database
 """
 from pymongo import MongoClient
 import os
 import sys
 # Add parent directory to path to import from backend
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'backend'))
 try:
    from dotenv import load_dotenv
    load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', 'backend', '.env'))
 except:
    pass
 # MongoDB setup
 MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
 DB_NAME = 'munich_news'
 print(f"Connecting to: {MONGODB_URI}")
 print(f"Database: {DB_NAME}\n")
 try:
    client = MongoClient(MONGODB_URI, serverSelectionTimeoutMS=5000)
    # Test connection
    client.server_info()
    print("✓ Connected to MongoDB\n")
    db = client[DB_NAME]
    rss_feeds_collection = db['rss_feeds']
    # Get all feeds
    feeds = list(rss_feeds_collection.find())
    if not feeds:
        print("❌ No RSS feeds found in database\n")
        print("Add feeds using the API:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        sys.exit(1)
    print(f"Found {len(feeds)} RSS feed(s):\n")
    print("="*80)
    for i, feed in enumerate(feeds, 1):
        print(f"\n{i}. {feed.get('name', 'Unknown')}")
        print(f"   URL: {feed.get('url', 'N/A')}")
        print(f"   Active: {feed.get('active', True)}")
        print(f"   Created: {feed.get('created_at', 'N/A')}")
        print(f"   ID: {feed.get('_id', 'N/A')}")
    print("\n" + "="*80)
    # Check articles
    articles_collection = db['articles']
    total_articles = articles_collection.count_documents({})
    crawled_articles = articles_collection.count_documents({'full_content': {'$exists': True}})
    print(f"\nArticles in database:")
    print(f"  Total: {total_articles}")
    print(f"  With full content: {crawled_articles}")
    print(f"  Without full content: {total_articles - crawled_articles}")
    if total_articles > 0:
        print("\nSample article:")
        sample = articles_collection.find_one()
        print(f"  Title: {sample.get('title', 'N/A')[:60]}")
        print(f"  Link: {sample.get('link', 'N/A')}")
        print(f"  Has full_content: {bool(sample.get('full_content'))}")
        print(f"  Word count: {sample.get('word_count', 'N/A')}")
    print("\n✓ Database check complete!")
 except Exception as e:
    print(f"❌ Error: {e}")
    sys.exit(1)
--- a/news_crawler/config.py
+++ b/news_crawler/config.py
@@ -0,0 +1,90 @@
 """
 Configuration management for news crawler
 """
 import os
 from dotenv import load_dotenv
 from pathlib import Path
 # Load environment variables from backend/.env
 backend_dir = Path(__file__).parent.parent / 'backend'
 env_path = backend_dir / '.env'
 if env_path.exists():
    load_dotenv(dotenv_path=env_path)
    print(f"✓ Loaded configuration from: {env_path}")
 else:
    print(f"⚠ Warning: .env file not found at {env_path}")
 class Config:
    """Centralized configuration for news crawler"""
    # MongoDB Configuration
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'
    # Ollama Configuration
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
    OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
    OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
    OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))
    # Crawler Configuration
    RATE_LIMIT_DELAY = 1  # seconds between requests
    MAX_CONTENT_LENGTH = 50000  # characters
    SUMMARY_MAX_WORDS = 150  # maximum words in AI summary
    @classmethod
    def print_config(cls):
        """Print current configuration (without sensitive data)"""
        print("\n" + "="*60)
        print("News Crawler Configuration")
        print("="*60)
        print(f"MongoDB URI: {cls.MONGODB_URI}")
        print(f"Database: {cls.DB_NAME}")
        print(f"\nOllama Configuration:")
        print(f"  Base URL: {cls.OLLAMA_BASE_URL}")
        print(f"  Model: {cls.OLLAMA_MODEL}")
        print(f"  Enabled: {cls.OLLAMA_ENABLED}")
        print(f"  Timeout: {cls.OLLAMA_TIMEOUT}s")
        print(f"  Has API Key: {bool(cls.OLLAMA_API_KEY)}")
        print(f"\nCrawler Settings:")
        print(f"  Rate Limit: {cls.RATE_LIMIT_DELAY}s between requests")
        print(f"  Max Content: {cls.MAX_CONTENT_LENGTH} chars")
        print(f"  Summary Length: {cls.SUMMARY_MAX_WORDS} words")
        print("="*60 + "\n")
    @classmethod
    def validate(cls):
        """Validate configuration and return list of issues"""
        issues = []
        # Check MongoDB
        if not cls.MONGODB_URI:
            issues.append("MONGODB_URI is not set")
        # Check Ollama if enabled
        if cls.OLLAMA_ENABLED:
            if not cls.OLLAMA_BASE_URL:
                issues.append("OLLAMA_BASE_URL is not set but Ollama is enabled")
            if not cls.OLLAMA_MODEL:
                issues.append("OLLAMA_MODEL is not set but Ollama is enabled")
            if cls.OLLAMA_TIMEOUT < 5:
                issues.append(f"OLLAMA_TIMEOUT ({cls.OLLAMA_TIMEOUT}s) is too low, recommend at least 5s")
        return issues
 if __name__ == '__main__':
    # Test configuration
    Config.print_config()
    # Validate
    issues = Config.validate()
    if issues:
        print("⚠ Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("✓ Configuration is valid")
--- a/news_crawler/crawler_service.py
+++ b/news_crawler/crawler_service.py
@@ -0,0 +1,489 @@
 """
 Web crawler service to extract full article content from RSS feed links
 """
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
 from pymongo import MongoClient
 from pymongo.errors import DuplicateKeyError
 import feedparser
 import time
 import os
 from dotenv import load_dotenv
 from rss_utils import extract_article_url, extract_article_summary, extract_published_date
 from config import Config
 from ollama_client import OllamaClient
 # Load environment variables
 load_dotenv(dotenv_path='../.env')
 # MongoDB setup
 client = MongoClient(Config.MONGODB_URI)
 db = client[Config.DB_NAME]
 articles_collection = db['articles']
 rss_feeds_collection = db['rss_feeds']
 # Initialize Ollama client
 ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
 )
 # Print configuration on startup
 if __name__ != '__main__':
    Config.print_config()
    if Config.OLLAMA_ENABLED:
        print("🤖 Ollama AI summarization: ENABLED")
        if ollama_client.is_available():
            print("✓ Ollama server is reachable")
        else:
            print("⚠ Warning: Ollama server is not reachable")
    else:
        print("ℹ Ollama AI summarization: DISABLED")
 def get_active_rss_feeds():
    """Get all active RSS feeds from database"""
    feeds = []
    cursor = rss_feeds_collection.find({'active': True})
    for feed in cursor:
        feeds.append({
            'id': str(feed['_id']),
            'name': feed.get('name', ''),
            'url': feed.get('url', '')
        })
    return feeds
 def extract_article_content(url, timeout=10):
    """
    Extract main article content from a URL with smart detection
    Returns: dict with title, content, author, date, and metadata
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove script and style elements
        for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
            script.decompose()
        # === EXTRACT TITLE ===
        title = extract_title(soup)
        # === EXTRACT AUTHOR ===
        author = extract_author(soup)
        # === EXTRACT PUBLISHED DATE ===
        published_date = extract_date(soup)
        # === EXTRACT MAIN CONTENT ===
        content_text = extract_main_content(soup)
        # === EXTRACT META DESCRIPTION ===
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if not meta_desc:
            meta_desc = soup.find('meta', attrs={'property': 'og:description'})
        description = meta_desc.get('content', '') if meta_desc else ''
        return {
            'title': title,
            'author': author,
            'content': content_text,  # Full content, no limit
            'description': description,
            'published_date': published_date,
            'word_count': len(content_text.split()) if content_text else 0,
            'crawled_at': datetime.utcnow()
        }
    except requests.exceptions.Timeout:
        print(f"Timeout crawling {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error crawling {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error crawling {url}: {e}")
        return None
 def extract_title(soup):
    """
    Extract article title using multiple strategies
    """
    # Strategy 1: Look for h1 tag
    h1 = soup.find('h1')
    if h1:
        title = h1.get_text().strip()
        if title and len(title) > 10:  # Reasonable title length
            return title
    # Strategy 2: Look for meta og:title
    og_title = soup.find('meta', attrs={'property': 'og:title'})
    if og_title and og_title.get('content'):
        return og_title.get('content').strip()
    # Strategy 3: Look for meta twitter:title
    twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
    if twitter_title and twitter_title.get('content'):
        return twitter_title.get('content').strip()
    # Strategy 4: Look for title tag (fallback)
    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.get_text().strip()
        # Clean up common patterns like "Site Name | Article Title"
        if ' | ' in title:
            title = title.split(' | ')[0]
        elif ' - ' in title:
            title = title.split(' - ')[0]
        return title
    return None
 def extract_author(soup):
    """
    Extract article author using multiple strategies
    """
    # Strategy 1: Look for meta author
    meta_author = soup.find('meta', attrs={'name': 'author'})
    if meta_author and meta_author.get('content'):
        return meta_author.get('content').strip()
    # Strategy 2: Look for rel="author"
    rel_author = soup.find('a', attrs={'rel': 'author'})
    if rel_author:
        return rel_author.get_text().strip()
    # Strategy 3: Look for common author class names
    author_selectors = [
        '[class*="author-name"]',
        '[class*="author"]',
        '[class*="byline"]',
        '[class*="writer"]',
        '[rel="author"]',
        '[itemprop="author"]'
    ]
    for selector in author_selectors:
        author_elem = soup.select_one(selector)
        if author_elem:
            author = author_elem.get_text().strip()
            # Clean up common patterns
            author = author.replace('By ', '').replace('by ', '').strip()
            if author and len(author) < 100:  # Reasonable author name length
                return author
    # Strategy 4: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict) and data.get('author'):
                author_data = data.get('author')
                if isinstance(author_data, dict):
                    return author_data.get('name', '')
                elif isinstance(author_data, str):
                    return author_data
        except:
            pass
    return None
 def extract_date(soup):
    """
    Extract published date using multiple strategies
    """
    # Strategy 1: Look for time tag with datetime attribute
    time_tag = soup.find('time')
    if time_tag and time_tag.get('datetime'):
        return time_tag.get('datetime')
    # Strategy 2: Look for meta article:published_time
    meta_published = soup.find('meta', attrs={'property': 'article:published_time'})
    if meta_published and meta_published.get('content'):
        return meta_published.get('content')
    # Strategy 3: Look for meta og:published_time
    og_published = soup.find('meta', attrs={'property': 'og:published_time'})
    if og_published and og_published.get('content'):
        return og_published.get('content')
    # Strategy 4: Look for common date class names
    date_selectors = [
        '[class*="publish-date"]',
        '[class*="published"]',
        '[class*="date"]',
        '[class*="timestamp"]',
        '[itemprop="datePublished"]'
    ]
    for selector in date_selectors:
        date_elem = soup.select_one(selector)
        if date_elem:
            # Try datetime attribute first
            if date_elem.get('datetime'):
                return date_elem.get('datetime')
            # Otherwise get text
            date_text = date_elem.get_text().strip()
            if date_text and len(date_text) < 50:
                return date_text
    # Strategy 5: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict):
                return data.get('datePublished') or data.get('dateCreated')
        except:
            pass
    return None
 def extract_main_content(soup):
    """
    Extract main article content using multiple strategies
    """
    # Strategy 1: Try common article content selectors
    content_selectors = [
        'article',
        '[class*="article-content"]',
        '[class*="article-body"]',
        '[class*="post-content"]',
        '[class*="entry-content"]',
        '[class*="content-body"]',
        '[class*="story-body"]',
        '[itemprop="articleBody"]',
        'main'
    ]
    article_content = None
    for selector in content_selectors:
        element = soup.select_one(selector)
        if element:
            article_content = element
            break
    # Fallback: get body
    if not article_content:
        article_content = soup.find('body')
    if not article_content:
        return ''
    # Extract text from paragraphs
    paragraphs = article_content.find_all('p')
    # Filter out short paragraphs (likely navigation/ads)
    content_paragraphs = []
    for p in paragraphs:
        text = p.get_text().strip()
        # Keep paragraphs with at least 50 characters
        if len(text) >= 50:
            content_paragraphs.append(text)
    content_text = '\n\n'.join(content_paragraphs)
    return content_text
 def crawl_rss_feed(feed_url, feed_name, max_articles=10):
    """
    Crawl articles from an RSS feed
    Returns: dict with statistics
    """
    print(f"\n📰 Crawling feed: {feed_name}")
    print(f"   URL: {feed_url}")
    try:
        # Parse RSS feed
        feed = feedparser.parse(feed_url)
        if not feed.entries:
            print(f"   ⚠ No entries found in feed")
            return 0
        crawled_count = 0
        summarized_count = 0
        failed_summaries = 0
        for entry in feed.entries[:max_articles]:
            # Extract article URL using utility function
            article_url = extract_article_url(entry)
            if not article_url:
                print(f"   ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}")
                continue
            # Check if article already exists and has content
            existing = articles_collection.find_one({'link': article_url})
            if existing and existing.get('content'):
                print(f"   ⏭ Skipping (already crawled): {entry.get('title', 'No title')[:50]}")
                continue
            print(f"   🔍 Crawling: {entry.get('title', 'No title')[:50]}...")
            # Extract full content
            article_data = extract_article_content(article_url)
            if article_data and article_data.get('content'):
                # Summarize with Ollama if enabled
                summary_result = None
                if Config.OLLAMA_ENABLED and article_data.get('content'):
                    print(f"   🤖 Summarizing with AI...")
                    summary_result = ollama_client.summarize_article(
                        article_data['content'],
                        max_words=Config.SUMMARY_MAX_WORDS
                    )
                    if summary_result['success']:
                        print(f"   ✓ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)")
                        summarized_count += 1
                    else:
                        print(f"   ⚠ Summarization failed: {summary_result['error']}")
                        failed_summaries += 1
                # Prepare document
                article_doc = {
                    'title': article_data.get('title') or entry.get('title', ''),
                    'author': article_data.get('author'),
                    'link': article_url,
                    'content': article_data.get('content', ''),  # Full article content
                    'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
                    'word_count': article_data.get('word_count', 0),
                    'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
                    'source': feed_name,
                    'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
                    'crawled_at': article_data.get('crawled_at'),
                    'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
                    'created_at': datetime.utcnow()
                }
                try:
                    # Upsert: update if exists, insert if not
                    articles_collection.update_one(
                        {'link': article_url},
                        {'$set': article_doc},
                        upsert=True
                    )
                    crawled_count += 1
                    print(f"   ✓ Saved ({article_data.get('word_count', 0)} words)")
                except DuplicateKeyError:
                    print(f"   ⚠ Duplicate key error")
                except Exception as e:
                    print(f"   ✗ Error saving: {e}")
            else:
                print(f"   ✗ Failed to extract content")
            # Be nice to servers - add delay
            time.sleep(1)
        print(f"   ✓ Crawled {crawled_count} articles from {feed_name}")
        if Config.OLLAMA_ENABLED:
            print(f"   🤖 Summarized: {summarized_count}, Failed: {failed_summaries}")
        return {
            'crawled': crawled_count,
            'summarized': summarized_count,
            'failed_summaries': failed_summaries
        }
    except Exception as e:
        print(f"   ✗ Error processing feed {feed_name}: {e}")
        return 0
 def crawl_all_feeds(max_articles_per_feed=10):
    """
    Crawl all active RSS feeds
    Returns: dict with statistics
    """
    print("\n" + "="*60)
    print("🚀 Starting RSS Feed Crawler")
    print("="*60)
    start_time = time.time()
    feeds = get_active_rss_feeds()
    if not feeds:
        print("⚠ No active RSS feeds found")
        return {
            'total_feeds': 0,
            'total_articles_crawled': 0,
            'duration_seconds': 0
        }
    print(f"Found {len(feeds)} active feed(s)")
    if Config.OLLAMA_ENABLED:
        print(f"🤖 AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)")
    total_crawled = 0
    total_summarized = 0
    total_failed = 0
    for feed in feeds:
        result = crawl_rss_feed(
            feed['url'],
            feed['name'],
            max_articles=max_articles_per_feed
        )
        total_crawled += result['crawled']
        total_summarized += result['summarized']
        total_failed += result['failed_summaries']
    duration = time.time() - start_time
    print("\n" + "="*60)
    print(f"✓ Crawling Complete!")
    print(f"  Total feeds processed: {len(feeds)}")
    print(f"  Total articles crawled: {total_crawled}")
    if Config.OLLAMA_ENABLED:
        print(f"  Total articles summarized: {total_summarized}")
        print(f"  Failed summarizations: {total_failed}")
        if total_summarized > 0:
            success_rate = (total_summarized / (total_summarized + total_failed)) * 100
            print(f"  Success rate: {success_rate:.1f}%")
    print(f"  Duration: {duration:.2f} seconds")
    if total_crawled > 0:
        print(f"  Average time per article: {duration/total_crawled:.1f}s")
    print("="*60 + "\n")
    return {
        'total_feeds': len(feeds),
        'total_articles_crawled': total_crawled,
        'total_summarized': total_summarized,
        'failed_summaries': total_failed,
        'duration_seconds': round(duration, 2)
    }
 if __name__ == '__main__':
    # Can be run standalone for testing
    import sys
    max_articles = 10
    if len(sys.argv) > 1:
        try:
            max_articles = int(sys.argv[1])
        except ValueError:
            print("Usage: python crawler_service.py [max_articles_per_feed]")
            sys.exit(1)
    crawl_all_feeds(max_articles_per_feed=max_articles)
--- a/news_crawler/docker-compose.yml
+++ b/news_crawler/docker-compose.yml
@@ -0,0 +1,33 @@
 version: '3.8'
 services:
  crawler:
    build: .
    container_name: news-crawler
    environment:
      - MONGODB_URI=mongodb://mongodb:27017/
    networks:
      - munich-news-network
    depends_on:
      - mongodb
    # Run once and exit
    restart: "no"
  mongodb:
    image: mongo:7.0
    container_name: munich-news-mongodb
    restart: unless-stopped
    ports:
      - "27017:27017"
    volumes:
      - mongodb_data:/data/db
    networks:
      - munich-news-network
 volumes:
  mongodb_data:
    driver: local
 networks:
  munich-news-network:
    driver: bridge
--- a/news_crawler/ollama_client.py
+++ b/news_crawler/ollama_client.py
@@ -0,0 +1,290 @@
 """
 Ollama client for AI-powered article summarization
 """
 import requests
 import time
 from datetime import datetime
 class OllamaClient:
    """Client for communicating with Ollama server for text summarization"""
    def __init__(self, base_url, model, api_key=None, enabled=True, timeout=30):
        """
        Initialize Ollama client
        Args:
            base_url: Ollama server URL (e.g., http://localhost:11434)
            model: Model name to use (e.g., phi3:latest)
            api_key: Optional API key for authentication
            enabled: Whether Ollama is enabled
            timeout: Request timeout in seconds (default 30)
        """
        self.base_url = base_url.rstrip('/')
        self.model = model
        self.api_key = api_key
        self.enabled = enabled
        self.timeout = timeout
    def summarize_article(self, content, max_words=150):
        """
        Summarize article content using Ollama
        Args:
            content: Full article text
            max_words: Maximum words in summary (default 150)
        Returns:
            {
                'summary': str,              # AI-generated summary
                'summary_word_count': int,   # Summary word count
                'original_word_count': int,  # Original article word count
                'success': bool,             # Whether summarization succeeded
                'error': str or None,        # Error message if failed
                'duration': float            # Time taken in seconds
            }
        """
        if not self.enabled:
            return {
                'summary': None,
                'summary_word_count': 0,
                'original_word_count': 0,
                'success': False,
                'error': 'Ollama is not enabled',
                'duration': 0
            }
        if not content or len(content.strip()) == 0:
            return {
                'summary': None,
                'summary_word_count': 0,
                'original_word_count': 0,
                'success': False,
                'error': 'Content is empty',
                'duration': 0
            }
        # Calculate original word count
        original_word_count = len(content.split())
        start_time = time.time()
        try:
            # Construct prompt
            prompt = self._build_summarization_prompt(content, max_words)
            # Prepare request
            url = f"{self.base_url}/api/generate"
            headers = {'Content-Type': 'application/json'}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'
            payload = {
                'model': self.model,
                'prompt': prompt,
                'stream': False,
                'options': {
                    'temperature': 0.7,
                    'num_predict': 250  # Limit response length
                }
            }
            # Make request
            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=self.timeout
            )
            response.raise_for_status()
            # Parse response
            result = response.json()
            summary = result.get('response', '').strip()
            if not summary:
                return {
                    'summary': None,
                    'summary_word_count': 0,
                    'original_word_count': original_word_count,
                    'success': False,
                    'error': 'Ollama returned empty summary',
                    'duration': time.time() - start_time
                }
            summary_word_count = len(summary.split())
            return {
                'summary': summary,
                'summary_word_count': summary_word_count,
                'original_word_count': original_word_count,
                'success': True,
                'error': None,
                'duration': time.time() - start_time
            }
        except requests.exceptions.Timeout:
            return {
                'summary': None,
                'summary_word_count': 0,
                'original_word_count': original_word_count,
                'success': False,
                'error': f'Request timed out after {self.timeout} seconds',
                'duration': time.time() - start_time
            }
        except requests.exceptions.ConnectionError:
            return {
                'summary': None,
                'summary_word_count': 0,
                'original_word_count': original_word_count,
                'success': False,
                'error': f'Cannot connect to Ollama server at {self.base_url}',
                'duration': time.time() - start_time
            }
        except requests.exceptions.HTTPError as e:
            return {
                'summary': None,
                'summary_word_count': 0,
                'original_word_count': original_word_count,
                'success': False,
                'error': f'HTTP error: {e.response.status_code} - {e.response.text[:100]}',
                'duration': time.time() - start_time
            }
        except Exception as e:
            return {
                'summary': None,
                'summary_word_count': 0,
                'original_word_count': original_word_count,
                'success': False,
                'error': f'Unexpected error: {str(e)}',
                'duration': time.time() - start_time
            }
    def _build_summarization_prompt(self, content, max_words):
        """Build prompt for article summarization"""
        # Truncate content if too long (keep first 5000 words)
        words = content.split()
        if len(words) > 5000:
            content = ' '.join(words[:5000]) + '...'
        prompt = f"""Summarize the following article in English in {max_words} words or less. Even if the article is in German or another language, provide the summary in English. Focus on the key points, main message, and important details. Be concise and clear.
 Article:
 {content}
 English Summary (max {max_words} words):"""
        return prompt
    def is_available(self):
        """
        Check if Ollama server is reachable
        Returns:
            bool: True if server is reachable, False otherwise
        """
        if not self.enabled:
            return False
        try:
            url = f"{self.base_url}/api/tags"
            headers = {}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()
            return True
        except:
            return False
    def test_connection(self):
        """
        Test connection and return server info
        Returns:
            {
                'available': bool,
                'models': list,
                'current_model': str,
                'error': str or None
            }
        """
        if not self.enabled:
            return {
                'available': False,
                'models': [],
                'current_model': self.model,
                'error': 'Ollama is not enabled'
            }
        try:
            url = f"{self.base_url}/api/tags"
            headers = {}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()
            result = response.json()
            models = [m.get('name', '') for m in result.get('models', [])]
            return {
                'available': True,
                'models': models,
                'current_model': self.model,
                'error': None
            }
        except requests.exceptions.ConnectionError:
            return {
                'available': False,
                'models': [],
                'current_model': self.model,
                'error': f'Cannot connect to Ollama server at {self.base_url}'
            }
        except Exception as e:
            return {
                'available': False,
                'models': [],
                'current_model': self.model,
                'error': str(e)
            }
 if __name__ == '__main__':
    # Quick test
    import os
    from dotenv import load_dotenv
    load_dotenv(dotenv_path='../.env')
    client = OllamaClient(
        base_url=os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434'),
        model=os.getenv('OLLAMA_MODEL', 'phi3:latest'),
        enabled=True
    )
    print("Testing Ollama connection...")
    result = client.test_connection()
    print(f"Available: {result['available']}")
    print(f"Models: {result['models']}")
    print(f"Current model: {result['current_model']}")
    if result['available']:
        print("\nTesting summarization...")
        test_content = """
        The new U-Bahn line connecting Munich's city center with the airport opened today.
        Mayor Dieter Reiter attended the opening ceremony along with hundreds of residents.
        The line will significantly reduce travel time between the airport and downtown Munich.
        Construction took five years and cost approximately 2 billion euros.
        The new line includes 10 stations and runs every 10 minutes during peak hours.
        """
        summary_result = client.summarize_article(test_content, max_words=50)
        print(f"Success: {summary_result['success']}")
        print(f"Summary: {summary_result['summary']}")
        print(f"Original word count: {summary_result['original_word_count']}")
        print(f"Summary word count: {summary_result['summary_word_count']}")
        print(f"Compression: {summary_result['original_word_count'] / max(summary_result['summary_word_count'], 1):.1f}x")
        print(f"Duration: {summary_result['duration']:.2f}s")
--- a/news_crawler/requirements.txt
+++ b/news_crawler/requirements.txt
@@ -0,0 +1,6 @@
 beautifulsoup4==4.12.2
 lxml==4.9.3
 requests==2.31.0
 feedparser==6.0.10
 pymongo==4.6.1
 python-dotenv==1.0.0
--- a/news_crawler/rss_utils.py
+++ b/news_crawler/rss_utils.py
@@ -0,0 +1,98 @@
 """
 Utility functions for RSS feed processing
 """
 def extract_article_url(entry):
    """
    Extract article URL from RSS entry.
    Different RSS feeds use different fields for the article URL.
    Args:
        entry: feedparser entry object
    Returns:
        str: Article URL or None if not found
    Examples:
        - Most feeds use 'link'
        - Some use 'guid' as the URL
        - Some use 'id' as the URL
        - Some have guid as a dict with 'href'
    """
    # Try 'link' first (most common)
    if entry.get('link') and entry.get('link', '').startswith('http'):
        return entry.get('link')
    # Try 'guid' if it's a valid URL
    if entry.get('guid'):
        guid = entry.get('guid')
        # guid can be a string
        if isinstance(guid, str) and guid.startswith('http'):
            return guid
        # or a dict with 'href'
        elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
            return guid.get('href')
    # Try 'id' if it's a valid URL
    if entry.get('id') and entry.get('id', '').startswith('http'):
        return entry.get('id')
    # Try 'links' array (some feeds have multiple links)
    if entry.get('links'):
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                # Prefer 'alternate' type, but accept any http link
                if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
                    return link.get('href')
        # If no alternate found, return first http link
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                return link.get('href')
    return None
 def extract_article_summary(entry):
    """
    Extract article summary/description from RSS entry.
    Args:
        entry: feedparser entry object
    Returns:
        str: Article summary or empty string
    """
    # Try different fields
    if entry.get('summary'):
        return entry.get('summary', '')
    elif entry.get('description'):
        return entry.get('description', '')
    elif entry.get('content'):
        # content is usually a list of dicts
        content = entry.get('content', [])
        if content and isinstance(content, list) and len(content) > 0:
            return content[0].get('value', '')
    return ''
 def extract_published_date(entry):
    """
    Extract published date from RSS entry.
    Args:
        entry: feedparser entry object
    Returns:
        str: Published date or empty string
    """
    # Try different fields
    if entry.get('published'):
        return entry.get('published', '')
    elif entry.get('updated'):
        return entry.get('updated', '')
    elif entry.get('created'):
        return entry.get('created', '')
    return ''
--- a/news_crawler/test_crawler.py
+++ b/news_crawler/test_crawler.py
@@ -0,0 +1,83 @@
 #!/usr/bin/env python
 """
 Test script to verify crawler functionality
 """
 from crawler_service import extract_article_content, get_active_rss_feeds
 import sys
 def test_content_extraction():
    """Test content extraction from a sample URL"""
    print("Testing content extraction...")
    # Test with a simple news site
    test_url = "https://www.bbc.com/news"
    print(f"Extracting content from: {test_url}")
    result = extract_article_content(test_url, timeout=10)
    if result:
        print("✓ Content extraction successful!")
        print(f"  Title: {result.get('title', 'N/A')[:50]}...")
        print(f"  Content length: {len(result.get('content', ''))} chars")
        print(f"  Word count: {result.get('word_count', 0)}")
        return True
    else:
        print("✗ Content extraction failed")
        return False
 def test_database_connection():
    """Test MongoDB connection"""
    print("\nTesting database connection...")
    try:
        feeds = get_active_rss_feeds()
        print(f"✓ Database connection successful!")
        print(f"  Found {len(feeds)} active RSS feed(s)")
        if feeds:
            print("\n  Active feeds:")
            for feed in feeds:
                print(f"    - {feed['name']}: {feed['url']}")
        else:
            print("\n  ⚠ No active feeds found. Add feeds via the backend API:")
            print("    curl -X POST http://localhost:5001/api/rss-feeds \\")
            print("      -H 'Content-Type: application/json' \\")
            print("      -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'")
        return True
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return False
 def main():
    print("="*60)
    print("News Crawler - Test Suite")
    print("="*60 + "\n")
    # Test database connection
    db_ok = test_database_connection()
    # Test content extraction
    extract_ok = test_content_extraction()
    print("\n" + "="*60)
    print("Test Results:")
    print(f"  Database Connection: {'✓ PASS' if db_ok else '✗ FAIL'}")
    print(f"  Content Extraction:  {'✓ PASS' if extract_ok else '✗ FAIL'}")
    print("="*60 + "\n")
    if db_ok and extract_ok:
        print("✓ All tests passed! Crawler is ready to use.")
        print("\nRun the crawler with:")
        print("  python crawler_service.py")
        return 0
    else:
        print("✗ Some tests failed. Please check the errors above.")
        return 1
 if __name__ == '__main__':
    sys.exit(main())
--- a/news_crawler/test_ollama.py
+++ b/news_crawler/test_ollama.py
@@ -0,0 +1,129 @@
 #!/usr/bin/env python
 """
 Test script for Ollama integration
 Tests connection, configuration, and summarization
 """
 from config import Config
 from ollama_client import OllamaClient
 print("\n" + "="*70)
 print("Ollama Integration Test")
 print("="*70)
 # Print configuration
 Config.print_config()
 # Validate configuration
 issues = Config.validate()
 if issues:
    print("⚠ Configuration Issues:")
    for issue in issues:
        print(f"  - {issue}")
    print()
 # Initialize client
 client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
 )
 # Test 1: Check if Ollama is enabled
 print("Test 1: Configuration Check")
 print(f"  Ollama Enabled: {Config.OLLAMA_ENABLED}")
 if not Config.OLLAMA_ENABLED:
    print("  ⚠ Ollama is disabled. Set OLLAMA_ENABLED=true in .env to enable.")
    print("\n" + "="*70)
    exit(0)
 # Test 2: Test connection
 print("\nTest 2: Connection Test")
 conn_result = client.test_connection()
 print(f"  Available: {conn_result['available']}")
 print(f"  Current Model: {conn_result['current_model']}")
 if conn_result['available']:
    print(f"  ✓ Connected to Ollama server")
    if conn_result['models']:
        print(f"  Available models: {', '.join(conn_result['models'][:5])}")
        if conn_result['current_model'] not in conn_result['models']:
            print(f"  ⚠ Warning: Model '{conn_result['current_model']}' not found in available models")
 else:
    print(f"  ✗ Connection failed: {conn_result['error']}")
    print("\n" + "="*70)
    exit(1)
 # Test 3: Test summarization with sample article
 print("\nTest 3: Summarization Test")
 print("  Testing with sample German article...")
 sample_article = """
 Die neue U-Bahn-Linie, die das Münchner Stadtzentrum mit dem Flughafen verbindet, wurde heute eröffnet. 
 Oberbürgermeister Dieter Reiter nahm zusammen mit hunderten Anwohnern an der Eröffnungszeremonie teil. 
 Die Linie wird die Reisezeit zwischen dem Flughafen und der Münchner Innenstadt erheblich verkürzen. 
 Der Bau dauerte fünf Jahre und kostete etwa 2 Milliarden Euro. Die neue Linie umfasst 10 Stationen 
 und verkehrt während der Hauptverkehrszeiten alle 10 Minuten. Experten erwarten, dass die neue Verbindung 
 den Verkehr in der Stadt deutlich entlasten wird. Die Münchner Verkehrsgesellschaft rechnet mit täglich 
 über 50.000 Fahrgästen auf der neuen Strecke.
 """
 result = client.summarize_article(sample_article, max_words=Config.SUMMARY_MAX_WORDS)
 print(f"\n  Success: {result['success']}")
 if result['success']:
    print(f"  ✓ Summarization successful!")
    print(f"\n  Original word count: {result['original_word_count']}")
    print(f"  Summary word count: {result['summary_word_count']}")
    print(f"  Compression ratio: {result['original_word_count'] / max(result['summary_word_count'], 1):.1f}x")
    print(f"  Duration: {result['duration']:.2f}s")
    print(f"\n  Summary (English):")
    print(f"  {'-'*70}")
    print(f"  {result['summary']}")
    print(f"  {'-'*70}")
 else:
    print(f"  ✗ Summarization failed: {result['error']}")
 # Test 4: Test with English article
 print("\nTest 4: English Article Test")
 print("  Testing with English article...")
 english_article = """
 The city council approved a new bike lane network spanning 50 kilometers across Munich. 
 The project aims to promote sustainable transportation and reduce car traffic in the city center. 
 Construction will begin next month and is expected to be completed within two years. 
 The bike lanes will connect major residential areas with business districts and public transport hubs. 
 Environmental groups have praised the initiative as a significant step toward carbon neutrality.
 """
 result2 = client.summarize_article(english_article, max_words=50)
 print(f"\n  Success: {result2['success']}")
 if result2['success']:
    print(f"  ✓ Summarization successful!")
    print(f"  Original: {result2['original_word_count']} words → Summary: {result2['summary_word_count']} words")
    print(f"  Duration: {result2['duration']:.2f}s")
    print(f"\n  Summary:")
    print(f"  {result2['summary']}")
 else:
    print(f"  ✗ Summarization failed: {result2['error']}")
 # Summary
 print("\n" + "="*70)
 print("Test Summary")
 print("="*70)
 print(f"✓ Configuration: Valid")
 print(f"✓ Connection: {'Success' if conn_result['available'] else 'Failed'}")
 print(f"✓ German→English: {'Success' if result['success'] else 'Failed'}")
 print(f"✓ English→English: {'Success' if result2['success'] else 'Failed'}")
 print("="*70)
 if result['success'] and result2['success']:
    print("\n🎉 All tests passed! Ollama integration is working correctly.")
    print("\nYou can now run the crawler with AI summarization:")
    print("  python crawler_service.py 5")
 else:
    print("\n⚠ Some tests failed. Check the errors above.")
 print()
--- a/news_crawler/test_rss_feeds.py
+++ b/news_crawler/test_rss_feeds.py
@@ -0,0 +1,154 @@
 #!/usr/bin/env python
 """
 Test script to verify RSS feed URL extraction
 Tests actual feeds from the database
 """
 import feedparser
 from pymongo import MongoClient
 import os
 from dotenv import load_dotenv
 from rss_utils import extract_article_url, extract_article_summary, extract_published_date
 # Load environment variables
 load_dotenv(dotenv_path='../.env')
 # MongoDB setup
 MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
 DB_NAME = 'munich_news'
 client = MongoClient(MONGODB_URI)
 db = client[DB_NAME]
 rss_feeds_collection = db['rss_feeds']
 def test_feed(feed_name, feed_url):
    """Test a single RSS feed"""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)
    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)
        if not feed.entries:
            print("❌ No entries found in feed")
            return False
        print(f"✓ Found {len(feed.entries)} entries\n")
        # Test first 5 entries
        success_count = 0
        fail_count = 0
        for i, entry in enumerate(feed.entries[:5], 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")
            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                print(f"❌ No valid URL found")
                print(f"   Available fields: {list(entry.keys())}")
                print(f"   link: {entry.get('link', 'N/A')}")
                print(f"   guid: {entry.get('guid', 'N/A')}")
                print(f"   id: {entry.get('id', 'N/A')}")
                fail_count += 1
            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print(f"⚠ No summary found")
            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print(f"⚠ No published date found")
        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f"  ✓ Success: {success_count}/5")
        print(f"  ❌ Failed: {fail_count}/5")
        print('='*70)
        return fail_count == 0
    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False
 def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)
    # Get all RSS feeds from database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())
    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1
    print(f"✓ Found {len(feeds)} feed(s) in database\n")
    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)
        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue
        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue
        results[feed_name] = test_feed(feed_name, feed_url)
    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)
    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")
    total = len(results)
    passed = sum(1 for s in results.values() if s)
    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")
    if passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print("  python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1
 if __name__ == '__main__':
    import sys
    sys.exit(main())
--- a/news_sender/.gitignore
+++ b/news_sender/.gitignore
@@ -0,0 +1,28 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 env/
 venv/
 ENV/
 .venv
 # Environment variables
 .env
 .env.local
 # Generated files
 newsletter_preview.html
 # IDE
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # OS
 .DS_Store
 Thumbs.db
--- a/news_sender/README.md
+++ b/news_sender/README.md
@@ -0,0 +1,303 @@
 # News Sender Microservice
 Standalone service for sending Munich News Daily newsletters to subscribers.
 ## Features
 - 📧 Sends beautiful HTML newsletters
 - 🤖 Uses AI-generated article summaries
 - 📊 Tracks sending statistics
 - 🧪 Test mode for development
 - 📝 Preview generation
 - 🔄 Fetches data from shared MongoDB
 ## Installation
 ```bash
 cd news_sender
 pip install -r requirements.txt
 ```
 ## Configuration
 The service uses the same `.env` file as the backend (`../backend/.env`):
 ```env
 # MongoDB
 MONGODB_URI=mongodb://localhost:27017/
 # Email (Gmail example)
 SMTP_SERVER=smtp.gmail.com
 SMTP_PORT=587
 EMAIL_USER=your-email@gmail.com
 EMAIL_PASSWORD=your-app-password
 # Newsletter Settings (optional)
 NEWSLETTER_MAX_ARTICLES=10
 WEBSITE_URL=http://localhost:3000
 ```
 **Gmail Setup:**
 1. Enable 2-factor authentication
 2. Generate an App Password: https://support.google.com/accounts/answer/185833
 3. Use the App Password (not your regular password)
 ## Usage
 ### 1. Preview Newsletter
 Generate HTML preview without sending:
 ```bash
 python sender_service.py preview
 ```
 This creates `newsletter_preview.html` - open it in your browser to see how the newsletter looks.
 ### 2. Send Test Email
 Send to a single email address for testing:
 ```bash
 python sender_service.py test your-email@example.com
 ```
 ### 3. Send to All Subscribers
 Send newsletter to all active subscribers:
 ```bash
 # Send with default article count (10)
 python sender_service.py send
 # Send with custom article count
 python sender_service.py send 15
 ```
 ### 4. Use as Python Module
 ```python
 from sender_service import send_newsletter, preview_newsletter
 # Send newsletter
 result = send_newsletter(max_articles=10)
 print(f"Sent to {result['sent_count']} subscribers")
 # Generate preview
 html = preview_newsletter(max_articles=5)
 ```
 ## How It Works
 ```
 ┌─────────────────────────────────────────────────────────┐
 │  1. Fetch Articles from MongoDB                         │
 │     - Get latest articles with AI summaries             │
 │     - Sort by creation date (newest first)              │
 └─────────────────────────────────────────────────────────┘
                         ↓
 ┌─────────────────────────────────────────────────────────┐
 │  2. Fetch Active Subscribers                            │
 │     - Get all subscribers with status='active'          │
 └─────────────────────────────────────────────────────────┘
                         ↓
 ┌─────────────────────────────────────────────────────────┐
 │  3. Render Newsletter HTML                              │
 │     - Load newsletter_template.html                     │
 │     - Populate with articles and metadata               │
 │     - Generate beautiful HTML email                     │
 └─────────────────────────────────────────────────────────┘
                         ↓
 ┌─────────────────────────────────────────────────────────┐
 │  4. Send Emails                                         │
 │     - Connect to SMTP server                            │
 │     - Send to each subscriber                           │
 │     - Track success/failure                             │
 └─────────────────────────────────────────────────────────┘
                         ↓
 ┌─────────────────────────────────────────────────────────┐
 │  5. Report Statistics                                   │
 │     - Total sent                                        │
 │     - Failed sends                                      │
 │     - Error details                                     │
 └─────────────────────────────────────────────────────────┘
 ```
 ## Output Example
 ```
 ======================================================================
 📧 Munich News Daily - Newsletter Sender
 ======================================================================
 Fetching latest 10 articles with AI summaries...
 ✓ Found 10 articles
 Fetching active subscribers...
 ✓ Found 150 active subscriber(s)
 Rendering newsletter HTML...
 ✓ Newsletter rendered
 Sending newsletter: 'Munich News Daily - November 10, 2024'
 ----------------------------------------------------------------------
 [1/150] Sending to user1@example.com... ✓
 [2/150] Sending to user2@example.com... ✓
 [3/150] Sending to user3@example.com... ✓
 ...
 ======================================================================
 📊 Sending Complete
 ======================================================================
 ✓ Successfully sent: 148
 ✗ Failed: 2
 📰 Articles included: 10
 ======================================================================
 ```
 ## Scheduling
 ### Using Cron (Linux/Mac)
 Send newsletter daily at 8 AM:
 ```bash
 # Edit crontab
 crontab -e
 # Add this line
 0 8 * * * cd /path/to/news_sender && /path/to/venv/bin/python sender_service.py send
 ```
 ### Using systemd Timer (Linux)
 Create `/etc/systemd/system/news-sender.service`:
 ```ini
 [Unit]
 Description=Munich News Sender
 [Service]
 Type=oneshot
 WorkingDirectory=/path/to/news_sender
 ExecStart=/path/to/venv/bin/python sender_service.py send
 User=your-user
 ```
 Create `/etc/systemd/system/news-sender.timer`:
 ```ini
 [Unit]
 Description=Send Munich News Daily at 8 AM
 [Timer]
 OnCalendar=daily
 OnCalendar=*-*-* 08:00:00
 [Install]
 WantedBy=timers.target
 ```
 Enable and start:
 ```bash
 sudo systemctl enable news-sender.timer
 sudo systemctl start news-sender.timer
 ```
 ### Using Docker
 Create `Dockerfile`:
 ```dockerfile
 FROM python:3.11-slim
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY sender_service.py newsletter_template.html ./
 CMD ["python", "sender_service.py", "send"]
 ```
 Build and run:
 ```bash
 docker build -t news-sender .
 docker run --env-file ../backend/.env news-sender
 ```
 ## Troubleshooting
 ### "Email credentials not configured"
 - Check that `EMAIL_USER` and `EMAIL_PASSWORD` are set in `.env`
 - For Gmail, use an App Password, not your regular password
 ### "No articles with summaries found"
 - Run the crawler first: `cd ../news_crawler && python crawler_service.py 10`
 - Make sure Ollama is enabled and working
 - Check MongoDB has articles with `summary` field
 ### "No active subscribers found"
 - Add subscribers via the backend API
 - Check subscriber status is 'active' in MongoDB
 ### SMTP Connection Errors
 - Verify SMTP server and port are correct
 - Check firewall isn't blocking SMTP port
 - For Gmail, ensure "Less secure app access" is enabled or use App Password
 ### Emails Going to Spam
 - Set up SPF, DKIM, and DMARC records for your domain
 - Use a verified email address
 - Avoid spam trigger words in subject/content
 - Include unsubscribe link (already included in template)
 ## Architecture
 This is a standalone microservice that:
 - Runs independently of the backend
 - Shares the same MongoDB database
 - Can be deployed separately
 - Can be scheduled independently
 - Has no dependencies on backend code
 ## Integration with Other Services
 ```
 ┌──────────────┐     ┌──────────────┐     ┌──────────────┐
 │   Backend    │     │   Crawler    │     │    Sender    │
 │   (Flask)    │     │  (Scraper)   │     │   (Email)    │
 └──────┬───────┘     └──────┬───────┘     └──────┬───────┘
       │                    │                     │
       │                    │                     │
       └────────────────────┴─────────────────────┘
                            │
                    ┌───────▼────────┐
                    │    MongoDB     │
                    │  (Shared DB)   │
                    └────────────────┘
 ```
 ## Next Steps
 1. **Test the newsletter:**
   ```bash
   python sender_service.py test your-email@example.com
   ```
 2. **Schedule daily sending:**
   - Set up cron job or systemd timer
   - Choose appropriate time (e.g., 8 AM)
 3. **Monitor sending:**
   - Check logs for errors
   - Track open rates (requires email tracking service)
   - Monitor spam complaints
 4. **Optimize:**
   - Add email tracking pixels
   - A/B test subject lines
   - Personalize content per subscriber
--- a/news_sender/newsletter_template.html
+++ b/news_sender/newsletter_template.html
@@ -0,0 +1,162 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <title>Munich News Daily</title>
    <!--[if mso]>
    <style type="text/css">
        body, table, td {font-family: Arial, Helvetica, sans-serif !important;}
    </style>
    <![endif]-->
 </head>
 <body style="margin: 0; padding: 0; background-color: #f4f4f4; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;">
    <!-- Wrapper Table -->
    <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f4f4f4;">
        <tr>
            <td align="center" style="padding: 20px 0;">
                <!-- Main Container -->
                <table role="presentation" width="600" cellpadding="0" cellspacing="0" border="0" style="background-color: #ffffff; max-width: 600px;">
                    <!-- Header -->
                    <tr>
                        <td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
                            <h1 style="margin: 0 0 8px 0; font-size: 28px; font-weight: 700; color: #ffffff; letter-spacing: -0.5px;">
                                Munich News Daily
                            </h1>
                            <p style="margin: 0; font-size: 14px; color: #999999; letter-spacing: 0.5px;">
                                {{ date }}
                            </p>
                        </td>
                    </tr>
                    <!-- Greeting -->
                    <tr>
                        <td style="padding: 30px 40px 20px 40px;">
                            <p style="margin: 0; font-size: 16px; line-height: 1.5; color: #333333;">
                                Good morning ☀️
                            </p>
                            <p style="margin: 15px 0 0 0; font-size: 15px; line-height: 1.6; color: #666666;">
                                Here's what's happening in Munich today. We've summarized {{ article_count }} stories using AI so you can stay informed in under 5 minutes.
                            </p>
                        </td>
                    </tr>
                    <!-- Divider -->
                    <tr>
                        <td style="padding: 0 40px;">
                            <div style="height: 1px; background-color: #e0e0e0;"></div>
                        </td>
                    </tr>
                    <!-- Articles -->
                    {% for article in articles %}
                    <tr>
                        <td style="padding: 25px 40px;">
                            <!-- Article Number Badge -->
                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0">
                                <tr>
                                    <td>
                                        <span style="display: inline-block; background-color: #000000; color: #ffffff; width: 24px; height: 24px; line-height: 24px; text-align: center; border-radius: 50%; font-size: 12px; font-weight: 600;">
                                            {{ loop.index }}
                                        </span>
                                    </td>
                                </tr>
                            </table>
                            <!-- Article Title -->
                            <h2 style="margin: 12px 0 8px 0; font-size: 19px; font-weight: 700; line-height: 1.3; color: #1a1a1a;">
                                {{ article.title }}
                            </h2>
                            <!-- Article Meta -->
                            <p style="margin: 0 0 12px 0; font-size: 13px; color: #999999;">
                                <span style="color: #000000; font-weight: 600;">{{ article.source }}</span>
                                {% if article.author %}
                                <span> • {{ article.author }}</span>
                                {% endif %}
                            </p>
                            <!-- Article Summary -->
                            <p style="margin: 0 0 15px 0; font-size: 15px; line-height: 1.6; color: #333333;">
                                {{ article.summary }}
                            </p>
                            <!-- Read More Link -->
                            <a href="{{ article.link }}" style="display: inline-block; color: #000000; text-decoration: none; font-size: 14px; font-weight: 600; border-bottom: 2px solid #000000; padding-bottom: 2px;">
                                Read more →
                            </a>
                        </td>
                    </tr>
                    <!-- Article Divider -->
                    {% if not loop.last %}
                    <tr>
                        <td style="padding: 0 40px;">
                            <div style="height: 1px; background-color: #f0f0f0;"></div>
                        </td>
                    </tr>
                    {% endif %}
                    {% endfor %}
                    <!-- Bottom Divider -->
                    <tr>
                        <td style="padding: 25px 40px 0 40px;">
                            <div style="height: 1px; background-color: #e0e0e0;"></div>
                        </td>
                    </tr>
                    <!-- Summary Box -->
                    <tr>
                        <td style="padding: 30px 40px;">
                            <table role="presentation" width="100%" cellpadding="0" cellspacing="0" border="0" style="background-color: #f8f8f8; border-radius: 8px;">
                                <tr>
                                    <td style="padding: 25px; text-align: center;">
                                        <p style="margin: 0 0 8px 0; font-size: 13px; color: #666666; text-transform: uppercase; letter-spacing: 1px; font-weight: 600;">
                                            Today's Digest
                                        </p>
                                        <p style="margin: 0; font-size: 36px; font-weight: 700; color: #000000;">
                                            {{ article_count }}
                                        </p>
                                        <p style="margin: 8px 0 0 0; font-size: 14px; color: #666666;">
                                            stories • AI-summarized • 5 min read
                                        </p>
                                    </td>
                                </tr>
                            </table>
                        </td>
                    </tr>
                    <!-- Footer -->
                    <tr>
                        <td style="background-color: #1a1a1a; padding: 30px 40px; text-align: center;">
                            <p style="margin: 0 0 15px 0; font-size: 14px; color: #ffffff; font-weight: 600;">
                                Munich News Daily
                            </p>
                            <p style="margin: 0 0 20px 0; font-size: 13px; color: #999999; line-height: 1.5;">
                                AI-powered news summaries for busy people.<br>
                                Delivered daily to your inbox.
                            </p>
                            <!-- Footer Links -->
                            <p style="margin: 0; font-size: 12px; color: #666666;">
                                <a href="{{ website_link }}" style="color: #999999; text-decoration: none;">Visit Website</a>
                                <span style="color: #444444;"> • </span>
                                <a href="{{ unsubscribe_link }}" style="color: #999999; text-decoration: none;">Unsubscribe</a>
                            </p>
                            <p style="margin: 20px 0 0 0; font-size: 11px; color: #666666;">
                                © {{ year }} Munich News Daily. All rights reserved.
                            </p>
                        </td>
                    </tr>
                </table>
                <!-- End Main Container -->
            </td>
        </tr>
    </table>
    <!-- End Wrapper Table -->
 </body>
 </html>
--- a/news_sender/requirements.txt
+++ b/news_sender/requirements.txt
@@ -0,0 +1,3 @@
 pymongo==4.6.1
 python-dotenv==1.0.0
 Jinja2==3.1.2
--- a/news_sender/sender_service.py
+++ b/news_sender/sender_service.py
@@ -0,0 +1,313 @@
 #!/usr/bin/env python
 """
 News Sender Service - Standalone microservice for sending newsletters
 Fetches articles from MongoDB and sends to subscribers via email
 """
 import smtplib
 from email.mime.text import MIMEText
 from email.mime.multipart import MIMEMultipart
 from datetime import datetime
 from pathlib import Path
 from jinja2 import Template
 from pymongo import MongoClient
 import os
 from dotenv import load_dotenv
 # Load environment variables from backend/.env
 backend_dir = Path(__file__).parent.parent / 'backend'
 env_path = backend_dir / '.env'
 if env_path.exists():
    load_dotenv(dotenv_path=env_path)
    print(f"✓ Loaded configuration from: {env_path}")
 else:
    print(f"⚠ Warning: .env file not found at {env_path}")
 class Config:
    """Configuration for news sender"""
    # MongoDB
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'
    # Email
    SMTP_SERVER = os.getenv('SMTP_SERVER', 'smtp.gmail.com')
    SMTP_PORT = int(os.getenv('SMTP_PORT', '587'))
    EMAIL_USER = os.getenv('EMAIL_USER', '')
    EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD', '')
    # Newsletter
    MAX_ARTICLES = int(os.getenv('NEWSLETTER_MAX_ARTICLES', '10'))
    WEBSITE_URL = os.getenv('WEBSITE_URL', 'http://localhost:3000')
 # MongoDB connection
 client = MongoClient(Config.MONGODB_URI)
 db = client[Config.DB_NAME]
 articles_collection = db['articles']
 subscribers_collection = db['subscribers']
 def get_latest_articles(max_articles=10):
    """
    Get latest articles with AI summaries from database
    Returns:
        list: Articles with summaries
    """
    cursor = articles_collection.find(
        {'summary': {'$exists': True, '$ne': None}}
    ).sort('created_at', -1).limit(max_articles)
    articles = []
    for doc in cursor:
        articles.append({
            'title': doc.get('title', ''),
            'author': doc.get('author'),
            'link': doc.get('link', ''),
            'summary': doc.get('summary', ''),
            'source': doc.get('source', ''),
            'published_at': doc.get('published_at', '')
        })
    return articles
 def get_active_subscribers():
    """
    Get all active subscribers from database
    Returns:
        list: Email addresses of active subscribers
    """
    cursor = subscribers_collection.find({'status': 'active'})
    return [doc['email'] for doc in cursor]
 def render_newsletter_html(articles):
    """
    Render newsletter HTML from template
    Args:
        articles: List of article dictionaries
    Returns:
        str: Rendered HTML content
    """
    # Load template
    template_path = Path(__file__).parent / 'newsletter_template.html'
    with open(template_path, 'r', encoding='utf-8') as f:
        template_content = f.read()
    template = Template(template_content)
    # Prepare template data
    now = datetime.now()
    template_data = {
        'date': now.strftime('%A, %B %d, %Y'),
        'year': now.year,
        'article_count': len(articles),
        'articles': articles,
        'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe',
        'website_link': Config.WEBSITE_URL
    }
    # Render HTML
    return template.render(**template_data)
 def send_email(to_email, subject, html_content):
    """
    Send email to a single recipient
    Args:
        to_email: Recipient email address
        subject: Email subject
        html_content: HTML content of email
    Returns:
        tuple: (success: bool, error: str or None)
    """
    try:
        msg = MIMEMultipart('alternative')
        msg['Subject'] = subject
        msg['From'] = f'Munich News Daily <{Config.EMAIL_USER}>'
        msg['To'] = to_email
        msg['Date'] = datetime.now().strftime('%a, %d %b %Y %H:%M:%S %z')
        msg['Message-ID'] = f'<{datetime.now().timestamp()}.{to_email}@dongho.kim>'
        msg['X-Mailer'] = 'Munich News Daily Sender'
        # Add plain text version as fallback
        plain_text = "This email requires HTML support. Please view it in an HTML-capable email client."
        msg.attach(MIMEText(plain_text, 'plain', 'utf-8'))
        # Add HTML version
        msg.attach(MIMEText(html_content, 'html', 'utf-8'))
        server = smtplib.SMTP(Config.SMTP_SERVER, Config.SMTP_PORT)
        server.starttls()
        server.login(Config.EMAIL_USER, Config.EMAIL_PASSWORD)
        server.send_message(msg)
        server.quit()
        return True, None
    except Exception as e:
        return False, str(e)
 def send_newsletter(max_articles=None, test_email=None):
    """
    Send newsletter to all active subscribers
    Args:
        max_articles: Maximum number of articles to include (default from config)
        test_email: If provided, send only to this email (for testing)
    Returns:
        dict: Statistics about sending
    """
    print("\n" + "="*70)
    print("📧 Munich News Daily - Newsletter Sender")
    print("="*70)
    # Validate email configuration
    if not Config.EMAIL_USER or not Config.EMAIL_PASSWORD:
        print("❌ Email credentials not configured")
        print("   Set EMAIL_USER and EMAIL_PASSWORD in .env file")
        return {
            'success': False,
            'error': 'Email credentials not configured'
        }
    # Get articles
    max_articles = max_articles or Config.MAX_ARTICLES
    print(f"\nFetching latest {max_articles} articles with AI summaries...")
    articles = get_latest_articles(max_articles)
    if not articles:
        print("❌ No articles with summaries found")
        print("   Run the crawler with Ollama enabled first")
        return {
            'success': False,
            'error': 'No articles with summaries'
        }
    print(f"✓ Found {len(articles)} articles")
    # Get subscribers
    if test_email:
        subscribers = [test_email]
        print(f"\n🧪 Test mode: Sending to {test_email} only")
    else:
        print("\nFetching active subscribers...")
        subscribers = get_active_subscribers()
        print(f"✓ Found {len(subscribers)} active subscriber(s)")
    if not subscribers:
        print("❌ No active subscribers found")
        return {
            'success': False,
            'error': 'No active subscribers'
        }
    # Render newsletter
    print("\nRendering newsletter HTML...")
    html_content = render_newsletter_html(articles)
    print("✓ Newsletter rendered")
    # Send to subscribers
    subject = f"Munich News Daily - {datetime.now().strftime('%B %d, %Y')}"
    print(f"\nSending newsletter: '{subject}'")
    print("-" * 70)
    sent_count = 0
    failed_count = 0
    errors = []
    for i, email in enumerate(subscribers, 1):
        print(f"[{i}/{len(subscribers)}] Sending to {email}...", end=' ')
        success, error = send_email(email, subject, html_content)
        if success:
            print("✓")
            sent_count += 1
        else:
            print(f"✗ {error}")
            failed_count += 1
            errors.append({'email': email, 'error': error})
    # Summary
    print("\n" + "="*70)
    print("📊 Sending Complete")
    print("="*70)
    print(f"✓ Successfully sent: {sent_count}")
    print(f"✗ Failed: {failed_count}")
    print(f"📰 Articles included: {len(articles)}")
    print("="*70 + "\n")
    return {
        'success': True,
        'sent_count': sent_count,
        'failed_count': failed_count,
        'total_subscribers': len(subscribers),
        'article_count': len(articles),
        'errors': errors
    }
 def preview_newsletter(max_articles=None):
    """
    Generate newsletter HTML for preview (doesn't send)
    Args:
        max_articles: Maximum number of articles to include
    Returns:
        str: HTML content
    """
    max_articles = max_articles or Config.MAX_ARTICLES
    articles = get_latest_articles(max_articles)
    if not articles:
        return "<h1>No articles with summaries found</h1><p>Run the crawler with Ollama enabled first.</p>"
    return render_newsletter_html(articles)
 if __name__ == '__main__':
    import sys
    # Parse command line arguments
    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'preview':
            # Generate preview HTML
            html = preview_newsletter()
            output_file = 'newsletter_preview.html'
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(html)
            print(f"✓ Preview saved to {output_file}")
            print(f"  Open it in your browser to see the newsletter")
        elif command == 'test':
            # Send test email
            if len(sys.argv) < 3:
                print("Usage: python sender_service.py test <email>")
                sys.exit(1)
            test_email = sys.argv[2]
            send_newsletter(test_email=test_email)
        elif command == 'send':
            # Send to all subscribers
            max_articles = int(sys.argv[2]) if len(sys.argv) > 2 else None
            send_newsletter(max_articles=max_articles)
        else:
            print("Unknown command. Usage:")
            print("  python sender_service.py preview          - Generate HTML preview")
            print("  python sender_service.py test <email>     - Send test email")
            print("  python sender_service.py send [count]     - Send to all subscribers")
    else:
        # Default: send newsletter
        send_newsletter()
--- a/test_feeds_quick.py
+++ b/test_feeds_quick.py
@@ -0,0 +1,96 @@
 #!/usr/bin/env python
 """
 Quick test script - Run from project root with backend venv activated
 Usage: 
  cd /path/to/munich-news
  source backend/venv/bin/activate  # or backend/venv/Scripts/activate on Windows
  python test_feeds_quick.py
 """
 import sys
 sys.path.insert(0, 'backend')
 from pymongo import MongoClient
 from config import Config
 import feedparser
 from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
 print("="*80)
 print("RSS Feed Test - Checking Database Feeds")
 print("="*80)
 # Connect to database
 client = MongoClient(Config.MONGODB_URI)
 db = client[Config.DB_NAME]
 # Get RSS feeds
 feeds = list(db['rss_feeds'].find())
 if not feeds:
    print("\n❌ No RSS feeds in database!")
    print("\nAdd a feed first:")
    print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
    print("    -H 'Content-Type: application/json' \\")
    print("    -d '{\"name\": \"Test Feed\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
    sys.exit(1)
 print(f"\n✓ Found {len(feeds)} feed(s) in database\n")
 # Test each feed
 for feed_doc in feeds:
    name = feed_doc.get('name', 'Unknown')
    url = feed_doc.get('url', '')
    active = feed_doc.get('active', True)
    print(f"\n{'='*80}")
    print(f"Feed: {name}")
    print(f"URL: {url}")
    print(f"Active: {active}")
    print('='*80)
    if not active:
        print("⏭ Skipping (inactive)")
        continue
    try:
        # Parse RSS
        print("Fetching RSS feed...")
        feed = feedparser.parse(url)
        if not feed.entries:
            print("❌ No entries found")
            continue
        print(f"✓ Found {len(feed.entries)} entries\n")
        # Test first 3 entries
        for i, entry in enumerate(feed.entries[:3], 1):
            print(f"\n--- Entry {i} ---")
            title = entry.get('title', 'No title')
            print(f"Title: {title[:70]}")
            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL extracted: {article_url}")
            else:
                print(f"❌ Could not extract URL")
                print(f"   Available fields: {list(entry.keys())[:10]}")
                print(f"   link: {entry.get('link', 'N/A')}")
                print(f"   guid: {entry.get('guid', 'N/A')}")
            # Test summary
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            # Test date
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Date: {pub_date}")
    except Exception as e:
        print(f"❌ Error: {e}")
 print("\n" + "="*80)
 print("Test complete!")
 print("="*80)