This commit is contained in:
2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions

110
tests/crawler/README.md Normal file
View File

@@ -0,0 +1,110 @@
# Crawler Tests
Test suite for the news crawler, AI clustering, and neutral summary generation.
## Test Files
### AI Clustering & Aggregation Tests
- **`test_clustering_real.py`** - Tests AI-powered article clustering with realistic fake articles
- **`test_neutral_summaries.py`** - Tests neutral summary generation from clustered articles
- **`test_complete_workflow.py`** - End-to-end test of clustering + neutral summaries
### Core Crawler Tests
- **`test_crawler.py`** - Basic crawler functionality
- **`test_ollama.py`** - Ollama AI integration tests
- **`test_rss_feeds.py`** - RSS feed parsing tests
## Running Tests
### Run All Tests
```bash
# From project root
docker-compose exec crawler python -m pytest tests/crawler/
```
### Run Specific Test
```bash
# AI clustering test
docker-compose exec crawler python tests/crawler/test_clustering_real.py
# Neutral summaries test
docker-compose exec crawler python tests/crawler/test_neutral_summaries.py
# Complete workflow test
docker-compose exec crawler python tests/crawler/test_complete_workflow.py
```
### Run Tests Inside Container
```bash
# Enter container
docker-compose exec crawler bash
# Run tests (use the same tests/crawler/ paths as above)
python tests/crawler/test_clustering_real.py
python tests/crawler/test_neutral_summaries.py
python tests/crawler/test_complete_workflow.py
```
## Test Data
Tests use fake articles to avoid depending on external RSS feeds:
**Test Scenarios:**
1. **Same story, different sources** - Should cluster together
2. **Different stories** - Should remain separate
3. **Multi-source clustering** - Should generate neutral summaries
**Expected Results:**
- Housing story (2 sources) → Cluster together → Neutral summary
- Bayern transfer (2 sources) → Cluster together → Neutral summary
- Single-source stories → Individual summaries
## Cleanup
Tests create temporary data in MongoDB. To clean up:
```bash
# Clean test articles
docker-compose exec crawler python << 'EOF'
from pymongo import MongoClient
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
db.cluster_summaries.delete_many({})
print("✓ Test data cleaned")
EOF
```
## Requirements
- Docker containers must be running
- Ollama service must be available
- MongoDB must be accessible
- AI model (phi3:latest) must be downloaded
## Troubleshooting
### Ollama Not Available
```bash
# Check Ollama status
docker-compose logs ollama
# Restart Ollama
docker-compose restart ollama
```
### Tests Timing Out
- Increase timeout in test files (default: 60s)
- Check Ollama model is downloaded
- Verify GPU acceleration if enabled
### MongoDB Connection Issues
```bash
# Check MongoDB status
docker-compose logs mongodb
# Restart MongoDB
docker-compose restart mongodb
```

View File

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""
Test AI clustering with realistic fake articles.

Inserts four hand-written articles — two stories, each covered by two
different sources — into MongoDB, runs them through the AI-powered
clusterer, and prints whether articles about the same story ended up in
the same cluster.

Requires running MongoDB and Ollama containers, and the crawler modules
importable from /app.
"""
from pymongo import MongoClient
from datetime import datetime
import sys

# Connect to MongoDB (credentials match the docker-compose defaults).
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

# Create test articles about the same Munich story from different sources.
# Story 1 (housing) and story 2 (Bayern transfer) are each reported by two
# outlets, so the clusterer is expected to pair them up.
test_articles = [
    {
        "title": "München: Stadtrat beschließt neue Regelungen für Wohnungsbau",
        "content": """Der Münchner Stadtrat hat am Dienstag neue Regelungen für den Wohnungsbau beschlossen.
Die Maßnahmen sollen den Bau von bezahlbarem Wohnraum in der bayerischen Landeshauptstadt fördern.
Oberbürgermeister Dieter Reiter (SPD) sprach von einem wichtigen Schritt zur Lösung der Wohnungskrise.
Die neuen Regelungen sehen vor, dass bei Neubauprojekten mindestens 40 Prozent der Wohnungen
als Sozialwohnungen gebaut werden müssen. Zudem werden Bauvorschriften vereinfacht.""",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/az-wohnungsbau-1",
        "published_at": datetime.utcnow(),
        "category": "local",
        "word_count": 85
    },
    {
        "title": "Stadtrat München stimmt für neue Wohnungsbau-Verordnung",
        "content": """In einer Sitzung am Dienstag stimmte der Münchner Stadtrat für neue Wohnungsbau-Verordnungen.
Die Beschlüsse zielen darauf ab, mehr bezahlbaren Wohnraum in München zu schaffen.
OB Reiter bezeichnete die Entscheidung als Meilenstein im Kampf gegen die Wohnungsnot.
Künftig müssen 40 Prozent aller Neubauwohnungen als Sozialwohnungen errichtet werden.
Außerdem werden bürokratische Hürden beim Bauen abgebaut.""",
        "source": "sueddeutsche",
        "link": "https://example.com/sz-wohnungsbau-1",
        "published_at": datetime.utcnow(),
        "category": "local",
        "word_count": 72
    },
    {
        "title": "FC Bayern München verpflichtet neuen Stürmer aus Brasilien",
        "content": """Der FC Bayern München hat einen neuen Stürmer verpflichtet. Der 23-jährige Brasilianer
wechselt für eine Ablösesumme von 50 Millionen Euro nach München. Sportdirektor Christoph Freund
zeigte sich begeistert von der Verpflichtung. Der Spieler soll die Offensive verstärken.""",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/az-bayern-1",
        "published_at": datetime.utcnow(),
        "category": "sports",
        "word_count": 52
    },
    {
        "title": "Bayern München holt brasilianischen Angreifer",
        "content": """Der deutsche Rekordmeister Bayern München hat einen brasilianischen Stürmer unter Vertrag genommen.
Für 50 Millionen Euro wechselt der 23-Jährige an die Isar. Sportdirektor Freund lobte den Transfer.
Der Neuzugang soll die Münchner Offensive beleben und für mehr Torgefahr sorgen.""",
        "source": "sueddeutsche",
        "link": "https://example.com/sz-bayern-1",
        "published_at": datetime.utcnow(),
        "category": "sports",
        "word_count": 48
    }
]

print("Testing AI Clustering with Realistic Articles")
print("=" * 70)
print()

# Clear previous test articles so reruns start from a clean slate.
print("Cleaning up previous test articles...")
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
print("✓ Cleaned up")
print()

# Import clustering modules from the crawler application.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from config import Config

# Initialize the AI client and the clusterer under test.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=30
)
clusterer = ArticleClusterer(
    ollama_client=ollama_client,
    similarity_threshold=0.50,
    time_window_hours=24
)

print("Processing articles with AI clustering...")
print()

clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f"{i}. Processing: {article['title'][:60]}...")
    print(f" Source: {article['source']}")
    # Cluster with previously processed articles
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)
    print(f" → Cluster ID: {clustered['cluster_id']}")
    print(f" → Is Primary: {clustered['is_primary']}")
    # Insert into database
    db.articles.insert_one(clustered)
    print(f" ✓ Saved to database")
    print()

print("=" * 70)
print("Clustering Results:")
print()

# Group processed articles by the cluster id the clusterer assigned.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

for cluster_id, articles in clusters.items():
    print(f"Cluster {cluster_id}: {len(articles)} article(s)")
    for article in articles:
        print(f" - [{article['source']}] {article['title'][:60]}...")
    print()

# Expected results
print("=" * 70)
print("Expected Results:")
print(" ✓ Articles 1&2 should be in same cluster (housing story)")
print(" ✓ Articles 3&4 should be in same cluster (Bayern transfer)")
print(" ✓ Total: 2 clusters with 2 articles each")
print()

# Actual results.
# BUG FIX: the housing filter previously tested 'Wohnungsbau' twice
# ("or 'Wohnungsbau' in a['title']"); mirror the Bayern filter below by
# matching on either distinguishing keyword of the story.
housing_cluster = [a for a in clustered_articles if 'Wohnungsbau' in a['title'] or 'Stadtrat' in a['title']]
bayern_cluster = [a for a in clustered_articles if 'Bayern' in a['title'] or 'Stürmer' in a['title']]
housing_cluster_ids = set(a['cluster_id'] for a in housing_cluster)
bayern_cluster_ids = set(a['cluster_id'] for a in bayern_cluster)

print("Actual Results:")
if len(housing_cluster_ids) == 1:
    print(" ✓ Housing articles clustered together")
else:
    print(f" ✗ Housing articles in {len(housing_cluster_ids)} different clusters")
if len(bayern_cluster_ids) == 1:
    print(" ✓ Bayern articles clustered together")
else:
    print(f" ✗ Bayern articles in {len(bayern_cluster_ids)} different clusters")
if len(clusters) == 2:
    print(" ✓ Total clusters: 2 (correct)")
else:
    print(f" ✗ Total clusters: {len(clusters)} (expected 2)")
print()
print("=" * 70)
print("✓ Test complete! Check the results above.")

View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Complete workflow test: Clustering + Neutral Summaries.

End-to-end check of the pipeline: insert fake articles (2 stories x 2
sources), cluster them with the AI clusterer, generate a neutral summary
per multi-source cluster, and store everything in MongoDB.

Requires running MongoDB and Ollama containers, and the crawler modules
importable from /app.
"""
from pymongo import MongoClient
from datetime import datetime
import sys

# Connect to MongoDB (credentials match the docker-compose defaults).
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

print("=" * 70)
print("COMPLETE WORKFLOW TEST: AI Clustering + Neutral Summaries")
print("=" * 70)
print()

# Clean up previous test data so reruns start from a clean slate.
print("1. Cleaning up previous test data...")
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
# NOTE(review): assumes generated cluster ids are prefixed "test_" — verify
# against ArticleClusterer; otherwise stale summaries survive cleanup.
db.cluster_summaries.delete_many({"cluster_id": {"$regex": "^test_"}})
print(" ✓ Cleaned up")
print()

# Import pipeline modules from the crawler application.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import ClusterSummarizer
from config import Config

# Initialize the AI client, clusterer, and summarizer under test.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60
)
clusterer = ArticleClusterer(ollama_client, similarity_threshold=0.50, time_window_hours=24)
summarizer = ClusterSummarizer(ollama_client, max_words=200)

# Test articles - 2 stories, 2 sources each
test_articles = [
    # Story 1: Munich Housing (2 sources)
    {
        "title": "München: Stadtrat beschließt neue Wohnungsbau-Regelungen",
        "content": "Der Münchner Stadtrat hat neue Regelungen für bezahlbaren Wohnungsbau beschlossen. 40% Sozialwohnungen werden Pflicht.",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/test-housing-az",
        "published_at": datetime.utcnow(),
        "category": "local"
    },
    {
        "title": "Stadtrat München: Neue Verordnung für Wohnungsbau",
        "content": "München führt neue Wohnungsbau-Verordnung ein. Mindestens 40% der Neubauten müssen Sozialwohnungen sein.",
        "source": "sueddeutsche",
        "link": "https://example.com/test-housing-sz",
        "published_at": datetime.utcnow(),
        "category": "local"
    },
    # Story 2: Bayern Transfer (2 sources)
    {
        "title": "FC Bayern verpflichtet brasilianischen Stürmer für 50 Millionen",
        "content": "Bayern München holt einen 23-jährigen Brasilianer. Sportdirektor Freund ist begeistert.",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/test-bayern-az",
        "published_at": datetime.utcnow(),
        "category": "sports"
    },
    {
        "title": "Bayern München: Neuzugang aus Brasilien für 50 Mio. Euro",
        "content": "Der Rekordmeister verstärkt die Offensive mit einem brasilianischen Angreifer. Freund lobt den Transfer.",
        "source": "sueddeutsche",
        "link": "https://example.com/test-bayern-sz",
        "published_at": datetime.utcnow(),
        "category": "sports"
    }
]

print("2. Processing articles with AI clustering...")
print()

clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f" Article {i}: {article['title'][:50]}...")
    print(f" Source: {article['source']}")
    # Cluster against the articles processed so far.
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)
    print(f" → Cluster: {clustered['cluster_id']}")
    print(f" → Primary: {clustered['is_primary']}")
    # Save to DB
    db.articles.insert_one(clustered)
    print(f" ✓ Saved")
    print()

print("=" * 70)
print("3. Clustering Results:")
print()

# Group processed articles by assigned cluster id.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

print(f" Total clusters: {len(clusters)}")
print()
for cid, articles in clusters.items():
    print(f" Cluster {cid}:")
    print(f" - Articles: {len(articles)}")
    for article in articles:
        print(f" • [{article['source']}] {article['title'][:45]}...")
    print()

# Check expectations
if len(clusters) == 2:
    print(" ✓ Expected 2 clusters (housing + bayern)")
else:
    print(f" ⚠ Expected 2 clusters, got {len(clusters)}")
print()

print("=" * 70)
print("4. Generating neutral summaries...")
print()

summary_count = 0
for cid, articles in clusters.items():
    # A neutral summary only makes sense for multi-source clusters.
    if len(articles) < 2:
        print(f" Skipping cluster {cid} (only 1 article)")
        continue
    print(f" Cluster {cid}: {len(articles)} articles")
    result = summarizer.generate_neutral_summary(articles)
    if result['success']:
        print(f" ✓ Generated summary ({result['duration']:.1f}s)")
        # Save
        db.cluster_summaries.insert_one({
            "cluster_id": cid,
            "neutral_summary": result['neutral_summary'],
            "sources": result['sources'],
            "article_count": result['article_count'],
            "created_at": datetime.utcnow()
        })
        summary_count += 1
        # Show preview
        preview = result['neutral_summary'][:100] + "..."
        print(f" Preview: {preview}")
    else:
        print(f" ✗ Failed: {result['error']}")
    print()

print("=" * 70)
print("5. Final Results:")
print()

# FIX: dropped an unused count_documents({}) query on cluster_summaries —
# its result was never read.
test_article_count = db.articles.count_documents({"link": {"$regex": "^https://example.com/test-"}})
print(f" Articles saved: {test_article_count}")
print(f" Clusters created: {len(clusters)}")
print(f" Neutral summaries: {summary_count}")
print()

if len(clusters) == 2 and summary_count == 2:
    print(" ✅ SUCCESS! Complete workflow working perfectly!")
    print()
    print(" The system now:")
    print(" 1. ✓ Clusters articles from different sources")
    print(" 2. ✓ Generates neutral summaries combining perspectives")
    print(" 3. ✓ Stores everything in MongoDB")
else:
    print(" ⚠ Partial success - check results above")
print()
print("=" * 70)

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Test neutral summary generation from clustered articles.

Reads the test articles previously inserted by test_clustering_real.py,
finds clusters that contain more than one article, generates a neutral
summary for each, prints it, and upserts it into the cluster_summaries
collection.

Requires running MongoDB and Ollama containers, and the crawler modules
importable from /app.
"""
import sys
import textwrap
from datetime import datetime

from pymongo import MongoClient

# Connect to MongoDB (credentials match the docker-compose defaults).
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

print("Testing Neutral Summary Generation")
print("=" * 70)
print()

# Check for test articles inserted by the clustering test.
test_articles = list(db.articles.find(
    {"link": {"$regex": "^https://example.com/"}}
).sort("_id", 1))

if len(test_articles) == 0:
    # FIX: the message previously named "test-clustering-real.py", which
    # does not exist — the actual script is test_clustering_real.py.
    print("⚠ No test articles found. Run test_clustering_real.py first.")
    sys.exit(1)

print(f"Found {len(test_articles)} test articles")
print()

# Group articles by cluster id and keep only multi-article clusters —
# a neutral summary needs at least two perspectives.
clusters = {}
for article in test_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

multi_article_clusters = {k: v for k, v in clusters.items() if len(v) > 1}
if len(multi_article_clusters) == 0:
    print("⚠ No clusters with multiple articles found")
    sys.exit(1)

print(f"Found {len(multi_article_clusters)} cluster(s) with multiple articles")
print()

# Import the cluster summarizer from the crawler application.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from cluster_summarizer import ClusterSummarizer
from config import Config

# Initialize the AI client and the summarizer under test.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60
)
summarizer = ClusterSummarizer(ollama_client, max_words=200)

print("Generating neutral summaries...")
print("=" * 70)
print()

for cluster_id, articles in multi_article_clusters.items():
    print(f"Cluster: {cluster_id}")
    print(f"Articles: {len(articles)}")
    print()
    # Show individual articles
    for i, article in enumerate(articles, 1):
        print(f" {i}. [{article['source']}] {article['title'][:60]}...")
    print()
    # Generate neutral summary
    print(" Generating neutral summary...")
    result = summarizer.generate_neutral_summary(articles)
    if result['success']:
        print(f" ✓ Success ({result['duration']:.1f}s)")
        print()
        print(" Neutral Summary:")
        print(" " + "-" * 66)
        # FIX: replaced a hand-rolled word-wrap loop with stdlib
        # textwrap.fill (same 68-column budget, same left indent).
        summary = result['neutral_summary']
        print(textwrap.fill(summary, width=68, initial_indent=" ", subsequent_indent=" "))
        print(" " + "-" * 66)
        print()
        # Upsert so reruns refresh the stored summary for this cluster.
        db.cluster_summaries.update_one(
            {"cluster_id": cluster_id},
            {
                "$set": {
                    "cluster_id": cluster_id,
                    "neutral_summary": result['neutral_summary'],
                    "sources": result['sources'],
                    "article_count": result['article_count'],
                    "created_at": datetime.utcnow(),
                    "updated_at": datetime.utcnow()
                }
            },
            upsert=True
        )
        print(" ✓ Saved to cluster_summaries collection")
    else:
        print(f" ✗ Failed: {result['error']}")
    print()

print("=" * 70)
print()
print("Testing complete!")
print()

# Show summary statistics
total_cluster_summaries = db.cluster_summaries.count_documents({})
print(f"Total cluster summaries in database: {total_cluster_summaries}")