This commit is contained in:
2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions

110
tests/crawler/README.md Normal file
View File

@@ -0,0 +1,110 @@
# Crawler Tests
Test suite for the news crawler, AI clustering, and neutral summary generation.
## Test Files
### AI Clustering & Aggregation Tests
- **`test_clustering_real.py`** - Tests AI-powered article clustering with realistic fake articles
- **`test_neutral_summaries.py`** - Tests neutral summary generation from clustered articles
- **`test_complete_workflow.py`** - End-to-end test of clustering + neutral summaries
### Core Crawler Tests
- **`test_crawler.py`** - Basic crawler functionality
- **`test_ollama.py`** - Ollama AI integration tests
- **`test_rss_feeds.py`** - RSS feed parsing tests
## Running Tests
### Run All Tests
```bash
# From project root
docker-compose exec crawler python -m pytest tests/crawler/
```
### Run Specific Test
```bash
# AI clustering test
docker-compose exec crawler python tests/crawler/test_clustering_real.py
# Neutral summaries test
docker-compose exec crawler python tests/crawler/test_neutral_summaries.py
# Complete workflow test
docker-compose exec crawler python tests/crawler/test_complete_workflow.py
```
### Run Tests Inside Container
```bash
# Enter container
docker-compose exec crawler bash
# Run tests (use the same tests/crawler/ paths as above)
python tests/crawler/test_clustering_real.py
python tests/crawler/test_neutral_summaries.py
python tests/crawler/test_complete_workflow.py
```
## Test Data
Tests use fake articles to avoid depending on external RSS feeds:
**Test Scenarios:**
1. **Same story, different sources** - Should cluster together
2. **Different stories** - Should remain separate
3. **Multi-source clustering** - Should generate neutral summaries
**Expected Results:**
- Housing story (2 sources) → Cluster together → Neutral summary
- Bayern transfer (2 sources) → Cluster together → Neutral summary
- Single-source stories → Individual summaries
## Cleanup
Tests create temporary data in MongoDB. To clean up:
```bash
# Clean test articles
docker-compose exec crawler python << 'EOF'
from pymongo import MongoClient
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
db.cluster_summaries.delete_many({})
print("✓ Test data cleaned")
EOF
```
## Requirements
- Docker containers must be running
- Ollama service must be available
- MongoDB must be accessible
- AI model (phi3:latest) must be downloaded
## Troubleshooting
### Ollama Not Available
```bash
# Check Ollama status
docker-compose logs ollama
# Restart Ollama
docker-compose restart ollama
```
### Tests Timing Out
- Increase timeout in test files (default: 60s)
- Check Ollama model is downloaded
- Verify GPU acceleration if enabled
### MongoDB Connection Issues
```bash
# Check MongoDB status
docker-compose logs mongodb
# Restart MongoDB
docker-compose restart mongodb
```

View File

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""
Test AI clustering with realistic fake articles.

Inserts four hand-written articles — two stories, each covered by two
different sources — into MongoDB, runs them through the AI-powered
clusterer, and prints whether articles about the same story ended up in
the same cluster.

Requires running MongoDB and Ollama containers, and the crawler modules
importable from /app.
"""
from pymongo import MongoClient
from datetime import datetime
import sys

# Connect to MongoDB (credentials match the docker-compose defaults).
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

# Create test articles about the same Munich story from different sources.
# Story 1 (housing) and story 2 (Bayern transfer) are each reported by two
# outlets, so the clusterer is expected to pair them up.
test_articles = [
    {
        "title": "München: Stadtrat beschließt neue Regelungen für Wohnungsbau",
        "content": """Der Münchner Stadtrat hat am Dienstag neue Regelungen für den Wohnungsbau beschlossen.
Die Maßnahmen sollen den Bau von bezahlbarem Wohnraum in der bayerischen Landeshauptstadt fördern.
Oberbürgermeister Dieter Reiter (SPD) sprach von einem wichtigen Schritt zur Lösung der Wohnungskrise.
Die neuen Regelungen sehen vor, dass bei Neubauprojekten mindestens 40 Prozent der Wohnungen
als Sozialwohnungen gebaut werden müssen. Zudem werden Bauvorschriften vereinfacht.""",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/az-wohnungsbau-1",
        "published_at": datetime.utcnow(),
        "category": "local",
        "word_count": 85
    },
    {
        "title": "Stadtrat München stimmt für neue Wohnungsbau-Verordnung",
        "content": """In einer Sitzung am Dienstag stimmte der Münchner Stadtrat für neue Wohnungsbau-Verordnungen.
Die Beschlüsse zielen darauf ab, mehr bezahlbaren Wohnraum in München zu schaffen.
OB Reiter bezeichnete die Entscheidung als Meilenstein im Kampf gegen die Wohnungsnot.
Künftig müssen 40 Prozent aller Neubauwohnungen als Sozialwohnungen errichtet werden.
Außerdem werden bürokratische Hürden beim Bauen abgebaut.""",
        "source": "sueddeutsche",
        "link": "https://example.com/sz-wohnungsbau-1",
        "published_at": datetime.utcnow(),
        "category": "local",
        "word_count": 72
    },
    {
        "title": "FC Bayern München verpflichtet neuen Stürmer aus Brasilien",
        "content": """Der FC Bayern München hat einen neuen Stürmer verpflichtet. Der 23-jährige Brasilianer
wechselt für eine Ablösesumme von 50 Millionen Euro nach München. Sportdirektor Christoph Freund
zeigte sich begeistert von der Verpflichtung. Der Spieler soll die Offensive verstärken.""",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/az-bayern-1",
        "published_at": datetime.utcnow(),
        "category": "sports",
        "word_count": 52
    },
    {
        "title": "Bayern München holt brasilianischen Angreifer",
        "content": """Der deutsche Rekordmeister Bayern München hat einen brasilianischen Stürmer unter Vertrag genommen.
Für 50 Millionen Euro wechselt der 23-Jährige an die Isar. Sportdirektor Freund lobte den Transfer.
Der Neuzugang soll die Münchner Offensive beleben und für mehr Torgefahr sorgen.""",
        "source": "sueddeutsche",
        "link": "https://example.com/sz-bayern-1",
        "published_at": datetime.utcnow(),
        "category": "sports",
        "word_count": 48
    }
]

print("Testing AI Clustering with Realistic Articles")
print("=" * 70)
print()

# Clear previous test articles so reruns start from a clean slate.
print("Cleaning up previous test articles...")
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
print("✓ Cleaned up")
print()

# Import clustering modules from the crawler application.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from config import Config

# Initialize the AI client and the clusterer under test.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=30
)
clusterer = ArticleClusterer(
    ollama_client=ollama_client,
    similarity_threshold=0.50,
    time_window_hours=24
)

print("Processing articles with AI clustering...")
print()

clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f"{i}. Processing: {article['title'][:60]}...")
    print(f" Source: {article['source']}")
    # Cluster with previously processed articles
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)
    print(f" → Cluster ID: {clustered['cluster_id']}")
    print(f" → Is Primary: {clustered['is_primary']}")
    # Insert into database
    db.articles.insert_one(clustered)
    print(f" ✓ Saved to database")
    print()

print("=" * 70)
print("Clustering Results:")
print()

# Group processed articles by the cluster id the clusterer assigned.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

for cluster_id, articles in clusters.items():
    print(f"Cluster {cluster_id}: {len(articles)} article(s)")
    for article in articles:
        print(f" - [{article['source']}] {article['title'][:60]}...")
    print()

# Expected results
print("=" * 70)
print("Expected Results:")
print(" ✓ Articles 1&2 should be in same cluster (housing story)")
print(" ✓ Articles 3&4 should be in same cluster (Bayern transfer)")
print(" ✓ Total: 2 clusters with 2 articles each")
print()

# Actual results.
# BUG FIX: the housing filter previously tested 'Wohnungsbau' twice
# ("or 'Wohnungsbau' in a['title']"); mirror the Bayern filter below by
# matching on either distinguishing keyword of the story.
housing_cluster = [a for a in clustered_articles if 'Wohnungsbau' in a['title'] or 'Stadtrat' in a['title']]
bayern_cluster = [a for a in clustered_articles if 'Bayern' in a['title'] or 'Stürmer' in a['title']]
housing_cluster_ids = set(a['cluster_id'] for a in housing_cluster)
bayern_cluster_ids = set(a['cluster_id'] for a in bayern_cluster)

print("Actual Results:")
if len(housing_cluster_ids) == 1:
    print(" ✓ Housing articles clustered together")
else:
    print(f" ✗ Housing articles in {len(housing_cluster_ids)} different clusters")
if len(bayern_cluster_ids) == 1:
    print(" ✓ Bayern articles clustered together")
else:
    print(f" ✗ Bayern articles in {len(bayern_cluster_ids)} different clusters")
if len(clusters) == 2:
    print(" ✓ Total clusters: 2 (correct)")
else:
    print(f" ✗ Total clusters: {len(clusters)} (expected 2)")
print()
print("=" * 70)
print("✓ Test complete! Check the results above.")

View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Complete workflow test: Clustering + Neutral Summaries.

End-to-end check of the pipeline: insert fake articles (2 stories x 2
sources), cluster them with the AI clusterer, generate a neutral summary
per multi-source cluster, and store everything in MongoDB.

Requires running MongoDB and Ollama containers, and the crawler modules
importable from /app.
"""
from pymongo import MongoClient
from datetime import datetime
import sys

# Connect to MongoDB (credentials match the docker-compose defaults).
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

print("=" * 70)
print("COMPLETE WORKFLOW TEST: AI Clustering + Neutral Summaries")
print("=" * 70)
print()

# Clean up previous test data so reruns start from a clean slate.
print("1. Cleaning up previous test data...")
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
# NOTE(review): assumes generated cluster ids are prefixed "test_" — verify
# against ArticleClusterer; otherwise stale summaries survive cleanup.
db.cluster_summaries.delete_many({"cluster_id": {"$regex": "^test_"}})
print(" ✓ Cleaned up")
print()

# Import pipeline modules from the crawler application.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import ClusterSummarizer
from config import Config

# Initialize the AI client, clusterer, and summarizer under test.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60
)
clusterer = ArticleClusterer(ollama_client, similarity_threshold=0.50, time_window_hours=24)
summarizer = ClusterSummarizer(ollama_client, max_words=200)

# Test articles - 2 stories, 2 sources each
test_articles = [
    # Story 1: Munich Housing (2 sources)
    {
        "title": "München: Stadtrat beschließt neue Wohnungsbau-Regelungen",
        "content": "Der Münchner Stadtrat hat neue Regelungen für bezahlbaren Wohnungsbau beschlossen. 40% Sozialwohnungen werden Pflicht.",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/test-housing-az",
        "published_at": datetime.utcnow(),
        "category": "local"
    },
    {
        "title": "Stadtrat München: Neue Verordnung für Wohnungsbau",
        "content": "München führt neue Wohnungsbau-Verordnung ein. Mindestens 40% der Neubauten müssen Sozialwohnungen sein.",
        "source": "sueddeutsche",
        "link": "https://example.com/test-housing-sz",
        "published_at": datetime.utcnow(),
        "category": "local"
    },
    # Story 2: Bayern Transfer (2 sources)
    {
        "title": "FC Bayern verpflichtet brasilianischen Stürmer für 50 Millionen",
        "content": "Bayern München holt einen 23-jährigen Brasilianer. Sportdirektor Freund ist begeistert.",
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/test-bayern-az",
        "published_at": datetime.utcnow(),
        "category": "sports"
    },
    {
        "title": "Bayern München: Neuzugang aus Brasilien für 50 Mio. Euro",
        "content": "Der Rekordmeister verstärkt die Offensive mit einem brasilianischen Angreifer. Freund lobt den Transfer.",
        "source": "sueddeutsche",
        "link": "https://example.com/test-bayern-sz",
        "published_at": datetime.utcnow(),
        "category": "sports"
    }
]

print("2. Processing articles with AI clustering...")
print()

clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f" Article {i}: {article['title'][:50]}...")
    print(f" Source: {article['source']}")
    # Cluster against the articles processed so far.
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)
    print(f" → Cluster: {clustered['cluster_id']}")
    print(f" → Primary: {clustered['is_primary']}")
    # Save to DB
    db.articles.insert_one(clustered)
    print(f" ✓ Saved")
    print()

print("=" * 70)
print("3. Clustering Results:")
print()

# Group processed articles by assigned cluster id.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

print(f" Total clusters: {len(clusters)}")
print()
for cid, articles in clusters.items():
    print(f" Cluster {cid}:")
    print(f" - Articles: {len(articles)}")
    for article in articles:
        print(f" • [{article['source']}] {article['title'][:45]}...")
    print()

# Check expectations
if len(clusters) == 2:
    print(" ✓ Expected 2 clusters (housing + bayern)")
else:
    print(f" ⚠ Expected 2 clusters, got {len(clusters)}")
print()

print("=" * 70)
print("4. Generating neutral summaries...")
print()

summary_count = 0
for cid, articles in clusters.items():
    # A neutral summary only makes sense for multi-source clusters.
    if len(articles) < 2:
        print(f" Skipping cluster {cid} (only 1 article)")
        continue
    print(f" Cluster {cid}: {len(articles)} articles")
    result = summarizer.generate_neutral_summary(articles)
    if result['success']:
        print(f" ✓ Generated summary ({result['duration']:.1f}s)")
        # Save
        db.cluster_summaries.insert_one({
            "cluster_id": cid,
            "neutral_summary": result['neutral_summary'],
            "sources": result['sources'],
            "article_count": result['article_count'],
            "created_at": datetime.utcnow()
        })
        summary_count += 1
        # Show preview
        preview = result['neutral_summary'][:100] + "..."
        print(f" Preview: {preview}")
    else:
        print(f" ✗ Failed: {result['error']}")
    print()

print("=" * 70)
print("5. Final Results:")
print()

# FIX: dropped an unused count_documents({}) query on cluster_summaries —
# its result was never read.
test_article_count = db.articles.count_documents({"link": {"$regex": "^https://example.com/test-"}})
print(f" Articles saved: {test_article_count}")
print(f" Clusters created: {len(clusters)}")
print(f" Neutral summaries: {summary_count}")
print()

if len(clusters) == 2 and summary_count == 2:
    print(" ✅ SUCCESS! Complete workflow working perfectly!")
    print()
    print(" The system now:")
    print(" 1. ✓ Clusters articles from different sources")
    print(" 2. ✓ Generates neutral summaries combining perspectives")
    print(" 3. ✓ Stores everything in MongoDB")
else:
    print(" ⚠ Partial success - check results above")
print()
print("=" * 70)

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Test neutral summary generation from clustered articles.

Reads the test articles previously inserted by test_clustering_real.py,
finds clusters that contain more than one article, generates a neutral
summary for each, prints it, and upserts it into the cluster_summaries
collection.

Requires running MongoDB and Ollama containers, and the crawler modules
importable from /app.
"""
import sys
import textwrap
from datetime import datetime

from pymongo import MongoClient

# Connect to MongoDB (credentials match the docker-compose defaults).
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

print("Testing Neutral Summary Generation")
print("=" * 70)
print()

# Check for test articles inserted by the clustering test.
test_articles = list(db.articles.find(
    {"link": {"$regex": "^https://example.com/"}}
).sort("_id", 1))

if len(test_articles) == 0:
    # FIX: the message previously named "test-clustering-real.py", which
    # does not exist — the actual script is test_clustering_real.py.
    print("⚠ No test articles found. Run test_clustering_real.py first.")
    sys.exit(1)

print(f"Found {len(test_articles)} test articles")
print()

# Group articles by cluster id and keep only multi-article clusters —
# a neutral summary needs at least two perspectives.
clusters = {}
for article in test_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

multi_article_clusters = {k: v for k, v in clusters.items() if len(v) > 1}
if len(multi_article_clusters) == 0:
    print("⚠ No clusters with multiple articles found")
    sys.exit(1)

print(f"Found {len(multi_article_clusters)} cluster(s) with multiple articles")
print()

# Import the cluster summarizer from the crawler application.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from cluster_summarizer import ClusterSummarizer
from config import Config

# Initialize the AI client and the summarizer under test.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60
)
summarizer = ClusterSummarizer(ollama_client, max_words=200)

print("Generating neutral summaries...")
print("=" * 70)
print()

for cluster_id, articles in multi_article_clusters.items():
    print(f"Cluster: {cluster_id}")
    print(f"Articles: {len(articles)}")
    print()
    # Show individual articles
    for i, article in enumerate(articles, 1):
        print(f" {i}. [{article['source']}] {article['title'][:60]}...")
    print()
    # Generate neutral summary
    print(" Generating neutral summary...")
    result = summarizer.generate_neutral_summary(articles)
    if result['success']:
        print(f" ✓ Success ({result['duration']:.1f}s)")
        print()
        print(" Neutral Summary:")
        print(" " + "-" * 66)
        # FIX: replaced a hand-rolled word-wrap loop with stdlib
        # textwrap.fill (same 68-column budget, same left indent).
        summary = result['neutral_summary']
        print(textwrap.fill(summary, width=68, initial_indent=" ", subsequent_indent=" "))
        print(" " + "-" * 66)
        print()
        # Upsert so reruns refresh the stored summary for this cluster.
        db.cluster_summaries.update_one(
            {"cluster_id": cluster_id},
            {
                "$set": {
                    "cluster_id": cluster_id,
                    "neutral_summary": result['neutral_summary'],
                    "sources": result['sources'],
                    "article_count": result['article_count'],
                    "created_at": datetime.utcnow(),
                    "updated_at": datetime.utcnow()
                }
            },
            upsert=True
        )
        print(" ✓ Saved to cluster_summaries collection")
    else:
        print(f" ✗ Failed: {result['error']}")
    print()

print("=" * 70)
print()
print("Testing complete!")
print()

# Show summary statistics
total_cluster_summaries = db.cluster_summaries.count_documents({})
print(f"Total cluster summaries in database: {total_cluster_summaries}")