Files
Munich-news/tests/crawler/test_complete_workflow.py
2025-11-12 11:34:33 +01:00

188 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Complete workflow test: Clustering + Neutral Summaries
"""
import os
import sys
from datetime import datetime

from pymongo import MongoClient
# Connect to MongoDB. The URI can be overridden via the MONGODB_URI
# environment variable for local runs; the default matches the
# docker-compose service credentials used by the crawler stack.
MONGO_URI = os.environ.get("MONGODB_URI", "mongodb://admin:changeme@mongodb:27017/")
client = MongoClient(MONGO_URI)
db = client["munich_news"]

print("=" * 70)
print("COMPLETE WORKFLOW TEST: AI Clustering + Neutral Summaries")
print("=" * 70)
print()

# 1. Remove artifacts from any previous run so the counts reported at the
# end reflect only this execution.
print("1. Cleaning up previous test data...")
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
db.cluster_summaries.delete_many({"cluster_id": {"$regex": "^test_"}})
print(" ✓ Cleaned up")
print()
# Import modules
# Make the crawler package importable inside the container.
sys.path.insert(0, '/app')

from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import ClusterSummarizer
from config import Config

# Wire up the pipeline: one shared Ollama client feeds both the clusterer
# and the summarizer. Thresholds mirror the production defaults.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60,
)
clusterer = ArticleClusterer(
    ollama_client,
    similarity_threshold=0.50,
    time_window_hours=24,
)
summarizer = ClusterSummarizer(ollama_client, max_words=200)
# Test articles - 2 stories, 2 sources each
# Fixture: four articles forming two distinct stories, each reported by two
# different sources, so the clusterer should produce exactly two clusters.
# All links share the https://example.com/ prefix so the cleanup and count
# queries elsewhere in this script can target them; published_at is "now"
# to keep every article inside the clusterer's 24h time window.
# NOTE(review): datetime.utcnow() returns a naive timestamp and is
# deprecated since Python 3.12 — consider datetime.now(timezone.utc).
test_articles = [
# Story 1: Munich Housing (2 sources)
{
"title": "München: Stadtrat beschließt neue Wohnungsbau-Regelungen",
"content": "Der Münchner Stadtrat hat neue Regelungen für bezahlbaren Wohnungsbau beschlossen. 40% Sozialwohnungen werden Pflicht.",
"source": "abendzeitung-muenchen",
"link": "https://example.com/test-housing-az",
"published_at": datetime.utcnow(),
"category": "local"
},
{
"title": "Stadtrat München: Neue Verordnung für Wohnungsbau",
"content": "München führt neue Wohnungsbau-Verordnung ein. Mindestens 40% der Neubauten müssen Sozialwohnungen sein.",
"source": "sueddeutsche",
"link": "https://example.com/test-housing-sz",
"published_at": datetime.utcnow(),
"category": "local"
},
# Story 2: Bayern Transfer (2 sources)
{
"title": "FC Bayern verpflichtet brasilianischen Stürmer für 50 Millionen",
"content": "Bayern München holt einen 23-jährigen Brasilianer. Sportdirektor Freund ist begeistert.",
"source": "abendzeitung-muenchen",
"link": "https://example.com/test-bayern-az",
"published_at": datetime.utcnow(),
"category": "sports"
},
{
"title": "Bayern München: Neuzugang aus Brasilien für 50 Mio. Euro",
"content": "Der Rekordmeister verstärkt die Offensive mit einem brasilianischen Angreifer. Freund lobt den Transfer.",
"source": "sueddeutsche",
"link": "https://example.com/test-bayern-sz",
"published_at": datetime.utcnow(),
"category": "sports"
}
]
# 2. Feed each fixture article through the clusterer. Each article is
# compared against everything clustered so far, tagged with a cluster_id
# and an is_primary flag, then persisted.
# (Loop-body indentation restored — the pasted source had the body at
# column 0, which is a syntax error.)
print("2. Processing articles with AI clustering...")
print()
clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f" Article {i}: {article['title'][:50]}...")
    print(f" Source: {article['source']}")
    # Cluster against the articles already processed in this run.
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)
    print(f" → Cluster: {clustered['cluster_id']}")
    print(f" → Primary: {clustered['is_primary']}")
    # Persist the enriched article so the final counts can verify storage.
    db.articles.insert_one(clustered)
    print(" ✓ Saved")
    print()
print("=" * 70)
print("3. Clustering Results:")
print()
# 3. Group the processed articles by their assigned cluster id.
# setdefault replaces the manual "if cid not in clusters" dance.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)
print(f" Total clusters: {len(clusters)}")
print()
for cid, articles in clusters.items():
    print(f" Cluster {cid}:")
    print(f" - Articles: {len(articles)}")
    for article in articles:
        print(f" • [{article['source']}] {article['title'][:45]}...")
    print()
# Sanity-check: the four fixture articles describe exactly two stories.
if len(clusters) == 2:
    print(" ✓ Expected 2 clusters (housing + bayern)")
else:
    print(f" ⚠ Expected 2 clusters, got {len(clusters)}")
print()
print("=" * 70)
print("4. Generating neutral summaries...")
print()
# 4. Produce one neutral summary per multi-source cluster and store it.
summary_count = 0
for cid, articles in clusters.items():
    if len(articles) >= 2:
        print(f" Cluster {cid}: {len(articles)} articles")
        result = summarizer.generate_neutral_summary(articles)
        if result['success']:
            print(f" ✓ Generated summary ({result['duration']:.1f}s)")
            # Persist the summary alongside its provenance.
            summary_doc = {
                "cluster_id": cid,
                "neutral_summary": result['neutral_summary'],
                "sources": result['sources'],
                "article_count": result['article_count'],
                "created_at": datetime.utcnow(),
            }
            db.cluster_summaries.insert_one(summary_doc)
            summary_count += 1
            # Show the first 100 characters as a preview.
            preview = result['neutral_summary'][:100] + "..."
            print(f" Preview: {preview}")
        else:
            print(f" ✗ Failed: {result['error']}")
    else:
        # Single-source clusters have nothing to neutralize.
        print(f" Skipping cluster {cid} (only 1 article)")
print()
print("=" * 70)
print("5. Final Results:")
print()
# 5. Verify persistence. Count only documents this run inserted — every
# fixture link shares the https://example.com/test- prefix.
# (The previous unconditional count_documents({}) on cluster_summaries was
# removed: it counted ALL summaries, not just this test's, and its result
# was never used.)
test_article_count = db.articles.count_documents({"link": {"$regex": "^https://example.com/test-"}})
print(f" Articles saved: {test_article_count}")
print(f" Clusters created: {len(clusters)}")
print(f" Neutral summaries: {summary_count}")
print()
# Success = the 4 fixture articles collapsed into 2 clusters and each
# cluster yielded one neutral summary.
if len(clusters) == 2 and summary_count == 2:
    print(" ✅ SUCCESS! Complete workflow working perfectly!")
    print()
    print(" The system now:")
    print(" 1. ✓ Clusters articles from different sources")
    print(" 2. ✓ Generates neutral summaries combining perspectives")
    print(" 3. ✓ Stores everything in MongoDB")
else:
    print(" ⚠ Partial success - check results above")
print()
print("=" * 70)