188 lines
5.8 KiB
Python
188 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Complete workflow test: Clustering + Neutral Summaries
|
|
"""
|
|
from pymongo import MongoClient
|
|
from datetime import datetime
|
|
import sys
|
|
|
|
# Connect to the test MongoDB instance (credentials are fixed in the compose env).
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client.get_database("munich_news")
|
|
|
|
# Header banner for the test run.
rule = "=" * 70
print(rule)
print("COMPLETE WORKFLOW TEST: AI Clustering + Neutral Summaries")
print(rule)
print()
|
|
|
|
# Remove artifacts from any earlier run so the counts below start from zero.
print("1. Cleaning up previous test data...")
_cleanup_targets = (
    (db.articles, {"link": {"$regex": "^https://example.com/"}}),
    (db.cluster_summaries, {"cluster_id": {"$regex": "^test_"}}),
)
for _collection, _query in _cleanup_targets:
    _collection.delete_many(_query)
print(" ✓ Cleaned up")
print()
|
|
|
|
# Import modules
|
|
sys.path.insert(0, '/app')
|
|
from ollama_client import OllamaClient
|
|
from article_clustering import ArticleClusterer
|
|
from cluster_summarizer import ClusterSummarizer
|
|
from config import Config
|
|
|
|
# Wire up the AI pipeline: one shared LLM client feeding both consumers.
ollama_client = OllamaClient(
    timeout=60,  # seconds; generous budget for slow local models
    enabled=Config.OLLAMA_ENABLED,
    model=Config.OLLAMA_MODEL,
    base_url=Config.OLLAMA_BASE_URL,
)

# Articles published within 24h whose similarity clears 0.50 share a cluster.
clusterer = ArticleClusterer(
    ollama_client,
    similarity_threshold=0.50,
    time_window_hours=24,
)

# Neutral summaries are capped at 200 words.
summarizer = ClusterSummarizer(ollama_client, max_words=200)
|
|
|
|
# Test articles - 2 stories, 2 sources each
|
|
test_articles = [
|
|
# Story 1: Munich Housing (2 sources)
|
|
{
|
|
"title": "München: Stadtrat beschließt neue Wohnungsbau-Regelungen",
|
|
"content": "Der Münchner Stadtrat hat neue Regelungen für bezahlbaren Wohnungsbau beschlossen. 40% Sozialwohnungen werden Pflicht.",
|
|
"source": "abendzeitung-muenchen",
|
|
"link": "https://example.com/test-housing-az",
|
|
"published_at": datetime.utcnow(),
|
|
"category": "local"
|
|
},
|
|
{
|
|
"title": "Stadtrat München: Neue Verordnung für Wohnungsbau",
|
|
"content": "München führt neue Wohnungsbau-Verordnung ein. Mindestens 40% der Neubauten müssen Sozialwohnungen sein.",
|
|
"source": "sueddeutsche",
|
|
"link": "https://example.com/test-housing-sz",
|
|
"published_at": datetime.utcnow(),
|
|
"category": "local"
|
|
},
|
|
# Story 2: Bayern Transfer (2 sources)
|
|
{
|
|
"title": "FC Bayern verpflichtet brasilianischen Stürmer für 50 Millionen",
|
|
"content": "Bayern München holt einen 23-jährigen Brasilianer. Sportdirektor Freund ist begeistert.",
|
|
"source": "abendzeitung-muenchen",
|
|
"link": "https://example.com/test-bayern-az",
|
|
"published_at": datetime.utcnow(),
|
|
"category": "sports"
|
|
},
|
|
{
|
|
"title": "Bayern München: Neuzugang aus Brasilien für 50 Mio. Euro",
|
|
"content": "Der Rekordmeister verstärkt die Offensive mit einem brasilianischen Angreifer. Freund lobt den Transfer.",
|
|
"source": "sueddeutsche",
|
|
"link": "https://example.com/test-bayern-sz",
|
|
"published_at": datetime.utcnow(),
|
|
"category": "sports"
|
|
}
|
|
]
|
|
|
|
print("2. Processing articles with AI clustering...")
print()

# Feed articles one at a time; each sees the previously clustered ones so the
# clusterer can attach it to an existing cluster or open a new one.
clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f" Article {i}: {article['title'][:50]}...")
    print(f" Source: {article['source']}")

    # Cluster against everything processed so far.
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)

    print(f" → Cluster: {clustered['cluster_id']}")
    print(f" → Primary: {clustered['is_primary']}")

    # Persist the enriched article document.
    db.articles.insert_one(clustered)
    # Fixed: was an f-string with no placeholders (ruff F541).
    print(" ✓ Saved")
    print()
|
|
|
print("=" * 70)
print("3. Clustering Results:")
print()

# Group the processed articles by cluster_id to inspect what the clusterer
# produced. setdefault replaces the manual "if cid not in clusters" dance.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

print(f" Total clusters: {len(clusters)}")
print()

for cid, articles in clusters.items():
    print(f" Cluster {cid}:")
    print(f" - Articles: {len(articles)}")
    for article in articles:
        print(f" • [{article['source']}] {article['title'][:45]}...")
    print()
|
|
|
|
# Sanity check: the fixture was built to yield exactly two clusters
# (one housing story, one transfer story).
if len(clusters) != 2:
    print(f" ⚠ Expected 2 clusters, got {len(clusters)}")
else:
    print(" ✓ Expected 2 clusters (housing + bayern)")

print()
print("=" * 70)
print("4. Generating neutral summaries...")
print()
|
|
|
# Generate one neutral summary per multi-source cluster and persist it.
summary_count = 0
for cid, articles in clusters.items():
    # A single-article cluster has only one perspective; nothing to neutralize.
    if len(articles) < 2:
        print(f" Skipping cluster {cid} (only 1 article)")
        continue

    print(f" Cluster {cid}: {len(articles)} articles")

    result = summarizer.generate_neutral_summary(articles)

    if result['success']:
        print(f" ✓ Generated summary ({result['duration']:.1f}s)")

        # Persist the summary together with its provenance.
        db.cluster_summaries.insert_one({
            "cluster_id": cid,
            "neutral_summary": result['neutral_summary'],
            "sources": result['sources'],
            "article_count": result['article_count'],
            "created_at": datetime.utcnow()
        })
        summary_count += 1

        # Fixed: only append an ellipsis when the summary was actually
        # truncated; the original unconditionally added "..." even to
        # summaries shorter than 100 characters.
        summary_text = result['neutral_summary']
        preview = summary_text[:100] + ("..." if len(summary_text) > 100 else "")
        print(f" Preview: {preview}")
    else:
        print(f" ✗ Failed: {result['error']}")

    print()
|
|
|
|
print("=" * 70)
print("5. Final Results:")
print()

# Scope BOTH counts to this test's data. The original counted ALL documents
# in cluster_summaries (count_documents({})), so summaries from previous or
# production runs would have been included; the filter now matches the
# cleanup query at the top of the script. The value serves as a DB-side
# cross-check of the in-memory summary_count.
test_article_count = db.articles.count_documents({"link": {"$regex": "^https://example.com/test-"}})
test_summary_count = db.cluster_summaries.count_documents({"cluster_id": {"$regex": "^test_"}})

print(f" Articles saved: {test_article_count}")
print(f" Clusters created: {len(clusters)}")
print(f" Neutral summaries: {summary_count}")
print()

if len(clusters) == 2 and summary_count == 2:
    print(" ✅ SUCCESS! Complete workflow working perfectly!")
    print()
    print(" The system now:")
    print(" 1. ✓ Clusters articles from different sources")
    print(" 2. ✓ Generates neutral summaries combining perspectives")
    print(" 3. ✓ Stores everything in MongoDB")
else:
    print(" ⚠ Partial success - check results above")

print()
print("=" * 70)
|