update
This commit is contained in:
187
tests/crawler/test_complete_workflow.py
Normal file
187
tests/crawler/test_complete_workflow.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
"""
Complete workflow test: Clustering + Neutral Summaries

Runs as a plain script against a live MongoDB: it clusters a handful of
synthetic articles, generates one neutral summary per multi-article cluster,
stores everything in the ``munich_news`` database and prints a report.

The MongoDB connection string can be overridden with the ``MONGODB_URI``
environment variable; when it is unset the original hard-coded URI is used.
"""
import os
import sys
from datetime import datetime

from pymongo import MongoClient

# Keep the previous literal as the default so existing setups keep working,
# but allow the host/credentials to be supplied from the environment.
MONGODB_URI = os.environ.get(
    "MONGODB_URI", "mongodb://admin:changeme@mongodb:27017/"
)

client = MongoClient(MONGODB_URI)
db = client["munich_news"]

print("=" * 70)
print("COMPLETE WORKFLOW TEST: AI Clustering + Neutral Summaries")
print("=" * 70)
print()

# Clean up previous test
print("1. Cleaning up previous test data...")

# The dot is escaped so only the literal host "example.com" matches; an
# unescaped "." would match any character in that position.
test_link_filter = {"link": {"$regex": r"^https://example\.com/"}}

# Remember which clusters the earlier test articles belonged to *before*
# deleting them, so summaries stored under those cluster IDs can be removed
# as well. NOTE(review): summaries are saved under the clusterer-generated
# cluster_id; whether those IDs ever start with "test_" is not visible here.
stale_cluster_ids = [
    cluster_id
    for cluster_id in db.articles.distinct("cluster_id", test_link_filter)
    if cluster_id is not None
]

db.articles.delete_many(test_link_filter)
db.cluster_summaries.delete_many({"cluster_id": {"$regex": "^test_"}})
if stale_cluster_ids:
    db.cluster_summaries.delete_many({"cluster_id": {"$in": stale_cluster_ids}})

print(" ✓ Cleaned up")
print()

# Import modules
# '/app' is put at the front of sys.path first so the project modules below
# can be resolved from there; the imports therefore cannot move to the top.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient  # noqa: E402
from article_clustering import ArticleClusterer  # noqa: E402
from cluster_summarizer import ClusterSummarizer  # noqa: E402
from config import Config  # noqa: E402

# Initialize
# One Ollama client is shared by the clusterer and the summarizer; connection
# settings come from the project Config, only the timeout is fixed here.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60,
)

clusterer = ArticleClusterer(
    ollama_client,
    similarity_threshold=0.50,
    time_window_hours=24,
)
summarizer = ClusterSummarizer(ollama_client, max_words=200)

def _test_article(title, content, source, link, category):
    """Build one synthetic article dict, stamped with the current UTC time.

    Key order matches the original literals:
    title, content, source, link, published_at, category.
    """
    # NOTE(review): datetime.utcnow() returns a naive datetime and is
    # deprecated since Python 3.12; switching to an aware timestamp needs
    # confirming how the clusterer compares published_at values.
    return {
        "title": title,
        "content": content,
        "source": source,
        "link": link,
        "published_at": datetime.utcnow(),
        "category": category,
    }


# Test articles - 2 stories, 2 sources each
test_articles = [
    # Story 1: Munich Housing (2 sources)
    _test_article(
        title="München: Stadtrat beschließt neue Wohnungsbau-Regelungen",
        content="Der Münchner Stadtrat hat neue Regelungen für bezahlbaren Wohnungsbau beschlossen. 40% Sozialwohnungen werden Pflicht.",
        source="abendzeitung-muenchen",
        link="https://example.com/test-housing-az",
        category="local",
    ),
    _test_article(
        title="Stadtrat München: Neue Verordnung für Wohnungsbau",
        content="München führt neue Wohnungsbau-Verordnung ein. Mindestens 40% der Neubauten müssen Sozialwohnungen sein.",
        source="sueddeutsche",
        link="https://example.com/test-housing-sz",
        category="local",
    ),
    # Story 2: Bayern Transfer (2 sources)
    _test_article(
        title="FC Bayern verpflichtet brasilianischen Stürmer für 50 Millionen",
        content="Bayern München holt einen 23-jährigen Brasilianer. Sportdirektor Freund ist begeistert.",
        source="abendzeitung-muenchen",
        link="https://example.com/test-bayern-az",
        category="sports",
    ),
    _test_article(
        title="Bayern München: Neuzugang aus Brasilien für 50 Mio. Euro",
        content="Der Rekordmeister verstärkt die Offensive mit einem brasilianischen Angreifer. Freund lobt den Transfer.",
        source="sueddeutsche",
        link="https://example.com/test-bayern-sz",
        category="sports",
    ),
]

print("2. Processing articles with AI clustering...")
print()

# Articles are clustered one at a time; each call sees the articles already
# processed in this run, and its result is appended for the next call.
clustered_articles = []
for position, article in enumerate(test_articles, 1):
    print(f" Article {position}: {article['title'][:50]}...")
    print(f" Source: {article['source']}")

    # Cluster
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)

    print(f" → Cluster: {clustered['cluster_id']}")
    print(f" → Primary: {clustered['is_primary']}")

    # Save to DB
    db.articles.insert_one(clustered)
    print(" ✓ Saved")
    print()

print("=" * 70)
print("3. Clustering Results:")
print()

# Analyze clusters
# Group the processed articles by cluster_id, keeping first-seen order.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

print(f" Total clusters: {len(clusters)}")
print()

for cid, articles in clusters.items():
    print(f" Cluster {cid}:")
    print(f" - Articles: {len(articles)}")
    for article in articles:
        print(f" • [{article['source']}] {article['title'][:45]}...")
    print()

# Check expectations
# The fixture holds two stories, so exactly two clusters are expected.
if len(clusters) == 2:
    print(" ✓ Expected 2 clusters (housing + bayern)")
else:
    print(f" ⚠ Expected 2 clusters, got {len(clusters)}")

print()

print("=" * 70)
print("4. Generating neutral summaries...")
print()

# Number of summaries successfully generated and stored in this run.
summary_count = 0
for cid, articles in clusters.items():
    # A summary is only generated for clusters with at least two articles.
    if len(articles) < 2:
        print(f" Skipping cluster {cid} (only 1 article)")
        continue

    print(f" Cluster {cid}: {len(articles)} articles")

    result = summarizer.generate_neutral_summary(articles)

    if result['success']:
        print(f" ✓ Generated summary ({result['duration']:.1f}s)")

        # Save
        db.cluster_summaries.insert_one({
            "cluster_id": cid,
            "neutral_summary": result['neutral_summary'],
            "sources": result['sources'],
            "article_count": result['article_count'],
            "created_at": datetime.utcnow(),
        })
        summary_count += 1

        # Show preview
        preview = result['neutral_summary'][:100] + "..."
        print(f" Preview: {preview}")
    else:
        print(f" ✗ Failed: {result['error']}")

    print()

print("=" * 70)
print("5. Final Results:")
print()

# Count what actually landed in MongoDB for this run. The dot in the host is
# escaped so only literal "example.com" links are counted.
test_article_count = db.articles.count_documents(
    {"link": {"$regex": r"^https://example\.com/test-"}}
)
# Restrict the summary count to this run's clusters; an empty filter would
# also count every unrelated summary already stored in the collection.
test_summary_count = db.cluster_summaries.count_documents(
    {"cluster_id": {"$in": list(clusters)}}
)

print(f" Articles saved: {test_article_count}")
print(f" Clusters created: {len(clusters)}")
print(f" Neutral summaries: {summary_count}")
print(f" Summaries in DB: {test_summary_count}")
print()

if len(clusters) == 2 and summary_count == 2:
    print(" ✅ SUCCESS! Complete workflow working perfectly!")
    print()
    print(" The system now:")
    print(" 1. ✓ Clusters articles from different sources")
    print(" 2. ✓ Generates neutral summaries combining perspectives")
    print(" 3. ✓ Stores everything in MongoDB")
else:
    print(" ⚠ Partial success - check results above")

print()
print("=" * 70)
Reference in New Issue
Block a user