Files
Munich-news/tests/crawler/test_complete_workflow.py
2025-11-12 11:34:33 +01:00

188 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Complete workflow test: Clustering + Neutral Summaries
"""
import os
import sys
from datetime import datetime

from pymongo import MongoClient
# Connect to MongoDB. The URI can be overridden via the MONGODB_URI
# environment variable for local runs; the default matches the
# docker-compose service credentials used by the crawler stack.
MONGO_URI = os.environ.get("MONGODB_URI", "mongodb://admin:changeme@mongodb:27017/")
client = MongoClient(MONGO_URI)
db = client["munich_news"]

print("=" * 70)
print("COMPLETE WORKFLOW TEST: AI Clustering + Neutral Summaries")
print("=" * 70)
print()

# 1. Remove artifacts from any previous run so the counts reported at the
# end reflect only this execution.
print("1. Cleaning up previous test data...")
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
db.cluster_summaries.delete_many({"cluster_id": {"$regex": "^test_"}})
print(" ✓ Cleaned up")
print()
# Import modules
# Make the crawler package importable inside the container.
sys.path.insert(0, '/app')

from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import ClusterSummarizer
from config import Config

# Wire up the pipeline: one shared Ollama client feeds both the clusterer
# and the summarizer. Thresholds mirror the production defaults.
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60,
)
clusterer = ArticleClusterer(
    ollama_client,
    similarity_threshold=0.50,
    time_window_hours=24,
)
summarizer = ClusterSummarizer(ollama_client, max_words=200)
# Test articles - 2 stories, 2 sources each
# Fixture: four articles forming two distinct stories, each reported by two
# different sources, so the clusterer should produce exactly two clusters.
# All links share the https://example.com/ prefix so the cleanup and count
# queries elsewhere in this script can target them; published_at is "now"
# to keep every article inside the clusterer's 24h time window.
# NOTE(review): datetime.utcnow() returns a naive timestamp and is
# deprecated since Python 3.12 — consider datetime.now(timezone.utc).
test_articles = [
# Story 1: Munich Housing (2 sources)
{
"title": "München: Stadtrat beschließt neue Wohnungsbau-Regelungen",
"content": "Der Münchner Stadtrat hat neue Regelungen für bezahlbaren Wohnungsbau beschlossen. 40% Sozialwohnungen werden Pflicht.",
"source": "abendzeitung-muenchen",
"link": "https://example.com/test-housing-az",
"published_at": datetime.utcnow(),
"category": "local"
},
{
"title": "Stadtrat München: Neue Verordnung für Wohnungsbau",
"content": "München führt neue Wohnungsbau-Verordnung ein. Mindestens 40% der Neubauten müssen Sozialwohnungen sein.",
"source": "sueddeutsche",
"link": "https://example.com/test-housing-sz",
"published_at": datetime.utcnow(),
"category": "local"
},
# Story 2: Bayern Transfer (2 sources)
{
"title": "FC Bayern verpflichtet brasilianischen Stürmer für 50 Millionen",
"content": "Bayern München holt einen 23-jährigen Brasilianer. Sportdirektor Freund ist begeistert.",
"source": "abendzeitung-muenchen",
"link": "https://example.com/test-bayern-az",
"published_at": datetime.utcnow(),
"category": "sports"
},
{
"title": "Bayern München: Neuzugang aus Brasilien für 50 Mio. Euro",
"content": "Der Rekordmeister verstärkt die Offensive mit einem brasilianischen Angreifer. Freund lobt den Transfer.",
"source": "sueddeutsche",
"link": "https://example.com/test-bayern-sz",
"published_at": datetime.utcnow(),
"category": "sports"
}
]
# 2. Feed each fixture article through the clusterer. Each article is
# compared against everything clustered so far, tagged with a cluster_id
# and an is_primary flag, then persisted.
# (Loop-body indentation restored — the pasted source had the body at
# column 0, which is a syntax error.)
print("2. Processing articles with AI clustering...")
print()
clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f" Article {i}: {article['title'][:50]}...")
    print(f" Source: {article['source']}")
    # Cluster against the articles already processed in this run.
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)
    print(f" → Cluster: {clustered['cluster_id']}")
    print(f" → Primary: {clustered['is_primary']}")
    # Persist the enriched article so the final counts can verify storage.
    db.articles.insert_one(clustered)
    print(" ✓ Saved")
    print()
print("=" * 70)
print("3. Clustering Results:")
print()
# 3. Group the processed articles by their assigned cluster id.
# setdefault replaces the manual "if cid not in clusters" dance.
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)
print(f" Total clusters: {len(clusters)}")
print()
for cid, articles in clusters.items():
    print(f" Cluster {cid}:")
    print(f" - Articles: {len(articles)}")
    for article in articles:
        print(f" • [{article['source']}] {article['title'][:45]}...")
    print()
# Sanity-check: the four fixture articles describe exactly two stories.
if len(clusters) == 2:
    print(" ✓ Expected 2 clusters (housing + bayern)")
else:
    print(f" ⚠ Expected 2 clusters, got {len(clusters)}")
print()
print("=" * 70)
print("4. Generating neutral summaries...")
print()
# 4. Produce one neutral summary per multi-source cluster and store it.
summary_count = 0
for cid, articles in clusters.items():
    if len(articles) >= 2:
        print(f" Cluster {cid}: {len(articles)} articles")
        result = summarizer.generate_neutral_summary(articles)
        if result['success']:
            print(f" ✓ Generated summary ({result['duration']:.1f}s)")
            # Persist the summary alongside its provenance.
            summary_doc = {
                "cluster_id": cid,
                "neutral_summary": result['neutral_summary'],
                "sources": result['sources'],
                "article_count": result['article_count'],
                "created_at": datetime.utcnow(),
            }
            db.cluster_summaries.insert_one(summary_doc)
            summary_count += 1
            # Show the first 100 characters as a preview.
            preview = result['neutral_summary'][:100] + "..."
            print(f" Preview: {preview}")
        else:
            print(f" ✗ Failed: {result['error']}")
    else:
        # Single-source clusters have nothing to neutralize.
        print(f" Skipping cluster {cid} (only 1 article)")
print()
print("=" * 70)
print("5. Final Results:")
print()
# 5. Verify persistence. Count only documents this run inserted — every
# fixture link shares the https://example.com/test- prefix.
# (The previous unconditional count_documents({}) on cluster_summaries was
# removed: it counted ALL summaries, not just this test's, and its result
# was never used.)
test_article_count = db.articles.count_documents({"link": {"$regex": "^https://example.com/test-"}})
print(f" Articles saved: {test_article_count}")
print(f" Clusters created: {len(clusters)}")
print(f" Neutral summaries: {summary_count}")
print()
# Success = the 4 fixture articles collapsed into 2 clusters and each
# cluster yielded one neutral summary.
if len(clusters) == 2 and summary_count == 2:
    print(" ✅ SUCCESS! Complete workflow working perfectly!")
    print()
    print(" The system now:")
    print(" 1. ✓ Clusters articles from different sources")
    print(" 2. ✓ Generates neutral summaries combining perspectives")
    print(" 3. ✓ Stores everything in MongoDB")
else:
    print(" ⚠ Partial success - check results above")
print()
print("=" * 70)