#!/usr/bin/env python3
"""Complete workflow test: AI clustering + neutral summary generation.

End-to-end smoke test that:
  1. wipes leftover test fixtures from MongoDB,
  2. clusters four synthetic articles (two stories, two sources each),
  3. generates one neutral summary per multi-source cluster,
  4. reports whether the expected counts (2 clusters, 2 summaries) were hit.

Requires a reachable MongoDB and Ollama instance; intended to run inside
the app container (project modules live under /app).
"""
from datetime import datetime, timezone
import sys

from pymongo import MongoClient

# NOTE(review): hard-coded credentials are acceptable for a container-local
# smoke test, but should come from the environment in anything shared.
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

print("=" * 70)
print("COMPLETE WORKFLOW TEST: AI Clustering + Neutral Summaries")
print("=" * 70)
print()

# --- 1. Clean up fixtures from previous runs -------------------------------
print("1. Cleaning up previous test data...")
db.articles.delete_many({"link": {"$regex": "^https://example.com/"}})
db.cluster_summaries.delete_many({"cluster_id": {"$regex": "^test_"}})
print(" ✓ Cleaned up")
print()

# Project modules live under /app inside the container, hence the path hack.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import ClusterSummarizer
from config import Config

# --- Initialize pipeline components ----------------------------------------
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    enabled=Config.OLLAMA_ENABLED,
    timeout=60,
)
clusterer = ArticleClusterer(ollama_client, similarity_threshold=0.50, time_window_hours=24)
summarizer = ClusterSummarizer(ollama_client, max_words=200)


def _utcnow():
    """Timezone-aware UTC timestamp (``datetime.utcnow()`` is deprecated)."""
    return datetime.now(timezone.utc)


# Test fixtures: 2 stories covered by 2 sources each -> we expect 2 clusters.
test_articles = [
    # Story 1: Munich housing (2 sources)
    {
        "title": "München: Stadtrat beschließt neue Wohnungsbau-Regelungen",
        "content": (
            "Der Münchner Stadtrat hat neue Regelungen für bezahlbaren "
            "Wohnungsbau beschlossen. 40% Sozialwohnungen werden Pflicht."
        ),
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/test-housing-az",
        "published_at": _utcnow(),
        "category": "local",
    },
    {
        "title": "Stadtrat München: Neue Verordnung für Wohnungsbau",
        "content": (
            "München führt neue Wohnungsbau-Verordnung ein. "
            "Mindestens 40% der Neubauten müssen Sozialwohnungen sein."
        ),
        "source": "sueddeutsche",
        "link": "https://example.com/test-housing-sz",
        "published_at": _utcnow(),
        "category": "local",
    },
    # Story 2: Bayern transfer (2 sources)
    {
        "title": "FC Bayern verpflichtet brasilianischen Stürmer für 50 Millionen",
        "content": (
            "Bayern München holt einen 23-jährigen Brasilianer. "
            "Sportdirektor Freund ist begeistert."
        ),
        "source": "abendzeitung-muenchen",
        "link": "https://example.com/test-bayern-az",
        "published_at": _utcnow(),
        "category": "sports",
    },
    {
        "title": "Bayern München: Neuzugang aus Brasilien für 50 Mio. Euro",
        "content": (
            "Der Rekordmeister verstärkt die Offensive mit einem "
            "brasilianischen Angreifer. Freund lobt den Transfer."
        ),
        "source": "sueddeutsche",
        "link": "https://example.com/test-bayern-sz",
        "published_at": _utcnow(),
        "category": "sports",
    },
]

# --- 2. Cluster each article against the ones processed so far -------------
print("2. Processing articles with AI clustering...")
print()
clustered_articles = []
for i, article in enumerate(test_articles, 1):
    print(f" Article {i}: {article['title'][:50]}...")
    print(f" Source: {article['source']}")
    # cluster_article compares against previously clustered articles and
    # either joins an existing cluster or starts a new one.
    clustered = clusterer.cluster_article(article, clustered_articles)
    clustered_articles.append(clustered)
    print(f" → Cluster: {clustered['cluster_id']}")
    print(f" → Primary: {clustered['is_primary']}")
    db.articles.insert_one(clustered)
    print(" ✓ Saved")
    print()

# --- 3. Group by cluster id and report -------------------------------------
print("=" * 70)
print("3. Clustering Results:")
print()
clusters = {}
for article in clustered_articles:
    clusters.setdefault(article['cluster_id'], []).append(article)

print(f" Total clusters: {len(clusters)}")
print()
for cid, articles in clusters.items():
    print(f" Cluster {cid}:")
    print(f" - Articles: {len(articles)}")
    for article in articles:
        print(f" • [{article['source']}] {article['title'][:45]}...")
    print()

# Expectation: the two housing articles merge, the two Bayern articles merge.
if len(clusters) == 2:
    print(" ✓ Expected 2 clusters (housing + bayern)")
else:
    print(f" ⚠ Expected 2 clusters, got {len(clusters)}")
print()

# --- 4. Generate a neutral summary per multi-source cluster ----------------
print("=" * 70)
print("4. Generating neutral summaries...")
print()
summary_count = 0
for cid, articles in clusters.items():
    # A single-article cluster has nothing to merge perspectives from.
    if len(articles) < 2:
        print(f" Skipping cluster {cid} (only 1 article)")
        continue
    print(f" Cluster {cid}: {len(articles)} articles")
    result = summarizer.generate_neutral_summary(articles)
    if result['success']:
        print(f" ✓ Generated summary ({result['duration']:.1f}s)")
        db.cluster_summaries.insert_one({
            "cluster_id": cid,
            "neutral_summary": result['neutral_summary'],
            "sources": result['sources'],
            "article_count": result['article_count'],
            "created_at": _utcnow(),
        })
        summary_count += 1
        preview = result['neutral_summary'][:100] + "..."
        print(f" Preview: {preview}")
    else:
        print(f" ✗ Failed: {result['error']}")
    print()

# --- 5. Final report --------------------------------------------------------
print("=" * 70)
print("5. Final Results:")
print()
test_article_count = db.articles.count_documents({"link": {"$regex": "^https://example.com/test-"}})
# (Removed an unused count_documents({}) over cluster_summaries: it was never
# printed and would have counted non-test summaries too; summary_count below
# already tracks what this run inserted.)
print(f" Articles saved: {test_article_count}")
print(f" Clusters created: {len(clusters)}")
print(f" Neutral summaries: {summary_count}")
print()
if len(clusters) == 2 and summary_count == 2:
    print(" ✅ SUCCESS! Complete workflow working perfectly!")
    print()
    print(" The system now:")
    print(" 1. ✓ Clusters articles from different sources")
    print(" 2. ✓ Generates neutral summaries combining perspectives")
    print(" 3. ✓ Stores everything in MongoDB")
else:
    print(" ⚠ Partial success - check results above")
print()
print("=" * 70)