Files
Munich-news/tests/crawler/test_neutral_summaries.py
2025-11-12 11:34:33 +01:00

131 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Test neutral summary generation from clustered articles
"""
import sys
import textwrap
from datetime import datetime, timezone

from pymongo import MongoClient
# Open the MongoDB connection and select the database used by this script.
# NOTE(review): the connection URI, including credentials, is hard-coded here;
# confirm this script is only ever pointed at a local/dev stack.
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

print("Testing Neutral Summary Generation")
print("=" * 70)
print()

# Fetch the test fixtures: every article whose link starts with
# https://example.com/, returned in ascending _id order.
example_link_filter = {"link": {"$regex": "^https://example.com/"}}
test_articles = list(db.articles.find(example_link_filter).sort("_id", 1))

# Nothing to summarize without fixtures, so stop with a non-zero exit code.
if not test_articles:
    print("⚠ No test articles found. Run test-clustering-real.py first.")
    sys.exit(1)

print(f"Found {len(test_articles)} test articles")
print()
# Group the test articles by their cluster_id and keep only the clusters that
# contain more than one article, since a neutral summary is generated from
# several articles of the same cluster.
clusters = {}
unclustered_count = 0
for article in test_articles:
    cid = article.get('cluster_id')
    if cid is None:
        # An article without a cluster_id cannot be grouped; skip it instead
        # of aborting the whole run with a KeyError.
        unclustered_count += 1
        continue
    clusters.setdefault(cid, []).append(article)

if unclustered_count:
    print(f"⚠ Skipped {unclustered_count} article(s) without a cluster_id")

multi_article_clusters = {k: v for k, v in clusters.items() if len(v) > 1}

# Exit with a non-zero code when there is nothing to summarize.
if not multi_article_clusters:
    print("⚠ No clusters with multiple articles found")
    sys.exit(1)

print(f"Found {len(multi_article_clusters)} cluster(s) with multiple articles")
print()
# Put '/app' at the front of the module search path so the project modules
# below can be imported.
# NOTE(review): '/app' is presumably the application directory inside the
# container — confirm against the deployment setup.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from cluster_summarizer import ClusterSummarizer
from config import Config

# Collect the Ollama client settings from the project configuration.
# The timeout is fixed at 60 (unit not visible here, presumably seconds).
ollama_settings = {
    "base_url": Config.OLLAMA_BASE_URL,
    "model": Config.OLLAMA_MODEL,
    "enabled": Config.OLLAMA_ENABLED,
    "timeout": 60,
}
ollama_client = OllamaClient(**ollama_settings)

# The summarizer wraps the client; summaries are requested with max_words=200.
summarizer = ClusterSummarizer(ollama_client, max_words=200)

print("Generating neutral summaries...")
print("=" * 70)
print()
# For every multi-article cluster: list its articles, ask the summarizer for a
# neutral summary, print the wrapped summary and upsert it into the
# cluster_summaries collection. A failed generation is reported and skipped.
for cluster_id, articles in multi_article_clusters.items():
    print(f"Cluster: {cluster_id}")
    print(f"Articles: {len(articles)}")
    print()

    # Show individual articles (source and the first 60 characters of the title)
    for i, article in enumerate(articles, 1):
        print(f" {i}. [{article['source']}] {article['title'][:60]}...")
    print()

    # Generate neutral summary; the result dict is read for the keys
    # 'success', 'duration', 'neutral_summary', 'sources', 'article_count'
    # and, on failure, 'error'.
    print(" Generating neutral summary...")
    result = summarizer.generate_neutral_summary(articles)

    if result['success']:
        print(f" ✓ Success ({result['duration']:.1f}s)")
        print()
        print(" Neutral Summary:")
        print(" " + "-" * 66)

        # Wrap the summary to 66 characters of text behind a one-space indent
        # (67 columns in total). Whitespace is collapsed first so runs of
        # spaces or newlines in the summary do not affect the layout, and
        # over-long words are kept intact on their own line.
        summary = result['neutral_summary']
        print(textwrap.fill(
            " ".join(summary.split()),
            width=67,
            initial_indent=" ",
            subsequent_indent=" ",
            break_long_words=False,
            break_on_hyphens=False,
        ))
        print(" " + "-" * 66)
        print()

        # Save to database. One timestamp is shared by both fields so they
        # agree on insert; created_at lives in $setOnInsert so re-running the
        # script refreshes updated_at without rewriting the creation time.
        now = datetime.now(timezone.utc)
        db.cluster_summaries.update_one(
            {"cluster_id": cluster_id},
            {
                "$set": {
                    "cluster_id": cluster_id,
                    "neutral_summary": result['neutral_summary'],
                    "sources": result['sources'],
                    "article_count": result['article_count'],
                    "updated_at": now,
                },
                "$setOnInsert": {"created_at": now},
            },
            upsert=True,
        )
        print(" ✓ Saved to cluster_summaries collection")
    else:
        print(f" ✗ Failed: {result['error']}")

    # Visual separator between clusters
    print()
    print("=" * 70)
    print()
# Closing banner followed by one blank line.
print("Testing complete!", end="\n\n")

# Report how many documents the cluster_summaries collection now holds
# (an empty filter counts every document).
summaries = db.cluster_summaries
total_cluster_summaries = summaries.count_documents({})
print(f"Total cluster summaries in database: {total_cluster_summaries}")