131 lines
3.6 KiB
Python
131 lines
3.6 KiB
Python
#!/usr/bin/env python3
"""
Test neutral summary generation from clustered articles
"""
import sys
import textwrap
from datetime import datetime, timezone

from pymongo import MongoClient

# MongoDB connection settings used by this test script
MONGO_URI = "mongodb://admin:changeme@mongodb:27017/"
DB_NAME = "munich_news"

# Open the client and select the database every later section reads/writes
client = MongoClient(MONGO_URI)
db = client[DB_NAME]

# Script banner
print("Testing Neutral Summary Generation")
print("=" * 70)
print()

# Load the test articles: every document in `articles` whose link starts
# with https://example.com/, in ascending _id order.
test_query = {"link": {"$regex": "^https://example.com/"}}
article_cursor = db.articles.find(test_query).sort("_id", 1)
test_articles = list(article_cursor)

# Nothing to summarize without test data, so stop with a non-zero status
if not test_articles:
    print("⚠ No test articles found. Run test-clustering-real.py first.")
    sys.exit(1)

print(f"Found {len(test_articles)} test articles")
print()

# Find clusters with multiple articles.
# Articles are grouped by their `cluster_id` field. A test article that has
# no cluster_id (presumably not yet processed by the clustering step) is
# skipped and counted instead of aborting the script with a KeyError.
clusters = {}
unclustered_count = 0
for article in test_articles:
    cid = article.get('cluster_id')
    if cid is None:
        unclustered_count += 1
        continue
    clusters.setdefault(cid, []).append(article)

if unclustered_count:
    print(f"⚠ Skipped {unclustered_count} test article(s) without a cluster_id")

# Only clusters with at least two articles are kept for summarization
multi_article_clusters = {k: v for k, v in clusters.items() if len(v) > 1}

if not multi_article_clusters:
    print("⚠ No clusters with multiple articles found")
    sys.exit(1)

print(f"Found {len(multi_article_clusters)} cluster(s) with multiple articles")
print()

# Import cluster summarizer.
# /app is put at the front of sys.path first, so these imports must stay
# below the sys.path change rather than at the top of the file.
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from cluster_summarizer import ClusterSummarizer
from config import Config

# Ollama connection settings come from Config; the timeout is fixed at 60
ollama_settings = {
    "base_url": Config.OLLAMA_BASE_URL,
    "model": Config.OLLAMA_MODEL,
    "enabled": Config.OLLAMA_ENABLED,
    "timeout": 60,
}
ollama_client = OllamaClient(**ollama_settings)

# Summarizer under test, constructed with max_words=200
summarizer = ClusterSummarizer(ollama_client, max_words=200)

print("Generating neutral summaries...")
print("=" * 70)
print()

# For each multi-article cluster: list its articles, ask the summarizer for a
# neutral summary, print the summary wrapped to the separator width, and
# upsert it into the `cluster_summaries` collection keyed by cluster_id.
for cluster_id, articles in multi_article_clusters.items():
    print(f"Cluster: {cluster_id}")
    print(f"Articles: {len(articles)}")
    print()

    # Show individual articles (titles cut to their first 60 characters)
    for i, article in enumerate(articles, 1):
        print(f" {i}. [{article['source']}] {article['title'][:60]}...")
    print()

    # Generate neutral summary; the result is read as a dict with a
    # 'success' flag plus either the summary fields or an 'error' entry.
    print(" Generating neutral summary...")
    result = summarizer.generate_neutral_summary(articles)

    if result['success']:
        print(f" ✓ Success ({result['duration']:.1f}s)")
        print()
        print(" Neutral Summary:")
        print(" " + "-" * 66)

        # Wrap the text under the 66-dash rule: one space of indent and at
        # most 67 columns per line. Whitespace is normalized first so runs
        # of spaces/newlines collapse to single spaces. Over-long words are
        # kept whole on their own line instead of producing an empty line.
        summary = result['neutral_summary']
        wrapped = textwrap.fill(
            " ".join(summary.split()),
            width=67,
            initial_indent=" ",
            subsequent_indent=" ",
            break_long_words=False,
            break_on_hyphens=False,
        )
        print(wrapped)
        print(" " + "-" * 66)
        print()

        # Save to database.
        # One timezone-aware timestamp is used for both fields. created_at
        # goes in $setOnInsert so re-running the script refreshes the summary
        # and updated_at without overwriting the original creation time.
        now = datetime.now(timezone.utc)
        db.cluster_summaries.update_one(
            {"cluster_id": cluster_id},
            {
                "$set": {
                    "cluster_id": cluster_id,
                    "neutral_summary": result['neutral_summary'],
                    "sources": result['sources'],
                    "article_count": result['article_count'],
                    "updated_at": now,
                },
                "$setOnInsert": {
                    "created_at": now,
                },
            },
            upsert=True,
        )
        print(" ✓ Saved to cluster_summaries collection")
    else:
        print(f" ✗ Failed: {result['error']}")

    # Separator between clusters
    print()
    print("=" * 70)
    print()

print("Testing complete!")
print()

# Show summary statistics: number of documents now in cluster_summaries
print(
    "Total cluster summaries in database: "
    f"{db.cluster_summaries.count_documents({})}"
)