update
This commit is contained in:
130
tests/crawler/test_neutral_summaries.py
Normal file
130
tests/crawler/test_neutral_summaries.py
Normal file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test neutral summary generation from clustered articles
|
||||
"""
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
# Open the MongoDB connection and select the news database
client = MongoClient("mongodb://admin:changeme@mongodb:27017/")
db = client["munich_news"]

print("Testing Neutral Summary Generation")
print("=" * 70)
print()

# Test fixtures are the articles whose link starts with https://example.com/;
# they are fetched in ascending _id order.
example_link_filter = {"link": {"$regex": "^https://example.com/"}}
test_articles = list(db.articles.find(example_link_filter).sort("_id", 1))

# Nothing to summarise without fixtures: report and exit with a failure code
if not test_articles:
    print("⚠ No test articles found. Run test-clustering-real.py first.")
    sys.exit(1)

print(f"Found {len(test_articles)} test articles")
print()
|
||||
|
||||
# Group the test articles by cluster and keep only clusters that contain
# more than one article (a neutral summary needs several sources).
clusters = {}
for article in test_articles:
    # Articles that were never clustered have no usable cluster_id. Skip them
    # instead of raising KeyError or lumping them into one bogus "None" cluster.
    cid = article.get('cluster_id')
    if cid is None:
        continue
    clusters.setdefault(cid, []).append(article)

multi_article_clusters = {k: v for k, v in clusters.items() if len(v) > 1}

# Without at least one multi-article cluster there is nothing to test
if not multi_article_clusters:
    print("⚠ No clusters with multiple articles found")
    sys.exit(1)

print(f"Found {len(multi_article_clusters)} cluster(s) with multiple articles")
print()
|
||||
|
||||
# Put /app at the front of the import path, then load the project modules
# (imported here, after the path change, rather than at the top of the file).
sys.path.insert(0, '/app')
from ollama_client import OllamaClient
from cluster_summarizer import ClusterSummarizer
from config import Config

# Ollama client settings: connection details come from Config, the timeout
# is fixed at 60.
ollama_settings = {
    "base_url": Config.OLLAMA_BASE_URL,
    "model": Config.OLLAMA_MODEL,
    "enabled": Config.OLLAMA_ENABLED,
    "timeout": 60,
}
ollama_client = OllamaClient(**ollama_settings)

# Summarizer under test, constructed with max_words=200
summarizer = ClusterSummarizer(ollama_client, max_words=200)
|
||||
|
||||
# Local imports: only this section needs them, and the script already
# imports mid-file.
import textwrap
from datetime import timezone

print("Generating neutral summaries...")
print("=" * 70)
print()

# Maximum printed line length for the wrapped summary, including its
# one-space indent (i.e. 66 characters of text).
SUMMARY_WRAP_WIDTH = 67

# For every multi-article cluster: list its articles, ask the summarizer for
# a neutral summary, print it wrapped, and upsert it into cluster_summaries.
for cluster_id, articles in multi_article_clusters.items():
    print(f"Cluster: {cluster_id}")
    print(f"Articles: {len(articles)}")
    print()

    # Show individual articles (title cut to its first 60 characters)
    for i, article in enumerate(articles, 1):
        print(f" {i}. [{article['source']}] {article['title'][:60]}...")
    print()

    # Generate neutral summary
    print(" Generating neutral summary...")
    result = summarizer.generate_neutral_summary(articles)

    if result['success']:
        print(f" ✓ Success ({result['duration']:.1f}s)")
        print()
        print(" Neutral Summary:")
        print(" " + "-" * 66)

        # Wrap the summary on word boundaries. Whitespace is normalised to
        # single spaces first, and words are never split, so an over-long
        # word simply gets a line of its own.
        summary = " ".join(result['neutral_summary'].split())
        print(textwrap.fill(
            summary,
            width=SUMMARY_WRAP_WIDTH,
            initial_indent=" ",
            subsequent_indent=" ",
            break_long_words=False,
            break_on_hyphens=False,
        ))
        print(" " + "-" * 66)
        print()

        # Save to database. One timezone-aware UTC timestamp is shared by
        # both fields; created_at lives in $setOnInsert so that re-running
        # the script refreshes updated_at without rewriting the creation time.
        now = datetime.now(timezone.utc)
        db.cluster_summaries.update_one(
            {"cluster_id": cluster_id},
            {
                "$set": {
                    "cluster_id": cluster_id,
                    "neutral_summary": result['neutral_summary'],
                    "sources": result['sources'],
                    "article_count": result['article_count'],
                    "updated_at": now,
                },
                "$setOnInsert": {"created_at": now},
            },
            upsert=True,
        )
        print(" ✓ Saved to cluster_summaries collection")
    else:
        print(f" ✗ Failed: {result['error']}")

    # NOTE(review): the separator is assumed to be printed once per cluster;
    # confirm against the original file's indentation.
    print()
    print("=" * 70)
    print()
|
||||
|
||||
print("Testing complete!")
print()

# Final report: number of documents currently in cluster_summaries
# (an empty filter counts every document in the collection).
summary_count = db.cluster_summaries.count_documents({})
print(f"Total cluster summaries in database: {summary_count}")
|
||||
Reference in New Issue
Block a user