# Munich-news/news_crawler/cluster_summarizer.py
"""
Cluster Summarizer Module
Generates neutral summaries from multiple clustered articles
"""
from typing import List, Dict, Optional
from datetime import datetime, timezone

from ollama_client import OllamaClient


class ClusterSummarizer:
"""
Generates neutral summaries by synthesizing multiple articles about the same story
"""
    def __init__(self, ollama_client: OllamaClient, max_words: int = 200):
        """
        Initialize cluster summarizer
        Args:
            ollama_client: OllamaClient instance for AI-based summarization
            max_words: Target word count for the neutral summary (a soft limit,
                enforced only through the prompt)
        """
self.ollama_client = ollama_client
self.max_words = max_words

    def generate_neutral_summary(self, articles: List[Dict]) -> Dict:
"""
Generate a neutral summary from multiple articles about the same story
Args:
articles: List of article dicts with 'title', 'content', 'source'
Returns:
{
'neutral_summary': str,
'sources': list,
'article_count': int,
'success': bool,
'error': str or None,
'duration': float
}
"""
        if not articles:
return {
'neutral_summary': None,
'sources': [],
'article_count': 0,
'success': False,
'error': 'No articles provided',
'duration': 0
}
# If only one article, return its summary
if len(articles) == 1:
return {
                'neutral_summary': articles[0].get('summary') or articles[0].get('content', '')[:500],
'sources': [articles[0].get('source', 'unknown')],
'article_count': 1,
'success': True,
'error': None,
'duration': 0
}
# Build combined context from all articles
combined_context = self._build_combined_context(articles)
# Generate neutral summary using AI
prompt = self._build_neutral_summary_prompt(combined_context, len(articles))
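        # Note: max_tokens=300 is a tight budget for a ~200-word summary
        # (English text typically runs over 1 token per word), so raise it
        # if summaries come back truncated.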
result = self.ollama_client.generate(prompt, max_tokens=300)
        sources = sorted(set(a.get('source', 'unknown') for a in articles))
        if result['success']:
            return {
                'neutral_summary': result['text'],
                'sources': sources,
                'article_count': len(articles),
                'success': True,
                'error': None,
                'duration': result['duration']
            }
        return {
            'neutral_summary': None,
            'sources': sources,
            'article_count': len(articles),
            'success': False,
            'error': result['error'],
            'duration': result['duration']
        }

def _build_combined_context(self, articles: List[Dict]) -> str:
"""Build combined context from multiple articles"""
context_parts = []
for i, article in enumerate(articles, 1):
source = article.get('source', 'Unknown')
title = article.get('title', 'No title')
# Use summary if available, otherwise use first 500 chars of content
content = article.get('summary') or article.get('content', '')[:500]
context_parts.append(f"Source {i} ({source}):\nTitle: {title}\nContent: {content}")
return "\n\n".join(context_parts)

    def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str:
"""Build prompt for neutral summary generation"""
prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that:
1. Combines information from all sources
2. Remains neutral and objective
3. Highlights key facts that all sources agree on
4. Notes any significant differences in perspective (if any)
5. Is written in clear, professional English
6. Is approximately {self.max_words} words
Here are the articles:
{combined_context}
Write a neutral summary in English that synthesizes these perspectives:"""
return prompt
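

# Example (sketch): using ClusterSummarizer directly on one cluster's articles,
# assuming `ollama` is a configured OllamaClient and `articles` is a list of
# dicts with 'title', 'content', and 'source' keys (both names are placeholders):
#
#     summarizer = ClusterSummarizer(ollama, max_words=200)
#     result = summarizer.generate_neutral_summary(articles)
#     if result['success']:
#         print(result['neutral_summary'])
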
def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None):
"""
Create or update neutral summaries for article clusters
Args:
db: MongoDB database instance
ollama_client: OllamaClient instance
cluster_ids: Optional list of specific cluster IDs to process. If None, processes all clusters.
Returns:
{
'processed': int,
'succeeded': int,
'failed': int,
'errors': list
}
"""
summarizer = ClusterSummarizer(ollama_client, max_words=200)
# Find clusters to process
if cluster_ids:
clusters_to_process = cluster_ids
else:
# Get all cluster IDs with multiple articles
pipeline = [
{"$match": {"cluster_id": {"$exists": True}}},
{"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}},
{"$match": {"count": {"$gt": 1}}},
{"$project": {"_id": 1}}
]
clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)]
processed = 0
succeeded = 0
failed = 0
errors = []
for cluster_id in clusters_to_process:
try:
# Get all articles in this cluster
articles = list(db.articles.find({"cluster_id": cluster_id}))
if len(articles) < 2:
continue
print(f"Processing cluster {cluster_id}: {len(articles)} articles")
# Generate neutral summary
result = summarizer.generate_neutral_summary(articles)
processed += 1
if result['success']:
# Save cluster summary
                db.cluster_summaries.update_one(
                    {"cluster_id": cluster_id},
                    {
                        "$set": {
                            "cluster_id": cluster_id,
                            "neutral_summary": result['neutral_summary'],
                            "sources": result['sources'],
                            "article_count": result['article_count'],
                            "updated_at": datetime.now(timezone.utc)
                        },
                        # Only stamp created_at when the document is first inserted
                        "$setOnInsert": {"created_at": datetime.now(timezone.utc)}
                    },
                    upsert=True
                )
succeeded += 1
print(f" ✓ Generated neutral summary ({len(result['neutral_summary'])} chars)")
else:
failed += 1
error_msg = f"Cluster {cluster_id}: {result['error']}"
errors.append(error_msg)
print(f" ✗ Failed: {result['error']}")
except Exception as e:
failed += 1
error_msg = f"Cluster {cluster_id}: {str(e)}"
errors.append(error_msg)
print(f" ✗ Error: {e}")
return {
'processed': processed,
'succeeded': succeeded,
'failed': failed,
'errors': errors
}
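

# A minimal smoke test (sketch): assumes MongoDB is reachable at localhost:27017,
# that the database is named "news" (a placeholder), and that OllamaClient can be
# constructed without arguments; adjust all three to the actual deployment.
if __name__ == "__main__":
    from pymongo import MongoClient

    mongo = MongoClient("mongodb://localhost:27017")
    stats = create_cluster_summaries(mongo["news"], OllamaClient())
    print(f"Processed {stats['processed']} clusters: "
          f"{stats['succeeded']} succeeded, {stats['failed']} failed")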