214 lines
7.4 KiB
Python
214 lines
7.4 KiB
Python
"""
|
|
Cluster Summarizer Module
|
|
Generates neutral summaries from multiple clustered articles
|
|
"""
|
|
from typing import List, Dict, Optional
|
|
from datetime import datetime
|
|
from ollama_client import OllamaClient
|
|
|
|
|
|
class ClusterSummarizer:
    """
    Generates neutral summaries by synthesizing multiple articles about the same story.

    The actual text generation is delegated to the client passed to the
    constructor; this class only selects the per-article text, assembles the
    prompt, and normalizes the result into a plain dict.
    """

    # Number of leading characters of raw 'content' used when an article
    # carries no usable 'summary'.
    _EXCERPT_CHARS = 500

    # Token budget handed to the client for one neutral summary.
    _MAX_TOKENS = 300

    def __init__(self, ollama_client: "OllamaClient", max_words=200):
        """
        Initialize cluster summarizer

        Args:
            ollama_client: Client used for AI-based summarization. It must
                expose ``generate(prompt, max_tokens=...)`` returning a dict
                with the keys 'success', 'text', 'error' and 'duration'.
            max_words: Approximate length (in words) requested in the prompt
        """
        self.ollama_client = ollama_client
        self.max_words = max_words

    @classmethod
    def _article_text(cls, article: Dict) -> str:
        """Return the article's summary, or an excerpt of its content.

        A missing, ``None`` or empty 'summary' falls back to the first
        ``_EXCERPT_CHARS`` characters of 'content'; a missing or ``None``
        'content' yields an empty string instead of raising.
        """
        summary = article.get('summary')
        if summary:
            return summary
        return (article.get('content') or '')[:cls._EXCERPT_CHARS]

    @staticmethod
    def _unique_sources(articles: List[Dict]) -> List[str]:
        """Return the distinct article sources in first-seen order.

        ``dict.fromkeys`` is used instead of ``set`` so the resulting list is
        deterministic across runs.
        """
        return list(dict.fromkeys(a.get('source', 'unknown') for a in articles))

    def generate_neutral_summary(self, articles: List[Dict]) -> Dict:
        """
        Generate a neutral summary from multiple articles about the same story

        Args:
            articles: List of article dicts with 'title', 'content', 'source'
                and optionally 'summary'

        Returns:
            {
                'neutral_summary': str or None (None on failure),
                'sources': list of distinct sources, in first-seen order,
                'article_count': int,
                'success': bool,
                'error': str or None,
                'duration': duration reported by the client (0 when the
                    client was not called)
            }
        """
        if not articles:
            return {
                'neutral_summary': None,
                'sources': [],
                'article_count': 0,
                'success': False,
                'error': 'No articles provided',
                'duration': 0,
            }

        # A single article needs no synthesis: reuse its own text and skip
        # the client call entirely.
        if len(articles) == 1:
            only = articles[0]
            return {
                'neutral_summary': self._article_text(only),
                'sources': [only.get('source', 'unknown')],
                'article_count': 1,
                'success': True,
                'error': None,
                'duration': 0,
            }

        combined_context = self._build_combined_context(articles)
        prompt = self._build_neutral_summary_prompt(combined_context, len(articles))
        result = self.ollama_client.generate(prompt, max_tokens=self._MAX_TOKENS)

        succeeded = bool(result['success'])
        return {
            'neutral_summary': result['text'] if succeeded else None,
            'sources': self._unique_sources(articles),
            'article_count': len(articles),
            'success': succeeded,
            'error': None if succeeded else result['error'],
            'duration': result['duration'],
        }

    def _build_combined_context(self, articles: List[Dict]) -> str:
        """Build combined context from multiple articles.

        Each article becomes a numbered "Source i (name)" section holding its
        title and its summary (or content excerpt); sections are separated by
        a blank line.
        """
        context_parts = []

        for i, article in enumerate(articles, 1):
            source = article.get('source', 'Unknown')
            title = article.get('title', 'No title')
            content = self._article_text(article)

            context_parts.append(f"Source {i} ({source}):\nTitle: {title}\nContent: {content}")

        return "\n\n".join(context_parts)

    def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str:
        """Build prompt for neutral summary generation.

        The prompt states the number of articles, the requested word count
        (``self.max_words``) and embeds ``combined_context`` verbatim.
        """
        prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that:

1. Combines information from all sources
2. Remains neutral and objective
3. Highlights key facts that all sources agree on
4. Notes any significant differences in perspective (if any)
5. Is written in clear, professional English
6. Is approximately {self.max_words} words

Here are the articles:

{combined_context}

Write a neutral summary in English that synthesizes these perspectives:"""

        return prompt
|
|
|
|
|
|
def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None):
    """
    Create or update neutral summaries for article clusters

    For every selected cluster the articles are loaded from ``db.articles``,
    summarized, and the result is upserted into ``db.cluster_summaries``
    keyed by ``cluster_id``. Errors for one cluster are recorded and do not
    stop the remaining clusters from being processed.

    Args:
        db: MongoDB database instance
        ollama_client: OllamaClient instance
        cluster_ids: Optional list of specific cluster IDs to process.
            If None, processes all clusters that contain more than one
            article. An empty list processes nothing.

    Returns:
        {
            'processed': int,   # clusters for which a summary was attempted
            'succeeded': int,
            'failed': int,
            'errors': list      # one "Cluster <id>: <message>" string per failure
        }
    """
    summarizer = ClusterSummarizer(ollama_client, max_words=200)

    # Find clusters to process. Only None means "all": an explicit empty list
    # must not silently expand into a full (and expensive) re-run.
    if cluster_ids is not None:
        clusters_to_process = cluster_ids
    else:
        # Get all cluster IDs with multiple articles
        pipeline = [
            {"$match": {"cluster_id": {"$exists": True}}},
            {"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}},
            {"$match": {"count": {"$gt": 1}}},
            {"$project": {"_id": 1}},
        ]
        clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)]

    processed = 0
    succeeded = 0
    failed = 0
    errors = []

    for cluster_id in clusters_to_process:
        # Best-effort batch: a failure in one cluster is recorded and the
        # loop moves on to the next one.
        try:
            # Get all articles in this cluster
            articles = list(db.articles.find({"cluster_id": cluster_id}))

            # Clusters with fewer than two articles are skipped and not
            # counted as processed.
            if len(articles) < 2:
                continue

            print(f"Processing cluster {cluster_id}: {len(articles)} articles")

            # Generate neutral summary
            result = summarizer.generate_neutral_summary(articles)

            processed += 1

            if result['success']:
                # One timestamp for both fields so that a freshly inserted
                # document has created_at == updated_at.
                now = datetime.utcnow()

                # Save cluster summary. created_at goes into $setOnInsert so
                # that re-summarizing an existing cluster keeps its original
                # creation time; only updated_at moves forward.
                db.cluster_summaries.update_one(
                    {"cluster_id": cluster_id},
                    {
                        "$set": {
                            "cluster_id": cluster_id,
                            "neutral_summary": result['neutral_summary'],
                            "sources": result['sources'],
                            "article_count": result['article_count'],
                            "updated_at": now,
                        },
                        "$setOnInsert": {
                            "created_at": now,
                        },
                    },
                    upsert=True,
                )
                succeeded += 1
                print(f"  ✓ Generated neutral summary ({len(result['neutral_summary'])} chars)")
            else:
                failed += 1
                error_msg = f"Cluster {cluster_id}: {result['error']}"
                errors.append(error_msg)
                print(f"  ✗ Failed: {result['error']}")

        except Exception as e:
            failed += 1
            error_msg = f"Cluster {cluster_id}: {str(e)}"
            errors.append(error_msg)
            print(f"  ✗ Error: {e}")

    return {
        'processed': processed,
        'succeeded': succeeded,
        'failed': failed,
        'errors': errors,
    }
|