update
This commit is contained in:
213
news_crawler/cluster_summarizer.py
Normal file
213
news_crawler/cluster_summarizer.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
Cluster Summarizer Module

Generates neutral summaries from multiple clustered articles.
"""
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import datetime
|
||||
from ollama_client import OllamaClient
|
||||
|
||||
|
||||
class ClusterSummarizer:
    """
    Generates neutral summaries by synthesizing multiple articles about the same story.

    All articles of a cluster are folded into one prompt, the prompt is sent
    to the supplied client, and the outcome is wrapped in a plain result dict.
    """

    # Characters of raw article content used when an article has no summary.
    CONTENT_PREVIEW_CHARS = 500

    def __init__(self, ollama_client: "OllamaClient", max_words=200):
        """
        Initialize cluster summarizer

        Args:
            ollama_client: OllamaClient instance for AI-based summarization.
                Only its ``generate(prompt, max_tokens=...)`` method is used.
            max_words: Approximate word count requested in the prompt
        """
        self.ollama_client = ollama_client
        self.max_words = max_words

    def generate_neutral_summary(self, articles: List[Dict]) -> Dict:
        """
        Generate a neutral summary from multiple articles about the same story

        Args:
            articles: List of article dicts. The keys read are 'title',
                'source', 'summary' and 'content'; all are optional.

        Returns:
            {
                'neutral_summary': str or None,
                'sources': list (unique, in order of first appearance),
                'article_count': int,
                'success': bool,
                'error': str or None,
                'duration': float
            }

            With no articles the call fails with 'No articles provided'.
            With exactly one article its own text is returned and the client
            is not called.
        """
        if not articles:
            return self._result(None, [], 0, False, 'No articles provided', 0)

        sources = self._unique_sources(articles)
        article_count = len(articles)

        # A single article has nothing to be reconciled with: reuse its text.
        if article_count == 1:
            return self._result(
                self._article_text(articles[0]), sources, 1, True, None, 0
            )

        combined_context = self._build_combined_context(articles)
        prompt = self._build_neutral_summary_prompt(combined_context, article_count)

        # NOTE(review): max_tokens is fixed and does not follow self.max_words;
        # confirm it is large enough if max_words is ever raised.
        result = self.ollama_client.generate(prompt, max_tokens=300)
        duration = result.get('duration', 0)

        if not result.get('success'):
            return self._result(
                None, sources, article_count, False, result.get('error'), duration
            )

        text = result.get('text')
        if not (isinstance(text, str) and text.strip()):
            # A "successful" call without usable text must not be stored as a summary.
            return self._result(
                None, sources, article_count, False,
                'Model returned an empty summary', duration
            )

        return self._result(text, sources, article_count, True, None, duration)

    @staticmethod
    def _result(neutral_summary, sources, article_count, success, error, duration) -> Dict:
        """Assemble the result dict so every return path has the same shape."""
        return {
            'neutral_summary': neutral_summary,
            'sources': sources,
            'article_count': article_count,
            'success': success,
            'error': error,
            'duration': duration
        }

    @staticmethod
    def _unique_sources(articles: List[Dict]) -> List[str]:
        """Return distinct source names in order of first appearance."""
        # dict.fromkeys de-duplicates while keeping a deterministic order,
        # unlike list(set(...)).
        return list(dict.fromkeys(a.get('source') or 'unknown' for a in articles))

    def _article_text(self, article: Dict) -> str:
        """Return the article's summary, else the leading slice of its content."""
        # `or` (rather than a .get default) also covers keys that are present
        # but None or empty.
        summary = article.get('summary')
        if summary:
            return summary
        return (article.get('content') or '')[:self.CONTENT_PREVIEW_CHARS]

    def _build_combined_context(self, articles: List[Dict]) -> str:
        """Build combined context from multiple articles"""
        context_parts = []

        for position, article in enumerate(articles, 1):
            source = article.get('source') or 'Unknown'
            title = article.get('title') or 'No title'
            content = self._article_text(article)

            context_parts.append(
                f"Source {position} ({source}):\nTitle: {title}\nContent: {content}"
            )

        return "\n\n".join(context_parts)

    def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str:
        """Build prompt for neutral summary generation"""
        # The continuation lines start at column 0 on purpose: they are part
        # of the string literal sent to the model.
        prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that:

1. Combines information from all sources
2. Remains neutral and objective
3. Highlights key facts that all sources agree on
4. Notes any significant differences in perspective (if any)
5. Is written in clear, professional English
6. Is approximately {self.max_words} words

Here are the articles:

{combined_context}

Write a neutral summary in English that synthesizes these perspectives:"""

        return prompt
|
||||
|
||||
|
||||
def _save_cluster_summary(db, cluster_id, result):
    """Upsert the neutral summary of one cluster into ``db.cluster_summaries``."""
    # One timestamp for both fields so a fresh document has
    # created_at == updated_at.
    now = datetime.utcnow()
    db.cluster_summaries.update_one(
        {"cluster_id": cluster_id},
        {
            "$set": {
                "cluster_id": cluster_id,
                "neutral_summary": result['neutral_summary'],
                "sources": result['sources'],
                "article_count": result['article_count'],
                "updated_at": now
            },
            # Applied only when the upsert inserts a new document, so the
            # original creation time survives later refreshes.
            "$setOnInsert": {"created_at": now}
        },
        upsert=True
    )


def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None):
    """
    Create or update neutral summaries for article clusters

    Args:
        db: MongoDB database instance; ``db.articles`` is read and
            ``db.cluster_summaries`` is written.
        ollama_client: OllamaClient instance
        cluster_ids: Optional list of specific cluster IDs to process. If None, processes all clusters.

    Returns:
        {
            'processed': int,   # clusters attempted (succeeded + failed)
            'succeeded': int,
            'failed': int,
            'errors': list      # one "Cluster <id>: <reason>" string per failure
        }

        Clusters with fewer than two articles are skipped and not counted.
    """
    summarizer = ClusterSummarizer(ollama_client, max_words=200)

    # Find clusters to process
    # NOTE(review): an empty list is falsy and therefore also selects all
    # clusters; kept as is because callers may rely on it.
    if cluster_ids:
        clusters_to_process = cluster_ids
    else:
        # Get all cluster IDs with multiple articles. Null cluster_ids are
        # excluded so unclustered documents are not grouped into one cluster.
        pipeline = [
            {"$match": {"cluster_id": {"$exists": True, "$ne": None}}},
            {"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}},
            {"$match": {"count": {"$gt": 1}}},
            {"$project": {"_id": 1}}
        ]
        clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)]

    processed = 0
    succeeded = 0
    failed = 0
    errors = []

    for cluster_id in clusters_to_process:
        # Tracks whether this cluster is already in `processed`, so an
        # exception at any point keeps processed == succeeded + failed.
        counted = False
        try:
            # Get all articles in this cluster
            articles = list(db.articles.find({"cluster_id": cluster_id}))

            if len(articles) < 2:
                continue

            print(f"Processing cluster {cluster_id}: {len(articles)} articles")

            processed += 1
            counted = True

            # Generate neutral summary
            result = summarizer.generate_neutral_summary(articles)

            if result['success']:
                _save_cluster_summary(db, cluster_id, result)
                succeeded += 1
                # `or ''` keeps the progress line from raising after the
                # summary has already been stored.
                print(f" ✓ Generated neutral summary ({len(result['neutral_summary'] or '')} chars)")
            else:
                failed += 1
                errors.append(f"Cluster {cluster_id}: {result['error']}")
                print(f" ✗ Failed: {result['error']}")

        except Exception as e:
            # Best-effort batch: one failing cluster must not abort the rest.
            if not counted:
                processed += 1
            failed += 1
            errors.append(f"Cluster {cluster_id}: {str(e)}")
            print(f" ✗ Error: {e}")

    return {
        'processed': processed,
        'succeeded': succeeded,
        'failed': failed,
        'errors': errors
    }
|
||||
Reference in New Issue
Block a user