""" Cluster Summarizer Module Generates neutral summaries from multiple clustered articles """ from typing import List, Dict, Optional from datetime import datetime from ollama_client import OllamaClient class ClusterSummarizer: """ Generates neutral summaries by synthesizing multiple articles about the same story """ def __init__(self, ollama_client: OllamaClient, max_words=200): """ Initialize cluster summarizer Args: ollama_client: OllamaClient instance for AI-based summarization max_words: Maximum words in neutral summary """ self.ollama_client = ollama_client self.max_words = max_words def generate_neutral_summary(self, articles: List[Dict]) -> Dict: """ Generate a neutral summary from multiple articles about the same story Args: articles: List of article dicts with 'title', 'content', 'source' Returns: { 'neutral_summary': str, 'sources': list, 'article_count': int, 'success': bool, 'error': str or None, 'duration': float } """ if not articles or len(articles) == 0: return { 'neutral_summary': None, 'sources': [], 'article_count': 0, 'success': False, 'error': 'No articles provided', 'duration': 0 } # If only one article, return its summary if len(articles) == 1: return { 'neutral_summary': articles[0].get('summary', articles[0].get('content', '')[:500]), 'sources': [articles[0].get('source', 'unknown')], 'article_count': 1, 'success': True, 'error': None, 'duration': 0 } # Build combined context from all articles combined_context = self._build_combined_context(articles) # Generate neutral summary using AI prompt = self._build_neutral_summary_prompt(combined_context, len(articles)) result = self.ollama_client.generate(prompt, max_tokens=300) if result['success']: return { 'neutral_summary': result['text'], 'sources': list(set(a.get('source', 'unknown') for a in articles)), 'article_count': len(articles), 'success': True, 'error': None, 'duration': result['duration'] } else: return { 'neutral_summary': None, 'sources': list(set(a.get('source', 'unknown') for a in articles)), 'article_count': len(articles), 'success': False, 'error': result['error'], 'duration': result['duration'] } def _build_combined_context(self, articles: List[Dict]) -> str: """Build combined context from multiple articles""" context_parts = [] for i, article in enumerate(articles, 1): source = article.get('source', 'Unknown') title = article.get('title', 'No title') # Use summary if available, otherwise use first 500 chars of content content = article.get('summary') or article.get('content', '')[:500] context_parts.append(f"Source {i} ({source}):\nTitle: {title}\nContent: {content}") return "\n\n".join(context_parts) def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str: """Build prompt for neutral summary generation""" prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that: 1. Combines information from all sources 2. Remains neutral and objective 3. Highlights key facts that all sources agree on 4. Notes any significant differences in perspective (if any) 5. Is written in clear, professional English 6. Is approximately {self.max_words} words Here are the articles: {combined_context} Write a neutral summary in English that synthesizes these perspectives:""" return prompt def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None): """ Create or update neutral summaries for article clusters Args: db: MongoDB database instance ollama_client: OllamaClient instance cluster_ids: Optional list of specific cluster IDs to process. If None, processes all clusters. Returns: { 'processed': int, 'succeeded': int, 'failed': int, 'errors': list } """ summarizer = ClusterSummarizer(ollama_client, max_words=200) # Find clusters to process if cluster_ids: clusters_to_process = cluster_ids else: # Get all cluster IDs with multiple articles pipeline = [ {"$match": {"cluster_id": {"$exists": True}}}, {"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}}, {"$match": {"count": {"$gt": 1}}}, {"$project": {"_id": 1}} ] clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)] processed = 0 succeeded = 0 failed = 0 errors = [] for cluster_id in clusters_to_process: try: # Get all articles in this cluster articles = list(db.articles.find({"cluster_id": cluster_id})) if len(articles) < 2: continue print(f"Processing cluster {cluster_id}: {len(articles)} articles") # Generate neutral summary result = summarizer.generate_neutral_summary(articles) processed += 1 if result['success']: # Save cluster summary db.cluster_summaries.update_one( {"cluster_id": cluster_id}, { "$set": { "cluster_id": cluster_id, "neutral_summary": result['neutral_summary'], "sources": result['sources'], "article_count": result['article_count'], "created_at": datetime.utcnow(), "updated_at": datetime.utcnow() } }, upsert=True ) succeeded += 1 print(f" ✓ Generated neutral summary ({len(result['neutral_summary'])} chars)") else: failed += 1 error_msg = f"Cluster {cluster_id}: {result['error']}" errors.append(error_msg) print(f" ✗ Failed: {result['error']}") except Exception as e: failed += 1 error_msg = f"Cluster {cluster_id}: {str(e)}" errors.append(error_msg) print(f" ✗ Error: {e}") return { 'processed': processed, 'succeeded': succeeded, 'failed': failed, 'errors': errors }