update

2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions
--- a/news_crawler/cluster_summarizer.py
+++ b/news_crawler/cluster_summarizer.py
@@ -0,0 +1,213 @@
+"""
+Cluster Summarizer Module
+Generates neutral summaries from multiple clustered articles
+"""
+from typing import List, Dict, Optional
+from datetime import datetime
+from ollama_client import OllamaClient
+
+
+class ClusterSummarizer:
+    """
+    Generates neutral summaries by synthesizing multiple articles about the same story
+    """
+    
+    def __init__(self, ollama_client: OllamaClient, max_words=200):
+        """
+        Initialize cluster summarizer
+        
+        Args:
+            ollama_client: OllamaClient instance for AI-based summarization
+            max_words: Maximum words in neutral summary
+        """
+        self.ollama_client = ollama_client
+        self.max_words = max_words
+    
+    def generate_neutral_summary(self, articles: List[Dict]) -> Dict:
+        """
+        Generate a neutral summary from multiple articles about the same story
+        
+        Args:
+            articles: List of article dicts with 'title', 'content', 'source'
+            
+        Returns:
+            {
+                'neutral_summary': str,
+                'sources': list,
+                'article_count': int,
+                'success': bool,
+                'error': str or None,
+                'duration': float
+            }
+        """
+        if not articles or len(articles) == 0:
+            return {
+                'neutral_summary': None,
+                'sources': [],
+                'article_count': 0,
+                'success': False,
+                'error': 'No articles provided',
+                'duration': 0
+            }
+        
+        # If only one article, return its summary
+        if len(articles) == 1:
+            return {
+                'neutral_summary': articles[0].get('summary', articles[0].get('content', '')[:500]),
+                'sources': [articles[0].get('source', 'unknown')],
+                'article_count': 1,
+                'success': True,
+                'error': None,
+                'duration': 0
+            }
+        
+        # Build combined context from all articles
+        combined_context = self._build_combined_context(articles)
+        
+        # Generate neutral summary using AI
+        prompt = self._build_neutral_summary_prompt(combined_context, len(articles))
+        
+        result = self.ollama_client.generate(prompt, max_tokens=300)
+        
+        if result['success']:
+            return {
+                'neutral_summary': result['text'],
+                'sources': list(set(a.get('source', 'unknown') for a in articles)),
+                'article_count': len(articles),
+                'success': True,
+                'error': None,
+                'duration': result['duration']
+            }
+        else:
+            return {
+                'neutral_summary': None,
+                'sources': list(set(a.get('source', 'unknown') for a in articles)),
+                'article_count': len(articles),
+                'success': False,
+                'error': result['error'],
+                'duration': result['duration']
+            }
+    
+    def _build_combined_context(self, articles: List[Dict]) -> str:
+        """Build combined context from multiple articles"""
+        context_parts = []
+        
+        for i, article in enumerate(articles, 1):
+            source = article.get('source', 'Unknown')
+            title = article.get('title', 'No title')
+            
+            # Use summary if available, otherwise use first 500 chars of content
+            content = article.get('summary') or article.get('content', '')[:500]
+            
+            context_parts.append(f"Source {i} ({source}):\nTitle: {title}\nContent: {content}")
+        
+        return "\n\n".join(context_parts)
+    
+    def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str:
+        """Build prompt for neutral summary generation"""
+        prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that:
+
+1. Combines information from all sources
+2. Remains neutral and objective
+3. Highlights key facts that all sources agree on
+4. Notes any significant differences in perspective (if any)
+5. Is written in clear, professional English
+6. Is approximately {self.max_words} words
+
+Here are the articles:
+
+{combined_context}
+
+Write a neutral summary in English that synthesizes these perspectives:"""
+        
+        return prompt
+
+
+def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None):
+    """
+    Create or update neutral summaries for article clusters
+    
+    Args:
+        db: MongoDB database instance
+        ollama_client: OllamaClient instance
+        cluster_ids: Optional list of specific cluster IDs to process. If None, processes all clusters.
+        
+    Returns:
+        {
+            'processed': int,
+            'succeeded': int,
+            'failed': int,
+            'errors': list
+        }
+    """
+    summarizer = ClusterSummarizer(ollama_client, max_words=200)
+    
+    # Find clusters to process
+    if cluster_ids:
+        clusters_to_process = cluster_ids
+    else:
+        # Get all cluster IDs with multiple articles
+        pipeline = [
+            {"$match": {"cluster_id": {"$exists": True}}},
+            {"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}},
+            {"$match": {"count": {"$gt": 1}}},
+            {"$project": {"_id": 1}}
+        ]
+        clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)]
+    
+    processed = 0
+    succeeded = 0
+    failed = 0
+    errors = []
+    
+    for cluster_id in clusters_to_process:
+        try:
+            # Get all articles in this cluster
+            articles = list(db.articles.find({"cluster_id": cluster_id}))
+            
+            if len(articles) < 2:
+                continue
+            
+            print(f"Processing cluster {cluster_id}: {len(articles)} articles")
+            
+            # Generate neutral summary
+            result = summarizer.generate_neutral_summary(articles)
+            
+            processed += 1
+            
+            if result['success']:
+                # Save cluster summary
+                db.cluster_summaries.update_one(
+                    {"cluster_id": cluster_id},
+                    {
+                        "$set": {
+                            "cluster_id": cluster_id,
+                            "neutral_summary": result['neutral_summary'],
+                            "sources": result['sources'],
+                            "article_count": result['article_count'],
+                            "created_at": datetime.utcnow(),
+                            "updated_at": datetime.utcnow()
+                        }
+                    },
+                    upsert=True
+                )
+                succeeded += 1
+                print(f"  ✓ Generated neutral summary ({len(result['neutral_summary'])} chars)")
+            else:
+                failed += 1
+                error_msg = f"Cluster {cluster_id}: {result['error']}"
+                errors.append(error_msg)
+                print(f"  ✗ Failed: {result['error']}")
+                
+        except Exception as e:
+            failed += 1
+            error_msg = f"Cluster {cluster_id}: {str(e)}"
+            errors.append(error_msg)
+            print(f"  ✗ Error: {e}")
+    
+    return {
+        'processed': processed,
+        'succeeded': succeeded,
+        'failed': failed,
+        'errors': errors
+    }