# Munich-news/news_crawler/cluster_summarizer.py
"""
Cluster Summarizer Module
Generates neutral summaries from multiple clustered articles
"""
from typing import List, Dict, Optional
from datetime import datetime, timezone

from ollama_client import OllamaClient


class ClusterSummarizer:
"""
Generates neutral summaries by synthesizing multiple articles about the same story
"""
    def __init__(self, ollama_client: OllamaClient, max_words: int = 200):
        """
        Initialize cluster summarizer
        Args:
            ollama_client: OllamaClient instance for AI-based summarization
            max_words: Target word count for the neutral summary (a soft limit,
                enforced only through the prompt)
        """
self.ollama_client = ollama_client
self.max_words = max_words

    def generate_neutral_summary(self, articles: List[Dict]) -> Dict:
"""
Generate a neutral summary from multiple articles about the same story
Args:
articles: List of article dicts with 'title', 'content', 'source'
Returns:
{
'neutral_summary': str,
'sources': list,
'article_count': int,
'success': bool,
'error': str or None,
'duration': float
}
"""
        if not articles:
return {
'neutral_summary': None,
'sources': [],
'article_count': 0,
'success': False,
'error': 'No articles provided',
'duration': 0
}
# If only one article, return its summary
if len(articles) == 1:
return {
                'neutral_summary': articles[0].get('summary') or articles[0].get('content', '')[:500],
'sources': [articles[0].get('source', 'unknown')],
'article_count': 1,
'success': True,
'error': None,
'duration': 0
}
# Build combined context from all articles
combined_context = self._build_combined_context(articles)
# Generate neutral summary using AI
prompt = self._build_neutral_summary_prompt(combined_context, len(articles))
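        # Note: max_tokens=300 is a tight budget for a ~200-word summary
        # (English text typically runs over 1 token per word), so raise it
        # if summaries come back truncated.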
result = self.ollama_client.generate(prompt, max_tokens=300)
        sources = sorted(set(a.get('source', 'unknown') for a in articles))
        if result['success']:
            return {
                'neutral_summary': result['text'],
                'sources': sources,
                'article_count': len(articles),
                'success': True,
                'error': None,
                'duration': result['duration']
            }
        return {
            'neutral_summary': None,
            'sources': sources,
            'article_count': len(articles),
            'success': False,
            'error': result['error'],
            'duration': result['duration']
        }

def _build_combined_context(self, articles: List[Dict]) -> str:
"""Build combined context from multiple articles"""
context_parts = []
for i, article in enumerate(articles, 1):
source = article.get('source', 'Unknown')
title = article.get('title', 'No title')
# Use summary if available, otherwise use first 500 chars of content
content = article.get('summary') or article.get('content', '')[:500]
context_parts.append(f"Source {i} ({source}):\nTitle: {title}\nContent: {content}")
return "\n\n".join(context_parts)

    def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str:
"""Build prompt for neutral summary generation"""
prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that:
1. Combines information from all sources
2. Remains neutral and objective
3. Highlights key facts that all sources agree on
4. Notes any significant differences in perspective (if any)
5. Is written in clear, professional English
6. Is approximately {self.max_words} words
Here are the articles:
{combined_context}
Write a neutral summary in English that synthesizes these perspectives:"""
return prompt
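

# Example (sketch): using ClusterSummarizer directly on one cluster's articles,
# assuming `ollama` is a configured OllamaClient and `articles` is a list of
# dicts with 'title', 'content', and 'source' keys (both names are placeholders):
#
#     summarizer = ClusterSummarizer(ollama, max_words=200)
#     result = summarizer.generate_neutral_summary(articles)
#     if result['success']:
#         print(result['neutral_summary'])
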
def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None):
"""
Create or update neutral summaries for article clusters
Args:
db: MongoDB database instance
ollama_client: OllamaClient instance
cluster_ids: Optional list of specific cluster IDs to process. If None, processes all clusters.
Returns:
{
'processed': int,
'succeeded': int,
'failed': int,
'errors': list
}
"""
summarizer = ClusterSummarizer(ollama_client, max_words=200)
# Find clusters to process
if cluster_ids:
clusters_to_process = cluster_ids
else:
# Get all cluster IDs with multiple articles
pipeline = [
{"$match": {"cluster_id": {"$exists": True}}},
{"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}},
{"$match": {"count": {"$gt": 1}}},
{"$project": {"_id": 1}}
]
clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)]
processed = 0
succeeded = 0
failed = 0
errors = []
for cluster_id in clusters_to_process:
try:
# Get all articles in this cluster
articles = list(db.articles.find({"cluster_id": cluster_id}))
if len(articles) < 2:
continue
print(f"Processing cluster {cluster_id}: {len(articles)} articles")
# Generate neutral summary
result = summarizer.generate_neutral_summary(articles)
processed += 1
if result['success']:
# Save cluster summary
                db.cluster_summaries.update_one(
                    {"cluster_id": cluster_id},
                    {
                        "$set": {
                            "cluster_id": cluster_id,
                            "neutral_summary": result['neutral_summary'],
                            "sources": result['sources'],
                            "article_count": result['article_count'],
                            "updated_at": datetime.now(timezone.utc)
                        },
                        # Only stamp created_at when the document is first inserted
                        "$setOnInsert": {"created_at": datetime.now(timezone.utc)}
                    },
                    upsert=True
                )
succeeded += 1
print(f" ✓ Generated neutral summary ({len(result['neutral_summary'])} chars)")
else:
failed += 1
error_msg = f"Cluster {cluster_id}: {result['error']}"
errors.append(error_msg)
print(f" ✗ Failed: {result['error']}")
except Exception as e:
failed += 1
error_msg = f"Cluster {cluster_id}: {str(e)}"
errors.append(error_msg)
print(f" ✗ Error: {e}")
return {
'processed': processed,
'succeeded': succeeded,
'failed': failed,
'errors': errors
}
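

# A minimal smoke test (sketch): assumes MongoDB is reachable at localhost:27017,
# that the database is named "news" (a placeholder), and that OllamaClient can be
# constructed without arguments; adjust all three to the actual deployment.
if __name__ == "__main__":
    from pymongo import MongoClient

    mongo = MongoClient("mongodb://localhost:27017")
    stats = create_cluster_summaries(mongo["news"], OllamaClient())
    print(f"Processed {stats['processed']} clusters: "
          f"{stats['succeeded']} succeeded, {stats['failed']} failed")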