2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions


@@ -0,0 +1,246 @@
"""
Article Clustering Module
Detects and groups similar articles from different sources using Ollama AI
"""
from difflib import SequenceMatcher
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from ollama_client import OllamaClient
class ArticleClusterer:
"""
Clusters articles about the same story from different sources using Ollama AI
"""
def __init__(self, ollama_client: OllamaClient, similarity_threshold=0.75, time_window_hours=24):
"""
Initialize clusterer
Args:
ollama_client: OllamaClient instance for AI-based similarity detection
similarity_threshold: Minimum similarity to consider articles as same story (0-1)
time_window_hours: Time window to look for similar articles
"""
self.ollama_client = ollama_client
self.similarity_threshold = similarity_threshold
self.time_window_hours = time_window_hours
def normalize_title(self, title: str) -> str:
"""
Normalize title for comparison
Args:
title: Article title
Returns:
Normalized title (lowercase, stripped)
"""
return title.lower().strip()
def simple_stem(self, word: str) -> str:
"""
Simple German word stemming (remove common suffixes)
Args:
word: Word to stem
Returns:
Stemmed word
"""
# Remove common German suffixes
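# Illustrative examples (added for clarity, not in the original):
#   "Regierung" -> "Regier" (strips "ung"), "Wahlen" -> "Wahl" (strips "en")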
suffixes = ['ungen', 'ung', 'en', 'er', 'e', 'n', 's']
for suffix in suffixes:
if len(word) > 5 and word.endswith(suffix):
return word[:-len(suffix)]
return word
def extract_keywords(self, text: str) -> set:
"""
Extract important keywords from text with simple stemming
Args:
text: Article title or content
Returns:
Set of stemmed keywords
"""
# Common German stop words to ignore
stop_words = {
'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'eines',
'und', 'oder', 'aber', 'in', 'im', 'am', 'um', 'für', 'von', 'zu', 'nach',
'bei', 'mit', 'auf', 'an', 'aus', 'über', 'unter', 'gegen', 'durch',
'ist', 'sind', 'war', 'waren', 'hat', 'haben', 'wird', 'werden', 'wurde', 'wurden',
'neue', 'neuer', 'neues', 'neuen', 'sich', 'auch', 'nicht', 'nur', 'noch',
'mehr', 'als', 'wie', 'beim', 'zum', 'zur', 'vom', 'ins', 'ans'
}
# Normalize and split
words = text.lower().strip().split()
# Filter out stop words, short words, and apply stemming
keywords = set()
for word in words:
# Remove punctuation
word = ''.join(c for c in word if c.isalnum() or c == '-')
if len(word) > 3 and word not in stop_words:
# Apply simple stemming
stemmed = self.simple_stem(word)
keywords.add(stemmed)
return keywords
def check_same_story_with_ai(self, article1: Dict, article2: Dict) -> bool:
"""
Use Ollama AI to determine if two articles are about the same story
Args:
article1: First article
article2: Second article
Returns:
True if same story, False otherwise
"""
if not self.ollama_client.enabled:
# Fallback to keyword-based similarity
return self.calculate_similarity(article1, article2) >= self.similarity_threshold
title1 = article1.get('title', '')
title2 = article2.get('title', '')
content1 = article1.get('content', '')[:300] # First 300 chars
content2 = article2.get('content', '')[:300]
prompt = f"""Compare these two news articles and determine if they are about the SAME story/event.
Article 1:
Title: {title1}
Content: {content1}
Article 2:
Title: {title2}
Content: {content2}
Answer with ONLY "YES" if they are about the same story/event, or "NO" if they are different stories.
Consider them the same story if they report on the same event, even if from different perspectives.
Answer:"""
try:
response = self.ollama_client.generate(prompt, max_tokens=10)
answer = response.get('text', '').strip().upper()
return 'YES' in answer
except Exception as e:
print(f" ⚠ AI clustering failed: {e}, using fallback")
# Fallback to keyword-based similarity
return self.calculate_similarity(article1, article2) >= self.similarity_threshold
def calculate_similarity(self, article1: Dict, article2: Dict) -> float:
"""
Calculate similarity between two articles using title and content
Args:
article1: First article (dict with 'title' and optionally 'content')
article2: Second article (dict with 'title' and optionally 'content')
Returns:
Similarity score (0-1)
"""
title1 = article1.get('title', '')
title2 = article2.get('title', '')
content1 = article1.get('content', '')
content2 = article2.get('content', '')
# Extract keywords from titles
title_keywords1 = self.extract_keywords(title1)
title_keywords2 = self.extract_keywords(title2)
# Calculate title similarity
if title_keywords1 and title_keywords2:
title_intersection = title_keywords1.intersection(title_keywords2)
title_union = title_keywords1.union(title_keywords2)
title_similarity = len(title_intersection) / len(title_union) if title_union else 0
else:
# Fallback to string similarity
t1 = self.normalize_title(title1)
t2 = self.normalize_title(title2)
title_similarity = SequenceMatcher(None, t1, t2).ratio()
# If we have content, use it for better accuracy
if content1 and content2:
# Extract keywords from first 500 chars of content (for performance)
content_keywords1 = self.extract_keywords(content1[:500])
content_keywords2 = self.extract_keywords(content2[:500])
if content_keywords1 and content_keywords2:
content_intersection = content_keywords1.intersection(content_keywords2)
content_union = content_keywords1.union(content_keywords2)
content_similarity = len(content_intersection) / len(content_union) if content_union else 0
# Weighted average: title (40%) + content (60%)
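# e.g. title overlap 0.50 and content overlap 0.33 blend to 0.4*0.50 + 0.6*0.33 ≈ 0.40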
return (title_similarity * 0.4) + (content_similarity * 0.6)
# If no content, use only title similarity
return title_similarity
def find_cluster(self, article: Dict, existing_articles: List[Dict]) -> Optional[str]:
"""
Find if article belongs to an existing cluster using AI
Args:
article: New article to cluster (dict with 'title' and optionally 'content')
existing_articles: List of existing articles
Returns:
cluster_id if found, None otherwise
"""
cutoff_time = datetime.utcnow() - timedelta(hours=self.time_window_hours)
for existing in existing_articles:
# Only compare recent articles
published_at = existing.get('published_at')
if published_at and published_at < cutoff_time:
continue
# Use AI to check if same story
if self.check_same_story_with_ai(article, existing):
return existing.get('cluster_id', str(existing.get('_id')))
return None
def cluster_article(self, article: Dict, existing_articles: List[Dict]) -> Dict:
"""
Cluster a single article
Args:
article: Article to cluster
existing_articles: List of existing articles
Returns:
Article with cluster_id and is_primary fields
"""
cluster_id = self.find_cluster(article, existing_articles)
if cluster_id:
# Add to existing cluster
article['cluster_id'] = cluster_id
article['is_primary'] = False
else:
# Create new cluster
article['cluster_id'] = str(article.get('_id', datetime.utcnow().timestamp()))
article['is_primary'] = True
return article
def get_cluster_articles(self, cluster_id: str, articles_collection) -> List[Dict]:
"""
Get all articles in a cluster
Args:
cluster_id: Cluster ID
articles_collection: MongoDB collection
Returns:
List of articles in the cluster
"""
return list(articles_collection.find({'cluster_id': cluster_id}))
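A minimal usage sketch for the clusterer (editorial addition, not part of the committed file). It passes a stand-in client with enabled = False so the keyword fallback in check_same_story_with_ai is exercised; a real OllamaClient would take the AI prompt path instead. The article dicts and the low threshold are illustrative assumptions.

from article_clustering import ArticleClusterer

class _OfflineClient:
    """Hypothetical stand-in; only the .enabled flag is read on the fallback path."""
    enabled = False

clusterer = ArticleClusterer(_OfflineClient(), similarity_threshold=0.3)  # low threshold for the demo

a1 = {'_id': 'a1', 'title': 'Regierung beschließt neues Klimapaket',
      'content': 'Die Bundesregierung hat ein neues Klimapaket beschlossen.'}
a2 = {'title': 'Neues Klimapaket der Regierung beschlossen',
      'content': 'Berlin: Das Klimapaket der Regierung wurde heute beschlossen.'}

a1 = clusterer.cluster_article(a1, [])    # no match -> new cluster, is_primary=True
a2 = clusterer.cluster_article(a2, [a1])  # keyword overlap -> joins a1's cluster, is_primary=False
print(a1['cluster_id'], a2['cluster_id'], a2['is_primary'])  # a1 a1 False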


@@ -0,0 +1,213 @@
"""
Cluster Summarizer Module
Generates neutral summaries from multiple clustered articles
"""
from typing import List, Dict, Optional
from datetime import datetime
from ollama_client import OllamaClient
class ClusterSummarizer:
"""
Generates neutral summaries by synthesizing multiple articles about the same story
"""
def __init__(self, ollama_client: OllamaClient, max_words=200):
"""
Initialize cluster summarizer
Args:
ollama_client: OllamaClient instance for AI-based summarization
max_words: Maximum words in neutral summary
"""
self.ollama_client = ollama_client
self.max_words = max_words
def generate_neutral_summary(self, articles: List[Dict]) -> Dict:
"""
Generate a neutral summary from multiple articles about the same story
Args:
articles: List of article dicts with 'title', 'content', 'source'
Returns:
{
'neutral_summary': str,
'sources': list,
'article_count': int,
'success': bool,
'error': str or None,
'duration': float
}
"""
if not articles:
return {
'neutral_summary': None,
'sources': [],
'article_count': 0,
'success': False,
'error': 'No articles provided',
'duration': 0
}
# If only one article, return its summary
if len(articles) == 1:
return {
'neutral_summary': articles[0].get('summary', articles[0].get('content', '')[:500]),
'sources': [articles[0].get('source', 'unknown')],
'article_count': 1,
'success': True,
'error': None,
'duration': 0
}
# Build combined context from all articles
combined_context = self._build_combined_context(articles)
# Generate neutral summary using AI
prompt = self._build_neutral_summary_prompt(combined_context, len(articles))
result = self.ollama_client.generate(prompt, max_tokens=300)
if result['success']:
return {
'neutral_summary': result['text'],
'sources': list(set(a.get('source', 'unknown') for a in articles)),
'article_count': len(articles),
'success': True,
'error': None,
'duration': result['duration']
}
else:
return {
'neutral_summary': None,
'sources': list(set(a.get('source', 'unknown') for a in articles)),
'article_count': len(articles),
'success': False,
'error': result['error'],
'duration': result['duration']
}
def _build_combined_context(self, articles: List[Dict]) -> str:
"""Build combined context from multiple articles"""
context_parts = []
for i, article in enumerate(articles, 1):
source = article.get('source', 'Unknown')
title = article.get('title', 'No title')
# Use summary if available, otherwise use first 500 chars of content
content = article.get('summary') or article.get('content', '')[:500]
context_parts.append(f"Source {i} ({source}):\nTitle: {title}\nContent: {content}")
return "\n\n".join(context_parts)
def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str:
"""Build prompt for neutral summary generation"""
prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that:
1. Combines information from all sources
2. Remains neutral and objective
3. Highlights key facts that all sources agree on
4. Notes any significant differences in perspective (if any)
5. Is written in clear, professional English
6. Is approximately {self.max_words} words
Here are the articles:
{combined_context}
Write a neutral summary in English that synthesizes these perspectives:"""
return prompt
def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None):
"""
Create or update neutral summaries for article clusters
Args:
db: MongoDB database instance
ollama_client: OllamaClient instance
cluster_ids: Optional list of specific cluster IDs to process. If None, processes all clusters.
Returns:
{
'processed': int,
'succeeded': int,
'failed': int,
'errors': list
}
"""
summarizer = ClusterSummarizer(ollama_client, max_words=200)
# Find clusters to process
if cluster_ids:
clusters_to_process = cluster_ids
else:
# Get all cluster IDs with multiple articles
pipeline = [
{"$match": {"cluster_id": {"$exists": True}}},
{"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}},
{"$match": {"count": {"$gt": 1}}},
{"$project": {"_id": 1}}
]
clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)]
processed = 0
succeeded = 0
failed = 0
errors = []
for cluster_id in clusters_to_process:
try:
# Get all articles in this cluster
articles = list(db.articles.find({"cluster_id": cluster_id}))
if len(articles) < 2:
continue
print(f"Processing cluster {cluster_id}: {len(articles)} articles")
# Generate neutral summary
result = summarizer.generate_neutral_summary(articles)
processed += 1
if result['success']:
# Save cluster summary
db.cluster_summaries.update_one(
{"cluster_id": cluster_id},
{
"$set": {
"cluster_id": cluster_id,
"neutral_summary": result['neutral_summary'],
"sources": result['sources'],
"article_count": result['article_count'],
"created_at": datetime.utcnow(),
"updated_at": datetime.utcnow()
}
},
upsert=True
)
succeeded += 1
print(f" ✓ Generated neutral summary ({len(result['neutral_summary'])} chars)")
else:
failed += 1
error_msg = f"Cluster {cluster_id}: {result['error']}"
errors.append(error_msg)
print(f" ✗ Failed: {result['error']}")
except Exception as e:
failed += 1
error_msg = f"Cluster {cluster_id}: {str(e)}"
errors.append(error_msg)
print(f" ✗ Error: {e}")
return {
'processed': processed,
'succeeded': succeeded,
'failed': failed,
'errors': errors
}
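A small usage sketch (editorial addition, not part of the committed file). With a single article, generate_neutral_summary short-circuits and never calls Ollama, so a stand-in client suffices for the illustration; with two or more articles the combined prompt is sent through ollama_client.generate().

from cluster_summarizer import ClusterSummarizer

class _OfflineClient:
    """Hypothetical stand-in; real runs pass an OllamaClient instance."""
    enabled = False

summarizer = ClusterSummarizer(_OfflineClient(), max_words=200)
result = summarizer.generate_neutral_summary([
    {'title': 'Klimapaket beschlossen', 'source': 'tagesschau',
     'summary': 'Die Regierung hat ein neues Klimapaket beschlossen.'}
])
print(result['success'], result['article_count'], result['sources'])
# -> True 1 ['tagesschau']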


@@ -13,6 +13,8 @@ from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
from config import Config
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import create_cluster_summaries
# Load environment variables
load_dotenv(dotenv_path='../.env')
@@ -33,6 +35,9 @@ ollama_client = OllamaClient(
timeout=Config.OLLAMA_TIMEOUT
)
# Article Clusterer placeholder (set up later, once ollama_client is ready)
article_clusterer = None
# Print configuration on startup
if __name__ != '__main__':
Config.print_config()
@@ -44,6 +49,14 @@ if __name__ != '__main__':
print("⚠ Warning: Ollama server is not reachable")
else:
print(" Ollama AI summarization: DISABLED")
# Initialize Article Clusterer with ollama_client
article_clusterer = ArticleClusterer(
ollama_client=ollama_client,
similarity_threshold=0.60,  # Fallback threshold, used when the AI check is disabled or fails
time_window_hours=24 # Look back 24 hours
)
print("🔗 Article clustering: ENABLED (AI-powered)")
def get_active_rss_feeds():
@@ -394,6 +407,13 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
'created_at': datetime.utcnow()
}
# Cluster article with existing articles (detect duplicates from other sources)
from datetime import timedelta
recent_articles = list(articles_collection.find({
'published_at': {'$gte': datetime.utcnow() - timedelta(hours=24)}
}))
article_doc = article_clusterer.cluster_article(article_doc, recent_articles)
try:
# Upsert: update if exists, insert if not
articles_collection.update_one(
@@ -434,6 +454,16 @@ def crawl_all_feeds(max_articles_per_feed=10):
Crawl all active RSS feeds
Returns: dict with statistics
"""
global article_clusterer
# Initialize clusterer if not already done
if article_clusterer is None:
article_clusterer = ArticleClusterer(
ollama_client=ollama_client,
similarity_threshold=0.60,
time_window_hours=24
)
print("\n" + "="*60)
print("🚀 Starting RSS Feed Crawler")
print("="*60)
@@ -485,12 +515,29 @@ def crawl_all_feeds(max_articles_per_feed=10):
print(f" Average time per article: {duration/total_crawled:.1f}s")
print("="*60 + "\n")
# Generate neutral summaries for clustered articles
cluster_summary_stats = {'processed': 0, 'succeeded': 0, 'failed': 0}
if Config.OLLAMA_ENABLED and total_crawled > 0:
print("\n" + "="*60)
print("🔄 Generating Neutral Summaries for Clustered Articles")
print("="*60)
cluster_summary_stats = create_cluster_summaries(db, ollama_client)
print("\n" + "="*60)
print(f"✓ Cluster Summarization Complete!")
print(f" Clusters processed: {cluster_summary_stats['processed']}")
print(f" Succeeded: {cluster_summary_stats['succeeded']}")
print(f" Failed: {cluster_summary_stats['failed']}")
print("="*60 + "\n")
return {
'total_feeds': len(feeds),
'total_articles_crawled': total_crawled,
'total_summarized': total_summarized,
'failed_summaries': total_failed,
'duration_seconds': round(duration, 2),
'cluster_summaries': cluster_summary_stats
}
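The neutral summaries generated above are upserted into db.cluster_summaries, keyed by cluster_id. A read-back sketch (editorial addition; the connection URI and database name are assumptions, while the collection and field names come from create_cluster_summaries):

from pymongo import MongoClient

db = MongoClient('mongodb://localhost:27017')['newsdb']  # URI and db name are assumptions
for cs in db.cluster_summaries.find().sort('updated_at', -1).limit(5):
    members = db.articles.find({'cluster_id': cs['cluster_id']}, {'title': 1, 'source': 1})
    print(f"{cs['article_count']} articles from {', '.join(cs['sources'])}")
    print(f"  {cs['neutral_summary'][:120]}...")
    for a in members:
        print(f"  - [{a.get('source', '?')}] {a.get('title', '')}")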


@@ -391,6 +391,80 @@ English Summary (max {max_words} words):"""
'current_model': self.model,
'error': str(e)
}
def generate(self, prompt, max_tokens=100):
"""
Generate text using Ollama
Args:
prompt: Text prompt
max_tokens: Maximum tokens to generate
Returns:
{
'text': str, # Generated text
'success': bool, # Whether generation succeeded
'error': str or None, # Error message if failed
'duration': float # Time taken in seconds
}
"""
if not self.enabled:
return {
'text': '',
'success': False,
'error': 'Ollama is disabled',
'duration': 0
}
start_time = time.time()
try:
response = requests.post(
f"{self.base_url}/api/generate",
json={
"model": self.model,
"prompt": prompt,
"stream": False,
"options": {
"num_predict": max_tokens,
"temperature": 0.1 # Low temperature for consistent answers
}
},
timeout=self.timeout
)
duration = time.time() - start_time
if response.status_code == 200:
result = response.json()
return {
'text': result.get('response', '').strip(),
'success': True,
'error': None,
'duration': duration
}
else:
return {
'text': '',
'success': False,
'error': f"HTTP {response.status_code}: {response.text}",
'duration': duration
}
except requests.exceptions.Timeout:
return {
'text': '',
'success': False,
'error': f"Request timed out after {self.timeout}s",
'duration': time.time() - start_time
}
except Exception as e:
return {
'text': '',
'success': False,
'error': str(e),
'duration': time.time() - start_time
}
if __name__ == '__main__':
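A usage sketch for the new generate() helper (editorial addition). The constructor arguments shown are assumptions inferred from the attributes the method reads (base_url, model, timeout); check the real __init__ signature before copying this.

client = OllamaClient(base_url='http://localhost:11434', model='llama3.2', timeout=60)  # assumed kwargs

result = client.generate("Answer with ONLY YES or NO: are these two headlines about the same event?", max_tokens=10)
if result['success']:
    print(f"Model answered: {result['text']} ({result['duration']:.1f}s)")
else:
    print(f"Generation failed: {result['error']}")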