update
news_crawler/article_clustering.py (new file, 246 lines)
@@ -0,0 +1,246 @@
"""
Article Clustering Module
Detects and groups similar articles from different sources using Ollama AI
"""
from difflib import SequenceMatcher
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from ollama_client import OllamaClient


class ArticleClusterer:
    """
    Clusters articles about the same story from different sources using Ollama AI
    """

    def __init__(self, ollama_client: OllamaClient, similarity_threshold=0.75, time_window_hours=24):
        """
        Initialize clusterer

        Args:
            ollama_client: OllamaClient instance for AI-based similarity detection
            similarity_threshold: Minimum similarity (0-1) to consider two articles the same story
            time_window_hours: Time window in which to look for similar articles
        """
        self.ollama_client = ollama_client
        self.similarity_threshold = similarity_threshold
        self.time_window_hours = time_window_hours

    def normalize_title(self, title: str) -> str:
        """
        Normalize title for comparison

        Args:
            title: Article title

        Returns:
            Normalized title (lowercase, stripped)
        """
        return title.lower().strip()

    def simple_stem(self, word: str) -> str:
        """
        Simple German word stemming (remove common suffixes)

        Args:
            word: Word to stem

        Returns:
            Stemmed word
        """
        # Remove common German suffixes (longest first, so 'ungen' wins over 'ung')
        suffixes = ['ungen', 'ung', 'en', 'er', 'e', 'n', 's']
        for suffix in suffixes:
            if len(word) > 5 and word.endswith(suffix):
                return word[:-len(suffix)]
        return word

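    # Illustrative stemming examples, derived from the suffix list above:
    #   simple_stem('regierung') -> 'regier'  (strips 'ung')
    #   simple_stem('zeitungen') -> 'zeit'    (strips 'ungen')
    #   simple_stem('wahlen')    -> 'wahl'    (strips 'en')
    #   simple_stem('haus')      -> 'haus'    (len <= 5, left unchanged)
    # Only the first matching suffix is removed, so the stemming is
    # deliberately shallow.
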
    def extract_keywords(self, text: str) -> set:
        """
        Extract important keywords from text with simple stemming

        Args:
            text: Article title or content

        Returns:
            Set of stemmed keywords
        """
        # Common German stop words to ignore
        stop_words = {
            'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'eines',
            'und', 'oder', 'aber', 'in', 'im', 'am', 'um', 'für', 'von', 'zu', 'nach',
            'bei', 'mit', 'auf', 'an', 'aus', 'über', 'unter', 'gegen', 'durch',
            'ist', 'sind', 'war', 'waren', 'hat', 'haben', 'wird', 'werden', 'wurde', 'wurden',
            'neue', 'neuer', 'neues', 'neuen', 'sich', 'auch', 'nicht', 'nur', 'noch',
            'mehr', 'als', 'wie', 'beim', 'zum', 'zur', 'vom', 'ins', 'ans'
        }

        # Normalize and split
        words = text.lower().strip().split()

        # Filter out stop words and short words, then apply stemming
        keywords = set()
        for word in words:
            # Remove punctuation (keep hyphens)
            word = ''.join(c for c in word if c.isalnum() or c == '-')

            if len(word) > 3 and word not in stop_words:
                # Apply simple stemming
                stemmed = self.simple_stem(word)
                keywords.add(stemmed)

        return keywords

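    # Worked example (illustrative): for the text
    # 'Die Regierung plant neue Steuern.' the pipeline lowercases, strips
    # punctuation, drops stop words ('neue') and short words ('die'), then
    # stems the rest, yielding {'regier', 'plant', 'steuer'}.
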
    def check_same_story_with_ai(self, article1: Dict, article2: Dict) -> bool:
        """
        Use Ollama AI to determine if two articles are about the same story

        Args:
            article1: First article
            article2: Second article

        Returns:
            True if same story, False otherwise
        """
        if not self.ollama_client.enabled:
            # Fall back to keyword-based similarity
            return self.calculate_similarity(article1, article2) >= self.similarity_threshold

        title1 = article1.get('title', '')
        title2 = article2.get('title', '')
        content1 = article1.get('content', '')[:300]  # First 300 chars
        content2 = article2.get('content', '')[:300]

        prompt = f"""Compare these two news articles and determine if they are about the SAME story/event.

Article 1:
Title: {title1}
Content: {content1}

Article 2:
Title: {title2}
Content: {content2}

Answer with ONLY "YES" if they are about the same story/event, or "NO" if they are different stories.
Consider them the same story if they report on the same event, even if from different perspectives.

Answer:"""

        try:
            response = self.ollama_client.generate(prompt, max_tokens=10)
            answer = response.get('text', '').strip().upper()
            return 'YES' in answer
        except Exception as e:
            print(f" ⚠ AI clustering failed: {e}, using fallback")
            # Fall back to keyword-based similarity
            return self.calculate_similarity(article1, article2) >= self.similarity_threshold

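    # Note on the answer parsing above: the substring check is deliberately
    # lenient, so model replies like 'Yes.' or 'YES, same event' all count as
    # a match ('YES' in 'YES.' -> True), while any other reply, including an
    # empty response, falls through to False.
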
    def calculate_similarity(self, article1: Dict, article2: Dict) -> float:
        """
        Calculate similarity between two articles using title and content

        Args:
            article1: First article (dict with 'title' and optionally 'content')
            article2: Second article (dict with 'title' and optionally 'content')

        Returns:
            Similarity score (0-1)
        """
        title1 = article1.get('title', '')
        title2 = article2.get('title', '')
        content1 = article1.get('content', '')
        content2 = article2.get('content', '')

        # Extract keywords from titles
        title_keywords1 = self.extract_keywords(title1)
        title_keywords2 = self.extract_keywords(title2)

        # Calculate title similarity (Jaccard index over keyword sets)
        if title_keywords1 and title_keywords2:
            title_intersection = title_keywords1.intersection(title_keywords2)
            title_union = title_keywords1.union(title_keywords2)
            title_similarity = len(title_intersection) / len(title_union) if title_union else 0
        else:
            # Fall back to string similarity
            t1 = self.normalize_title(title1)
            t2 = self.normalize_title(title2)
            title_similarity = SequenceMatcher(None, t1, t2).ratio()

        # If we have content, use it for better accuracy
        if content1 and content2:
            # Extract keywords from the first 500 chars of content (for performance)
            content_keywords1 = self.extract_keywords(content1[:500])
            content_keywords2 = self.extract_keywords(content2[:500])

            if content_keywords1 and content_keywords2:
                content_intersection = content_keywords1.intersection(content_keywords2)
                content_union = content_keywords1.union(content_keywords2)
                content_similarity = len(content_intersection) / len(content_union) if content_union else 0

                # Weighted average: title (40%) + content (60%)
                return (title_similarity * 0.4) + (content_similarity * 0.6)

        # If no content, use only title similarity
        return title_similarity

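    # Worked example (illustrative): the titles 'Regierung plant neue Steuern'
    # and 'Regierung beschließt Steuern' reduce to the keyword sets
    # {'regier', 'plant', 'steuer'} and {'regier', 'beschließt', 'steuer'}.
    # Jaccard similarity = |intersection| / |union| = 2 / 4 = 0.5. With a
    # content similarity of, say, 0.6, the combined score would be
    # 0.5 * 0.4 + 0.6 * 0.6 = 0.56.
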
    def find_cluster(self, article: Dict, existing_articles: List[Dict]) -> Optional[str]:
        """
        Find whether an article belongs to an existing cluster, using AI

        Args:
            article: New article to cluster (dict with 'title' and optionally 'content')
            existing_articles: List of existing articles

        Returns:
            cluster_id if found, None otherwise
        """
        cutoff_time = datetime.utcnow() - timedelta(hours=self.time_window_hours)

        for existing in existing_articles:
            # Only compare against recent articles
            published_at = existing.get('published_at')
            if published_at and published_at < cutoff_time:
                continue

            # Use AI to check whether it is the same story
            if self.check_same_story_with_ai(article, existing):
                return existing.get('cluster_id', str(existing.get('_id')))

        return None

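    # Note: published_at is compared directly against datetime.utcnow(), so
    # stored timestamps are expected to be naive UTC datetimes; a timezone-
    # aware value would raise a TypeError on comparison.
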
    def cluster_article(self, article: Dict, existing_articles: List[Dict]) -> Dict:
        """
        Cluster a single article

        Args:
            article: Article to cluster
            existing_articles: List of existing articles

        Returns:
            Article with cluster_id and is_primary fields
        """
        cluster_id = self.find_cluster(article, existing_articles)

        if cluster_id:
            # Add to existing cluster
            article['cluster_id'] = cluster_id
            article['is_primary'] = False
        else:
            # Create new cluster
            article['cluster_id'] = str(article.get('_id', datetime.utcnow().timestamp()))
            article['is_primary'] = True

        return article

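    # Example outcome (illustrative): a new article matching an existing one
    # inherits that article's cluster_id and gets is_primary=False; an
    # unmatched article with _id 'a2' starts cluster 'a2' with
    # is_primary=True. If it has no _id yet, the current UTC timestamp
    # string is used as the cluster id.
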
    def get_cluster_articles(self, cluster_id: str, articles_collection) -> List[Dict]:
        """
        Get all articles in a cluster

        Args:
            cluster_id: Cluster ID
            articles_collection: MongoDB collection

        Returns:
            List of articles in the cluster
        """
        return list(articles_collection.find({'cluster_id': cluster_id}))

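
# A minimal usage sketch, assuming OllamaClient() can be constructed with
# defaults (its real signature lives in ollama_client.py) and that articles
# are dicts with 'title', 'content', and 'published_at' keys, as the methods
# above expect. With a running Ollama instance the comparison is AI-based;
# otherwise it falls back to keyword similarity.
if __name__ == '__main__':
    clusterer = ArticleClusterer(OllamaClient(), similarity_threshold=0.75)

    existing = [{
        '_id': 'a1',
        'cluster_id': 'a1',
        'title': 'Regierung plant neue Steuern',
        'content': 'Die Regierung plant neue Steuern für das kommende Jahr.',
        'published_at': datetime.utcnow(),
    }]
    new_article = {
        '_id': 'a2',
        'title': 'Regierung beschließt Steuern',
        'content': 'Die Regierung beschließt neue Steuern.',
        'published_at': datetime.utcnow(),
    }

    clustered = clusterer.cluster_article(new_article, existing)
    print(clustered['cluster_id'], clustered['is_primary'])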