"""
Article Clustering Module
Detects and groups similar articles from different sources using Ollama AI
"""
from datetime import datetime, timedelta, timezone
from difflib import SequenceMatcher
from typing import List, Dict, Optional

from ollama_client import OllamaClient


class ArticleClusterer:
    """
    Clusters articles about the same story from different sources.

    Pairs of articles are compared with the Ollama client when it is enabled;
    otherwise (or when the AI call fails) a keyword-overlap similarity score
    is compared against ``similarity_threshold``.
    """

    # Common German stop words ignored during keyword extraction.
    # Built once at class level instead of on every extract_keywords() call.
    _STOP_WORDS = frozenset({
        'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'eines',
        'und', 'oder', 'aber', 'in', 'im', 'am', 'um', 'für', 'von', 'zu', 'nach',
        'bei', 'mit', 'auf', 'an', 'aus', 'über', 'unter', 'gegen', 'durch',
        'ist', 'sind', 'war', 'waren', 'hat', 'haben', 'wird', 'werden', 'wurde', 'wurden',
        'neue', 'neuer', 'neues', 'neuen', 'sich', 'auch', 'nicht', 'nur', 'noch',
        'mehr', 'als', 'wie', 'beim', 'zum', 'zur', 'vom', 'ins', 'ans',
    })

    # Common German suffixes, longest first so 'ungen' wins over 'en' / 'n'.
    _SUFFIXES = ('ungen', 'ung', 'en', 'er', 'e', 'n', 's')

    # A suffix is only stripped when at least this many characters remain.
    _MIN_STEM_LENGTH = 3

    def __init__(self, ollama_client: "OllamaClient", similarity_threshold=0.75, time_window_hours=24):
        """
        Initialize clusterer

        Args:
            ollama_client: OllamaClient instance for AI-based similarity detection
            similarity_threshold: Minimum similarity to consider articles as same story (0-1)
            time_window_hours: Time window to look for similar articles
        """
        self.ollama_client = ollama_client
        self.similarity_threshold = similarity_threshold
        self.time_window_hours = time_window_hours

    @staticmethod
    def _text(article: Dict, key: str) -> str:
        """Return article[key], mapping a missing key or a None/empty value to ''."""
        return article.get(key) or ''

    @staticmethod
    def _jaccard(first: set, second: set) -> float:
        """Return |first & second| / |first | second|, or 0.0 when both sets are empty."""
        union = first | second
        if not union:
            return 0.0
        return len(first & second) / len(union)

    def normalize_title(self, title: str) -> str:
        """
        Normalize title for comparison

        Args:
            title: Article title (None is treated as an empty title)

        Returns:
            Normalized title (lowercase, stripped)
        """
        return (title or '').lower().strip()

    def simple_stem(self, word: str) -> str:
        """
        Simple German word stemming (remove common suffixes)

        Only words longer than 5 characters are stemmed, and a suffix is
        skipped when removing it would leave fewer than 3 characters, so
        e.g. 'übungen' becomes 'übung' rather than 'üb'.

        Args:
            word: Word to stem

        Returns:
            Stemmed word
        """
        if len(word) <= 5:
            return word
        for suffix in self._SUFFIXES:
            remaining = len(word) - len(suffix)
            if remaining >= self._MIN_STEM_LENGTH and word.endswith(suffix):
                return word[:remaining]
        return word

    def extract_keywords(self, text: str) -> set:
        """
        Extract important keywords from text with simple stemming

        Args:
            text: Article title or content (None or '' yields an empty set)

        Returns:
            Set of stemmed keywords
        """
        keywords = set()
        if not text:
            return keywords

        for token in text.lower().strip().split():
            # Drop punctuation but keep inner hyphens of compound words;
            # leading/trailing hyphens carry no meaning and would prevent
            # 'klima-' from matching 'klima'.
            word = ''.join(c for c in token if c.isalnum() or c == '-').strip('-')

            if len(word) > 3 and word not in self._STOP_WORDS:
                keywords.add(self.simple_stem(word))

        return keywords

    def _is_same_story_by_keywords(self, article1: Dict, article2: Dict) -> bool:
        """Keyword-based fallback: similarity score reaches the configured threshold."""
        return self.calculate_similarity(article1, article2) >= self.similarity_threshold

    def check_same_story_with_ai(self, article1: Dict, article2: Dict) -> bool:
        """
        Use Ollama AI to determine if two articles are about the same story

        Falls back to keyword-based similarity when the client is disabled
        or when the AI call raises.

        Args:
            article1: First article
            article2: Second article

        Returns:
            True if same story, False otherwise
        """
        if not self.ollama_client.enabled:
            return self._is_same_story_by_keywords(article1, article2)

        title1 = self._text(article1, 'title')
        title2 = self._text(article2, 'title')
        content1 = self._text(article1, 'content')[:300]  # First 300 chars
        content2 = self._text(article2, 'content')[:300]

        prompt = f"""Compare these two news articles and determine if they are about the SAME story/event.

Article 1:
Title: {title1}
Content: {content1}

Article 2:
Title: {title2}
Content: {content2}

Answer with ONLY "YES" if they are about the same story/event, or "NO" if they are different stories.
Consider them the same story if they report on the same event, even if from different perspectives.

Answer:"""

        try:
            response = self.ollama_client.generate(prompt, max_tokens=10)
            answer = response.get('text', '').strip().upper()
            return 'YES' in answer
        except Exception as e:
            # Deliberately broad: any failure of the AI call or of parsing its
            # response must degrade to the keyword comparison, not abort clustering.
            print(f"   ⚠ AI clustering failed: {e}, using fallback")
            return self._is_same_story_by_keywords(article1, article2)

    def calculate_similarity(self, article1: Dict, article2: Dict) -> float:
        """
        Calculate similarity between two articles using title and content

        Titles are compared by keyword overlap (Jaccard); when either title
        yields no keywords, a character-level ratio of the normalized titles
        is used instead. If both articles have content with keywords, the
        result is 40% title similarity plus 60% content similarity.

        Args:
            article1: First article (dict with 'title' and optionally 'content')
            article2: Second article (dict with 'title' and optionally 'content')

        Returns:
            Similarity score (0-1)
        """
        title1 = self._text(article1, 'title')
        title2 = self._text(article2, 'title')
        content1 = self._text(article1, 'content')
        content2 = self._text(article2, 'content')

        title_keywords1 = self.extract_keywords(title1)
        title_keywords2 = self.extract_keywords(title2)

        if title_keywords1 and title_keywords2:
            title_similarity = self._jaccard(title_keywords1, title_keywords2)
        else:
            # Fallback to string similarity. SequenceMatcher reports 1.0 for
            # two empty strings, so a missing title must not count as a match.
            t1 = self.normalize_title(title1)
            t2 = self.normalize_title(title2)
            title_similarity = SequenceMatcher(None, t1, t2).ratio() if t1 and t2 else 0.0

        # If we have content, use it for better accuracy
        if content1 and content2:
            # Only the first 500 chars of content are considered (for performance)
            content_keywords1 = self.extract_keywords(content1[:500])
            content_keywords2 = self.extract_keywords(content2[:500])

            if content_keywords1 and content_keywords2:
                content_similarity = self._jaccard(content_keywords1, content_keywords2)
                # Weighted average: title (40%) + content (60%)
                return (title_similarity * 0.4) + (content_similarity * 0.6)

        # No usable content: title similarity only
        return title_similarity

    def find_cluster(self, article: Dict, existing_articles: List[Dict]) -> Optional[str]:
        """
        Find if article belongs to an existing cluster using AI

        Existing articles are checked in order and the first match wins.
        Skipped without comparison: entries with the same '_id' as
        ``article``, entries whose datetime 'published_at' is older than the
        time window, and entries that have neither 'cluster_id' nor '_id'.

        Args:
            article: New article to cluster (dict with 'title' and optionally 'content')
            existing_articles: List of existing articles

        Returns:
            cluster_id if found, None otherwise
        """
        cutoff_aware = datetime.now(timezone.utc) - timedelta(hours=self.time_window_hours)
        # Naive timestamps are compared as UTC, matching the previous
        # utcnow()-based cutoff.
        cutoff_naive = cutoff_aware.replace(tzinfo=None)
        article_id = article.get('_id')

        for existing in existing_articles:
            existing_id = existing.get('_id')

            # Never cluster an article with its own stored copy.
            if article_id is not None and existing_id == article_id:
                continue

            # Resolve the id to return first; without one a match is useless,
            # so skip before spending an AI call. Covers 'cluster_id': None.
            target = existing.get('cluster_id')
            if not target:
                if existing_id is None:
                    continue
                target = str(existing_id)

            # Only compare recent articles. Missing or non-datetime values
            # are treated as unknown age and are still compared.
            published_at = existing.get('published_at')
            if isinstance(published_at, datetime):
                cutoff = cutoff_naive if published_at.utcoffset() is None else cutoff_aware
                if published_at < cutoff:
                    continue

            if self.check_same_story_with_ai(article, existing):
                return target

        return None

    def cluster_article(self, article: Dict, existing_articles: List[Dict]) -> Dict:
        """
        Cluster a single article

        The article dict is modified in place and also returned.

        Args:
            article: Article to cluster
            existing_articles: List of existing articles

        Returns:
            Article with cluster_id and is_primary fields
        """
        cluster_id = self.find_cluster(article, existing_articles)

        if cluster_id:
            # Add to existing cluster
            article['cluster_id'] = cluster_id
            article['is_primary'] = False
            return article

        # Create new cluster, keyed by the article's own id when it has one.
        own_id = article.get('_id')
        if own_id is None:
            # Aware "now" so timestamp() is a real epoch value; a naive
            # utcnow() would be interpreted as local time.
            own_id = datetime.now(timezone.utc).timestamp()
        article['cluster_id'] = str(own_id)
        article['is_primary'] = True
        return article

    def get_cluster_articles(self, cluster_id: str, articles_collection) -> List[Dict]:
        """
        Get all articles in a cluster

        Args:
            cluster_id: Cluster ID
            articles_collection: MongoDB collection

        Returns:
            List of articles in the cluster
        """
        return list(articles_collection.find({'cluster_id': cluster_id}))