# Munich-news/news_crawler/ollama_client.py

"""
Ollama client for AI-powered article summarization
"""
import requests
import time
from datetime import datetime


class OllamaClient:
    """Client for communicating with Ollama server for text summarization"""

    def __init__(self, base_url, model, api_key=None, enabled=True, timeout=30):
        """
        Store the connection settings for later requests

        Args:
            base_url: Ollama server URL (e.g., http://localhost:11434);
                trailing slashes are stripped
            model: Model name to use (e.g., phi3:latest)
            api_key: Optional API key for authentication
            enabled: Whether Ollama is enabled
            timeout: Request timeout in seconds (default 30)
        """
        # Normalise the URL so endpoint paths can be appended as "/api/...".
        normalized_url = base_url.rstrip('/')

        self.base_url = normalized_url
        self.model = model

        # Optional settings
        self.api_key = api_key
        self.enabled = enabled
        self.timeout = timeout

    def summarize_article(self, content, max_words=150):
        """
        Summarize article content using Ollama

        Never raises for request/parse failures; every outcome is reported
        through the returned dict.

        Args:
            content: Full article text
            max_words: Maximum words in summary (default 150)

        Returns:
            {
                'summary': str or None,      # AI-generated summary (markdown removed)
                'summary_word_count': int,   # Summary word count (0 on failure)
                'original_word_count': int,  # Original article word count
                'success': bool,             # Whether summarization succeeded
                'error': str or None,        # Error message if failed
                'duration': float            # Time taken in seconds (0 if no request was made)
            }
        """
        def outcome(error, original_words=0, started=None, summary=None):
            # Single place that shapes the result dict; `started` is a
            # time.monotonic() reading, or None when no request was attempted.
            return {
                'summary': summary,
                'summary_word_count': len(summary.split()) if summary else 0,
                'original_word_count': original_words,
                'success': error is None,
                'error': error,
                'duration': 0 if started is None else time.monotonic() - started
            }

        if not self.enabled:
            return outcome('Ollama is not enabled')

        if not content or not content.strip():
            return outcome('Content is empty')

        original_word_count = len(content.split())

        # Monotonic clock: elapsed time must not be affected by wall-clock adjustments.
        start_time = time.monotonic()

        try:
            prompt = self._build_summarization_prompt(content, max_words)

            url = f"{self.base_url}/api/generate"
            headers = {'Content-Type': 'application/json'}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'

            payload = {
                'model': self.model,
                'prompt': prompt,
                'stream': False,
                'options': {
                    'temperature': 0.7,
                    'num_predict': 250  # Limit response length
                }
            }

            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=self.timeout
            )
            response.raise_for_status()

            result = response.json()
            # `or ''` turns a JSON null in 'response' into an empty summary
            # instead of an AttributeError reported as "Unexpected error".
            raw_summary = (result.get('response') or '').strip()

            # Clean before the emptiness check so a reply consisting only of
            # formatting cannot be reported as a successful empty summary.
            summary = self._clean_markdown(raw_summary) if raw_summary else ''

            if not summary:
                return outcome('Ollama returned empty summary', original_word_count, start_time)

            return outcome(None, original_word_count, start_time, summary)

        except requests.exceptions.Timeout:
            return outcome(
                f'Request timed out after {self.timeout} seconds',
                original_word_count, start_time
            )
        except requests.exceptions.ConnectionError:
            return outcome(
                f'Cannot connect to Ollama server at {self.base_url}',
                original_word_count, start_time
            )
        except requests.exceptions.HTTPError as e:
            # Guard against an HTTPError that carries no response object.
            if e.response is not None:
                detail = f'{e.response.status_code} - {e.response.text[:100]}'
            else:
                detail = str(e)
            return outcome(f'HTTP error: {detail}', original_word_count, start_time)
        except Exception as e:
            # Boundary handler: callers rely on a result dict, never an exception.
            return outcome(f'Unexpected error: {str(e)}', original_word_count, start_time)

    def translate_title(self, title, target_language='English'):
        """
        Translate article title to target language

        Never raises for request/parse failures; every outcome is reported
        through the returned dict.

        Args:
            title: Original title (typically German)
            target_language: Target language (default: 'English')

        Returns:
            {
                'success': bool,                  # Whether translation succeeded
                'translated_title': str or None,  # Translated title (cleaned)
                'error': str or None,             # Error message if failed
                'duration': float                 # Time taken in seconds (0 if no request was made)
            }
        """
        def outcome(error, started=None, translated=None):
            # Single place that shapes the result dict; `started` is a
            # time.monotonic() reading, or None when no request was attempted.
            return {
                'success': error is None,
                'translated_title': translated,
                'error': error,
                'duration': 0 if started is None else time.monotonic() - started
            }

        if not self.enabled:
            return outcome('Ollama is not enabled')

        if not title or not title.strip():
            return outcome('Title is empty')

        # Monotonic clock: elapsed time must not be affected by wall-clock adjustments.
        start_time = time.monotonic()

        try:
            prompt = self._build_translation_prompt(title, target_language)

            url = f"{self.base_url}/api/generate"
            headers = {'Content-Type': 'application/json'}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'

            payload = {
                'model': self.model,
                'prompt': prompt,
                'stream': False,
                'options': {
                    'temperature': 0.3,  # Lower temperature for consistent translations
                    'num_predict': 100   # Limit response length for title-length outputs
                }
            }

            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=self.timeout
            )
            response.raise_for_status()

            result = response.json()
            # `or ''` turns a JSON null in 'response' into an empty translation
            # instead of an AttributeError reported as "Unexpected error".
            raw_title = (result.get('response') or '').strip()

            # Clean BEFORE the emptiness check: a reply such as '""' is empty
            # once its quotes are removed and must not count as a success.
            translated_title = self._clean_translation(raw_title) if raw_title else ''

            if not translated_title:
                return outcome('Ollama returned empty translation', start_time)

            return outcome(None, start_time, translated_title)

        except requests.exceptions.Timeout:
            return outcome(f'Request timed out after {self.timeout} seconds', start_time)
        except requests.exceptions.ConnectionError:
            return outcome(f'Cannot connect to Ollama server at {self.base_url}', start_time)
        except requests.exceptions.HTTPError as e:
            # Guard against an HTTPError that carries no response object.
            if e.response is not None:
                detail = f'{e.response.status_code} - {e.response.text[:100]}'
            else:
                detail = str(e)
            return outcome(f'HTTP error: {detail}', start_time)
        except Exception as e:
            # Boundary handler: callers rely on a result dict, never an exception.
            return outcome(f'Unexpected error: {str(e)}', start_time)

    def _build_translation_prompt(self, title, target_language):
        """Compose the instruction text asking the model to translate a headline"""
        # One entry per output line; empty strings are the blank separator lines.
        prompt_lines = [
            f"Translate the following German news headline to {target_language}.",
            "",
            f"IMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline.",
            "",
            "German headline:",
            f"{title}",
            "",
            f"{target_language} translation:",
        ]
        return "\n".join(prompt_lines)

    def _clean_translation(self, translation):
        """
        Clean translation output by removing quotes and extra text

        Keeps only the first non-blank line of the model output and removes one
        pair of surrounding quotation marks (ASCII or typographic).

        Args:
            translation: Raw text returned by the model

        Returns:
            str: The cleaned single-line title (may be empty)
        """
        # Strip first: otherwise leading blank lines make the "first line" empty.
        first_line = translation.strip().split('\n')[0].strip()

        # (opening, closing) pairs that may wrap the whole headline.
        quote_pairs = (
            ('"', '"'),
            ("'", "'"),
            ('\u201c', '\u201d'),  # English curly double quotes
            ('\u201e', '\u201c'),  # German low-high double quotes
            ('\u201e', '\u201d'),  # German opening with English closing
            ('\u2018', '\u2019'),  # curly single quotes
            ('\u00ab', '\u00bb'),  # guillemets
        )
        for opening, closing in quote_pairs:
            if first_line.startswith(opening) and first_line.endswith(closing):
                first_line = first_line[1:-1]
                break  # remove a single wrapping pair only

        # Trim whitespace that was inside the quotes.
        return first_line.strip()

    def _clean_markdown(self, text):
        """
        Remove markdown formatting from text

        Strips headers, list markers, bold/italic markers, links and inline
        code markers, then normalises blank lines.

        Args:
            text: Text that may contain markdown syntax

        Returns:
            str: The text with markdown markers removed and outer whitespace trimmed
        """
        import re

        # Line-anchored patterns use [ \t] rather than \s so they cannot consume
        # newlines and silently merge a blank line into the next paragraph.

        # Remove markdown headers (##, ###, etc.)
        text = re.sub(r'^#{1,6}[ \t]+', '', text, flags=re.MULTILINE)

        # Remove bullet points and list markers. This runs BEFORE the emphasis
        # patterns: otherwise the '*' of two consecutive "* item" bullets is
        # mistaken for an emphasis pair spanning both lines.
        text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
        text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

        # Remove bold/italic markers (**text**, *text*, __text__, _text_).
        # Underscore forms must not touch word characters on the outside, so
        # identifiers such as user_id keep their underscores.
        text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
        text = re.sub(r'(?<!\w)__([^_]+)__(?!\w)', r'\1', text)
        text = re.sub(r'\*([^\*]+)\*', r'\1', text)
        text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)

        # Remove markdown links [text](url) -> text
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

        # Remove inline code `text`
        text = re.sub(r'`([^`]+)`', r'\1', text)

        # Collapse runs of blank / whitespace-only lines into one blank line
        text = re.sub(r'\n\s*\n', '\n\n', text)

        return text.strip()

    def _build_summarization_prompt(self, content, max_words):
        """Compose the instruction text asking the model for an English summary"""
        # Cap very long articles at their first 5000 whitespace-separated words;
        # capped text is re-joined with single spaces and marked with '...'.
        word_cap = 5000
        tokens = content.split()
        article = content if len(tokens) <= word_cap else ' '.join(tokens[:word_cap]) + '...'

        return f"""You are a skilled journalist writing for The New York Times. Summarize the following article in English in {max_words} words or less.

Write in the clear, engaging, and authoritative style of New York Times Magazine:
- Lead with the most newsworthy information
- Use active voice and vivid language
- Make it accessible and easy to read
- Focus on what matters to readers
- Even if the source is in German or another language, write your summary entirely in English

IMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose.

Article:
{article}

New York Times-style summary (max {max_words} words):"""

    def is_available(self):
        """
        Check if Ollama server is reachable

        Sends a GET to /api/tags with a fixed 5-second timeout. Any ordinary
        failure (connection problem, timeout, non-2xx status) yields False.

        Returns:
            bool: True if server is reachable, False otherwise
        """
        if not self.enabled:
            return False

        try:
            url = f"{self.base_url}/api/tags"
            headers = {}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'

            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()
        except Exception:
            # Best-effort probe: report "unavailable" instead of raising.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the process can still be stopped.
            return False
        return True

    def test_connection(self):
        """
        Test connection and return server info

        Returns:
            {
                'available': bool,
                'models': list,
                'current_model': str,
                'error': str or None
            }
        """
        if not self.enabled:
            return {
                'available': False,
                'models': [],
                'current_model': self.model,
                'error': 'Ollama is not enabled'
            }

        try:
            url = f"{self.base_url}/api/tags"
            headers = {}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'

            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()

            result = response.json()
            models = [m.get('name', '') for m in result.get('models', [])]

            return {
                'available': True,
                'models': models,
                'current_model': self.model,
                'error': None
            }
        except requests.exceptions.ConnectionError:
            return {
                'available': False,
                'models': [],
                'current_model': self.model,
                'error': f'Cannot connect to Ollama server at {self.base_url}'
            }
        except Exception as e:
            return {
                'available': False,
                'models': [],
                'current_model': self.model,
                'error': str(e)
            }

    def generate(self, prompt, max_tokens=100):
        """
        Generate text using Ollama

        Never raises for request/parse failures; every outcome is reported
        through the returned dict.

        Args:
            prompt: Text prompt
            max_tokens: Maximum tokens to generate (sent as 'num_predict')

        Returns:
            {
                'text': str,           # Generated text ('' on failure)
                'success': bool,       # Whether generation succeeded
                'error': str or None,  # Error message if failed
                'duration': float      # Time taken in seconds (0 if no request was made)
            }
        """
        if not self.enabled:
            return {
                'text': '',
                'success': False,
                'error': 'Ollama is disabled',
                'duration': 0
            }

        # Monotonic clock: elapsed time must not be affected by wall-clock adjustments.
        start_time = time.monotonic()

        try:
            # Send the API key like the other request methods do; without it
            # a server that requires authentication rejects the call.
            headers = {'Content-Type': 'application/json'}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'

            response = requests.post(
                f"{self.base_url}/api/generate",
                json={
                    "model": self.model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "num_predict": max_tokens,
                        "temperature": 0.1  # Low temperature for consistent answers
                    }
                },
                headers=headers,
                timeout=self.timeout
            )

            duration = time.monotonic() - start_time

            if response.status_code != 200:
                return {
                    'text': '',
                    'success': False,
                    'error': f"HTTP {response.status_code}: {response.text}",
                    'duration': duration
                }

            result = response.json()
            return {
                # `or ''` tolerates a JSON null in the 'response' field
                'text': (result.get('response') or '').strip(),
                'success': True,
                'error': None,
                'duration': duration
            }

        except requests.exceptions.Timeout:
            return {
                'text': '',
                'success': False,
                'error': f"Request timed out after {self.timeout}s",
                'duration': time.monotonic() - start_time
            }
        except Exception as e:
            # Boundary handler: callers rely on a result dict, never an exception.
            return {
                'text': '',
                'success': False,
                'error': str(e),
                'duration': time.monotonic() - start_time
            }

    def extract_keywords(self, title, summary, max_keywords=5):
        """
        Ask the model for the main topics of an article

        The reply is split on commas; entries of two characters or fewer are
        dropped and at most max_keywords entries are kept.

        Args:
            title: Article title
            summary: Article summary
            max_keywords: Maximum number of keywords to extract (default 5)

        Returns:
            {
                'keywords': list,      # List of extracted keywords
                'success': bool,       # Whether extraction succeeded
                'error': str or None,  # Error message if failed
                'duration': float      # Time taken in seconds
            }
        """
        def failure(message, elapsed):
            # Shared shape for every unsuccessful outcome.
            return {
                'keywords': [],
                'success': False,
                'error': message,
                'duration': elapsed
            }

        if not self.enabled:
            return failure('Ollama is disabled', 0)

        started_at = time.time()

        try:
            keyword_prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.

Title: {title}
Summary: {summary}

Return ONLY the keywords separated by commas, nothing else. Focus on:
- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
- Locations (e.g., "Marienplatz", "Airport")
- Events or themes (e.g., "Transportation", "Housing", "Technology")

Keywords:"""

            request_headers = {'Content-Type': 'application/json'}
            if self.api_key:
                request_headers['Authorization'] = f'Bearer {self.api_key}'

            generation_options = {
                'temperature': 0.3,  # Lower temperature for consistent extraction
                'num_predict': 100   # Limit response length
            }
            body = {
                'model': self.model,
                'prompt': keyword_prompt,
                'stream': False,
                'options': generation_options
            }

            response = requests.post(
                f"{self.base_url}/api/generate",
                json=body,
                headers=request_headers,
                timeout=self.timeout
            )
            response.raise_for_status()

            reply = response.json().get('response', '').strip()
            if not reply:
                return failure('Ollama returned empty response', time.time() - started_at)

            # Keep comma-separated entries longer than two characters.
            candidates = []
            for piece in reply.split(','):
                candidate = piece.strip()
                if len(candidate) > 2:
                    candidates.append(candidate)
            selected = candidates[:max_keywords]

            return {
                'keywords': selected,
                'success': True,
                'error': None,
                'duration': time.time() - started_at
            }

        except requests.exceptions.Timeout:
            return failure(f"Request timed out after {self.timeout}s", time.time() - started_at)
        except Exception as exc:
            return failure(str(exc), time.time() - started_at)


if __name__ == '__main__':
    # Manual smoke test: check the connection, then summarize a sample article.
    import os
    from dotenv import load_dotenv

    load_dotenv(dotenv_path='../.env')

    server_url = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    model_name = os.getenv('OLLAMA_MODEL', 'phi3:latest')
    client = OllamaClient(base_url=server_url, model=model_name, enabled=True)

    print("Testing Ollama connection...")
    status = client.test_connection()
    for label, key in (
        ("Available", 'available'),
        ("Models", 'models'),
        ("Current model", 'current_model'),
    ):
        print(f"{label}: {status[key]}")

    if status['available']:
        print("\nTesting summarization...")
        sample_article = """
        The new U-Bahn line connecting Munich's city center with the airport opened today.
        Mayor Dieter Reiter attended the opening ceremony along with hundreds of residents.
        The line will significantly reduce travel time between the airport and downtown Munich.
        Construction took five years and cost approximately 2 billion euros.
        The new line includes 10 stations and runs every 10 minutes during peak hours.
        """

        outcome = client.summarize_article(sample_article, max_words=50)
        print(f"Success: {outcome['success']}")
        print(f"Summary: {outcome['summary']}")

        words_before = outcome['original_word_count']
        words_after = outcome['summary_word_count']
        print(f"Original word count: {words_before}")
        print(f"Summary word count: {words_after}")

        # max(..., 1) avoids dividing by zero when no summary was produced.
        ratio = words_before / max(words_after, 1)
        print(f"Compression: {ratio:.1f}x")
        print(f"Duration: {outcome['duration']:.2f}s")