Refactor OllamaClient to route LLM calls through a shared _chat_request helper (/api/chat)
@@ -26,6 +26,43 @@ class OllamaClient:
         self.enabled = enabled
         self.timeout = timeout
 
+    def _chat_request(self, messages, options=None):
+        """
+        Helper to make chat requests to Ollama
+
+        Args:
+            messages: List of message dicts [{'role': 'user', 'content': '...'}]
+            options: Optional dict of model parameters
+
+        Returns:
+            str: Generated text content
+        """
+        if options is None:
+            options = {}
+
+        url = f"{self.base_url}/api/chat"
+        headers = {'Content-Type': 'application/json'}
+        if self.api_key:
+            headers['Authorization'] = f'Bearer {self.api_key}'
+
+        payload = {
+            'model': self.model,
+            'messages': messages,
+            'stream': False,
+            'options': options
+        }
+
+        response = requests.post(
+            url,
+            json=payload,
+            headers=headers,
+            timeout=self.timeout
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        return result.get('message', {}).get('content', '').strip()
+
     def summarize_article(self, content, max_words=150):
         """
         Summarize article content using Ollama
@@ -70,37 +107,26 @@ class OllamaClient:
         start_time = time.time()
 
         try:
-            # Construct prompt
-            prompt = self._build_summarization_prompt(content, max_words)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.7,
-                    'num_predict': 250  # Limit response length
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            summary = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a skilled journalist writing for The New York Times. Summarize the provided article in English in {max_words} words or less.\n\nWrite in the clear, engaging, and authoritative style of New York Times Magazine:\n- Lead with the most newsworthy information\n- Use active voice and vivid language\n- Make it accessible and easy to read\n- Focus on what matters to readers\n- Even if the source is in German or another language, write your summary entirely in English\n\nIMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose."
+                },
+                {
+                    'role': 'user',
+                    'content': f"Summarize this article:\n\n{content}"
+                }
+            ]
+
+            # Make request using chat endpoint
+            summary = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.5,
+                    'num_predict': 350
+                }
+            )
 
             if not summary:
                 return {
@@ -198,37 +224,26 @@ class OllamaClient:
         start_time = time.time()
 
         try:
-            # Construct prompt
-            prompt = self._build_translation_prompt(title, target_language)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.3,  # Lower temperature for consistent translations
-                    'num_predict': 100  # Limit response length for title-length outputs
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            translated_title = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a professional translator. Translate the following German news headline to {target_language}.\n\nIMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline."
+                },
+                {
+                    'role': 'user',
+                    'content': title
+                }
+            ]
+
+            # Make request using chat endpoint
+            translated_title = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.1,  # Low temperature for consistent translations
+                    'num_predict': 100  # Limit response length
+                }
+            )
 
             if not translated_title:
                 return {
@@ -241,6 +256,13 @@ class OllamaClient:
             # Clean the translation output
             translated_title = self._clean_translation(translated_title)
 
+            # Validate translation (if it's same as original, it might have failed)
+            if translated_title.lower() == title.lower() and target_language == 'English':
+                # Retry with more forceful prompt
+                messages[0]['content'] += " If the text is already English, just output it as is."
+                translated_title = self._chat_request(messages, options={'temperature': 0.1})
+                translated_title = self._clean_translation(translated_title)
+
             return {
                 'success': True,
                 'translated_title': translated_title,
@@ -277,19 +299,6 @@ class OllamaClient:
                 'duration': time.time() - start_time
             }
 
-    def _build_translation_prompt(self, title, target_language):
-        """Build prompt for title translation"""
-        prompt = f"""Translate the following German news headline to {target_language}.
-
-IMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline.
-
-German headline:
-{title}
-
-{target_language} translation:"""
-
-        return prompt
-
     def _clean_translation(self, translation):
         """Clean translation output by removing quotes and extra text"""
         # Extract first line only
@@ -335,31 +344,6 @@ German headline:
 
         return text
 
-    def _build_summarization_prompt(self, content, max_words):
-        """Build prompt for article summarization"""
-        # Truncate content if too long (keep first 5000 words)
-        words = content.split()
-        if len(words) > 5000:
-            content = ' '.join(words[:5000]) + '...'
-
-        prompt = f"""You are a skilled journalist writing for The New York Times. Summarize the following article in English in {max_words} words or less.
-
-Write in the clear, engaging, and authoritative style of New York Times Magazine:
-- Lead with the most newsworthy information
-- Use active voice and vivid language
-- Make it accessible and easy to read
-- Focus on what matters to readers
-- Even if the source is in German or another language, write your summary entirely in English
-
-IMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose.
-
-Article:
-{content}
-
-New York Times-style summary (max {max_words} words):"""
-
-        return prompt
-
     def is_available(self):
         """
         Check if Ollama server is reachable
@@ -462,37 +446,24 @@ New York Times-style summary (max {max_words} words):"""
         start_time = time.time()
 
         try:
-            response = requests.post(
-                f"{self.base_url}/api/generate",
-                json={
-                    "model": self.model,
-                    "prompt": prompt,
-                    "stream": False,
-                    "options": {
-                        "num_predict": max_tokens,
-                        "temperature": 0.1  # Low temperature for consistent answers
-                    }
-                },
-                timeout=self.timeout
-            )
+            messages = [{'role': 'user', 'content': prompt}]
+
+            text = self._chat_request(
+                messages,
+                options={
+                    "num_predict": max_tokens,
+                    "temperature": 0.1
+                }
+            )
 
             duration = time.time() - start_time
 
-            if response.status_code == 200:
-                result = response.json()
-                return {
-                    'text': result.get('response', '').strip(),
-                    'success': True,
-                    'error': None,
-                    'duration': duration
-                }
-            else:
-                return {
-                    'text': '',
-                    'success': False,
-                    'error': f"HTTP {response.status_code}: {response.text}",
-                    'duration': duration
-                }
+            return {
+                'text': text,
+                'success': True,
+                'error': None,
+                'duration': duration
+            }
 
         except requests.exceptions.Timeout:
             return {
@@ -537,47 +508,26 @@ New York Times-style summary (max {max_words} words):"""
         start_time = time.time()
 
         try:
-            # Construct prompt for keyword extraction
-            prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.
-
-Title: {title}
-Summary: {summary}
-
-Return ONLY the keywords separated by commas, nothing else. Focus on:
-- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
-- Locations (e.g., "Marienplatz", "Airport")
-- Events or themes (e.g., "Transportation", "Housing", "Technology")
-
-Keywords:"""
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.3,  # Lower temperature for consistent extraction
-                    'num_predict': 100  # Limit response length
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            keywords_text = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"Extract {max_keywords} key topics or keywords from the article.\n\nReturn ONLY the keywords separated by commas, nothing else. Focus on:\n- Main topics (e.g., 'Bayern Munich', 'Oktoberfest', 'City Council')\n- Locations (e.g., 'Marienplatz', 'Airport')\n- Events or themes (e.g., 'Transportation', 'Housing', 'Technology')"
+                },
+                {
+                    'role': 'user',
+                    'content': f"Title: {title}\nSummary: {summary}"
+                }
+            ]
+
+            keywords_text = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.2,
+                    'num_predict': 100
+                }
+            )
 
             if not keywords_text:
                 return {
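A minimal usage sketch of the new _chat_request helper introduced above. The constructor arguments and host shown here are assumptions for illustration; only base_url, model, api_key, timeout and enabled are attributes actually referenced in the diff.

    # Hypothetical setup; constructor signature is assumed, not taken from the diff.
    client = OllamaClient(
        base_url="http://localhost:11434",  # assumed local Ollama address
        model="llama3",                     # assumed model name
        api_key=None,
        timeout=60,
        enabled=True
    )

    messages = [
        {'role': 'system', 'content': 'You are a concise assistant.'},
        {'role': 'user', 'content': 'Say hello in one sentence.'}
    ]

    # _chat_request posts to {base_url}/api/chat with stream=False and returns
    # the stripped message.content string from the JSON response.
    text = client._chat_request(messages, options={'temperature': 0.2, 'num_predict': 50})
    print(text)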