update

2025-11-18 14:45:41 +01:00
parent 2e80d64ff6
commit 84fce9a82c
19 changed files with 2437 additions and 3 deletions
--- a/news_crawler/ollama_client.py
+++ b/news_crawler/ollama_client.py
@@ -508,6 +508,110 @@ New York Times-style summary (max {max_words} words):"""
                'error': str(e),
                'duration': time.time() - start_time
            }
+    
+    def extract_keywords(self, title, summary, max_keywords=5):
+        """
+        Extract keywords/topics from an article for personalization.
+
+        Sends the title and summary to the Ollama ``/api/generate`` endpoint
+        and parses the model's reply into a list of keywords. This method
+        does not raise for request or parsing failures: they are reported
+        through the ``success`` / ``error`` fields of the returned dict.
+
+        Args:
+            title: Article title
+            summary: Article summary
+            max_keywords: Maximum number of keywords to extract (default 5).
+                Values below 1 are rejected without contacting Ollama.
+
+        Returns:
+            {
+                'keywords': list,      # Unique keywords, capped at max_keywords
+                'success': bool,       # Whether extraction succeeded
+                'error': str or None,  # Error message if failed
+                'duration': float      # Time taken in seconds
+            }
+        """
+        if not self.enabled:
+            return {
+                'keywords': [],
+                'success': False,
+                'error': 'Ollama is disabled',
+                'duration': 0
+            }
+
+        # A limit below 1 can never produce keywords, and a negative one
+        # would make the final slice drop items from the END of the list
+        # instead of capping it. Reject it before spending a model call.
+        # (None is left alone: it slices as "no cap", as it did before.)
+        if max_keywords is not None and max_keywords < 1:
+            return {
+                'keywords': [],
+                'success': False,
+                'error': 'max_keywords must be at least 1',
+                'duration': 0
+            }
+
+        start_time = time.time()
+
+        try:
+            # Construct prompt for keyword extraction
+            prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.
+
+Title: {title}
+Summary: {summary}
+
+Return ONLY the keywords separated by commas, nothing else. Focus on:
+- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
+- Locations (e.g., "Marienplatz", "Airport")
+- Events or themes (e.g., "Transportation", "Housing", "Technology")
+
+Keywords:"""
+
+            # Prepare request
+            url = f"{self.base_url}/api/generate"
+            headers = {'Content-Type': 'application/json'}
+            if self.api_key:
+                headers['Authorization'] = f'Bearer {self.api_key}'
+
+            payload = {
+                'model': self.model,
+                'prompt': prompt,
+                'stream': False,
+                'options': {
+                    'temperature': 0.3,  # Lower temperature for consistent extraction
+                    'num_predict': 100   # Limit response length
+                }
+            }
+
+            # Make request
+            response = requests.post(
+                url,
+                json=payload,
+                headers=headers,
+                timeout=self.timeout
+            )
+            response.raise_for_status()
+
+            # Parse response. 'response' may be absent or JSON null; both
+            # are treated as an empty reply rather than an AttributeError.
+            result = response.json()
+            keywords_text = (result.get('response') or '').strip()
+
+            if not keywords_text:
+                return {
+                    'keywords': [],
+                    'success': False,
+                    'error': 'Ollama returned empty response',
+                    'duration': time.time() - start_time
+                }
+
+            keywords = self._parse_keyword_response(keywords_text, max_keywords)
+
+            return {
+                'keywords': keywords,
+                'success': True,
+                'error': None,
+                'duration': time.time() - start_time
+            }
+
+        except requests.exceptions.Timeout:
+            return {
+                'keywords': [],
+                'success': False,
+                'error': f"Request timed out after {self.timeout}s",
+                'duration': time.time() - start_time
+            }
+        except Exception as e:
+            # Boundary handler: callers receive a result dict, never an
+            # exception, whatever went wrong with the request or parsing.
+            return {
+                'keywords': [],
+                'success': False,
+                'error': str(e),
+                'duration': time.time() - start_time
+            }
+
+    @staticmethod
+    def _parse_keyword_response(text, limit):
+        """
+        Turn the raw model reply into a clean, de-duplicated keyword list.
+
+        Args:
+            text: Non-empty reply text from the model
+            limit: Maximum number of keywords to return (the caller has
+                already rejected values below 1)
+
+        Returns:
+            List of keywords in the order the model produced them
+        """
+        import re  # Local import: only this helper needs it
+
+        lines = [line for line in text.splitlines() if line.strip()]
+        # List markers are stripped only for a multi-line reply, so a
+        # keyword inside a normal comma-separated line that happens to
+        # start with "3. " is left untouched.
+        is_list_layout = len(lines) > 1
+
+        keywords = []
+        seen = set()
+        for line in lines:
+            if is_list_layout:
+                # Drop a leading "- ", "* ", "• ", "1. " or "2) " marker
+                line = re.sub(r'^\s*(?:[-*\u2022]|\d+[.)])\s+', '', line)
+            for piece in line.split(','):
+                piece = piece.strip()
+                # The prompt's examples are quoted, so models tend to echo
+                # quotes; remove one matching pair wrapping the keyword.
+                if len(piece) >= 2 and piece[0] == piece[-1] and piece[0] in '"\'':
+                    piece = piece[1:-1].strip()
+                # Keep the original rule: 1-2 character fragments are noise
+                if len(piece) <= 2:
+                    continue
+                # Case-insensitive de-duplication, first spelling wins
+                folded = piece.casefold()
+                if folded in seen:
+                    continue
+                seen.add(folded)
+                keywords.append(piece)
+
+        return keywords[:limit]


 if __name__ == '__main__':