2025-11-18 14:45:41 +01:00
parent 2e80d64ff6
commit 84fce9a82c
19 changed files with 2437 additions and 3 deletions

View File

@@ -388,6 +388,21 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
print(f" ⚠ Summarization failed: {summary_result['error']}")
failed_summaries += 1
# Extract keywords for personalization
keywords_result = None
if Config.OLLAMA_ENABLED and summary_result and summary_result['success']:
print(f" 🔑 Extracting keywords...")
keywords_result = ollama_client.extract_keywords(
original_title,
summary_result['summary'],
max_keywords=5
)
if keywords_result['success']:
print(f" ✓ Keywords: {', '.join(keywords_result['keywords'])} ({keywords_result['duration']:.1f}s)")
else:
print(f" ⚠ Keyword extraction failed: {keywords_result['error']}")
# Prepare document
article_doc = {
'title': original_title,
@@ -396,6 +411,7 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
            'link': article_url,
            'content': article_data.get('content', ''),  # Full article content
            'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
            'keywords': keywords_result['keywords'] if keywords_result and keywords_result['success'] else [],
            'word_count': article_data.get('word_count', 0),
            'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
            'source': feed_name,
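
For illustration, a stored article document produced by this path might end up looking roughly like the sketch below once keyword extraction succeeds. The values are invented for the example, and fields that fall outside the shown hunks are elided; only the field set visible in the diff above is mirrored.

article_doc = {
    'title': 'City council approves new housing plan',
    # ... fields not shown in the hunks above are omitted here ...
    'link': 'https://example.com/articles/housing-plan',
    'content': '...',                                    # full article content (truncated for the example)
    'summary': 'The council voted on Tuesday to ...',
    'keywords': ['City Council', 'Housing', 'Munich'],   # new field introduced by this commit
    'word_count': 842,
    'summary_word_count': 96,
    'source': 'Example Feed',
}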

View File

@@ -508,6 +508,110 @@ New York Times-style summary (max {max_words} words):"""
                'error': str(e),
                'duration': time.time() - start_time
            }
    def extract_keywords(self, title, summary, max_keywords=5):
        """
        Extract keywords/topics from article for personalization

        Args:
            title: Article title
            summary: Article summary
            max_keywords: Maximum number of keywords to extract (default 5)

        Returns:
            {
                'keywords': list,      # List of extracted keywords
                'success': bool,       # Whether extraction succeeded
                'error': str or None,  # Error message if failed
                'duration': float      # Time taken in seconds
            }
        """
        if not self.enabled:
            return {
                'keywords': [],
                'success': False,
                'error': 'Ollama is disabled',
                'duration': 0
            }

        start_time = time.time()

        try:
            # Construct prompt for keyword extraction
            prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.

Title: {title}

Summary: {summary}

Return ONLY the keywords separated by commas, nothing else. Focus on:
- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
- Locations (e.g., "Marienplatz", "Airport")
- Events or themes (e.g., "Transportation", "Housing", "Technology")

Keywords:"""

            # Prepare request
            url = f"{self.base_url}/api/generate"
            headers = {'Content-Type': 'application/json'}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'

            payload = {
                'model': self.model,
                'prompt': prompt,
                'stream': False,
                'options': {
                    'temperature': 0.3,  # Lower temperature for consistent extraction
                    'num_predict': 100   # Limit response length
                }
            }

            # Make request
            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=self.timeout
            )
            response.raise_for_status()

            # Parse response
            result = response.json()
            keywords_text = result.get('response', '').strip()

            if not keywords_text:
                return {
                    'keywords': [],
                    'success': False,
                    'error': 'Ollama returned empty response',
                    'duration': time.time() - start_time
                }

            # Parse keywords from response
            keywords = [k.strip() for k in keywords_text.split(',')]
            keywords = [k for k in keywords if k and len(k) > 2][:max_keywords]

            return {
                'keywords': keywords,
                'success': True,
                'error': None,
                'duration': time.time() - start_time
            }

        except requests.exceptions.Timeout:
            return {
                'keywords': [],
                'success': False,
                'error': f"Request timed out after {self.timeout}s",
                'duration': time.time() - start_time
            }
        except Exception as e:
            return {
                'keywords': [],
                'success': False,
                'error': str(e),
                'duration': time.time() - start_time
            }
if __name__ == '__main__':
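
As a quick usage sketch (not part of this commit's diff): assuming the crawler's Ollama wrapper is a class named OllamaClient in an ollama_client module — both names are assumptions, and the constructor arguments are not shown here — the new method could be exercised roughly like this:

from ollama_client import OllamaClient  # module and class names assumed from context

client = OllamaClient()  # constructor details are not part of this diff
result = client.extract_keywords(
    title="Oktoberfest opens at the Theresienwiese",
    summary="The city welcomed the first visitors of the season on Saturday ...",
    max_keywords=5
)

if result['success']:
    print(result['keywords'], f"({result['duration']:.1f}s)")  # e.g. ['Oktoberfest', 'Munich', 'Tourism'] (2.3s)
else:
    print(f"Keyword extraction failed: {result['error']}")

Note that the method returns at most max_keywords entries and drops anything shorter than three characters, per the filter applied to the comma-separated model response above.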