Refactor OllamaClient to use the /api/chat endpoint via a shared _chat_request helper
@@ -26,6 +26,43 @@ class OllamaClient:
         self.enabled = enabled
         self.timeout = timeout
 
+    def _chat_request(self, messages, options=None):
+        """
+        Helper to make chat requests to Ollama
+
+        Args:
+            messages: List of message dicts [{'role': 'user', 'content': '...'}]
+            options: Optional dict of model parameters
+
+        Returns:
+            str: Generated text content
+        """
+        if options is None:
+            options = {}
+
+        url = f"{self.base_url}/api/chat"
+        headers = {'Content-Type': 'application/json'}
+        if self.api_key:
+            headers['Authorization'] = f'Bearer {self.api_key}'
+
+        payload = {
+            'model': self.model,
+            'messages': messages,
+            'stream': False,
+            'options': options
+        }
+
+        response = requests.post(
+            url,
+            json=payload,
+            headers=headers,
+            timeout=self.timeout
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        return result.get('message', {}).get('content', '').strip()
+
     def summarize_article(self, content, max_words=150):
         """
         Summarize article content using Ollama
@@ -70,37 +107,26 @@ class OllamaClient:
         start_time = time.time()
 
         try:
-            # Construct prompt
-            prompt = self._build_summarization_prompt(content, max_words)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.7,
-                    'num_predict': 250  # Limit response length
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            summary = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a skilled journalist writing for The New York Times. Summarize the provided article in English in {max_words} words or less.\n\nWrite in the clear, engaging, and authoritative style of New York Times Magazine:\n- Lead with the most newsworthy information\n- Use active voice and vivid language\n- Make it accessible and easy to read\n- Focus on what matters to readers\n- Even if the source is in German or another language, write your summary entirely in English\n\nIMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose."
+                },
+                {
+                    'role': 'user',
+                    'content': f"Summarize this article:\n\n{content}"
+                }
+            ]
+
+            # Make request using chat endpoint
+            summary = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.5,
+                    'num_predict': 350
+                }
+            )
 
             if not summary:
                 return {
@@ -198,37 +224,26 @@ class OllamaClient:
         start_time = time.time()
 
         try:
-            # Construct prompt
-            prompt = self._build_translation_prompt(title, target_language)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.3,  # Lower temperature for consistent translations
-                    'num_predict': 100  # Limit response length for title-length outputs
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            translated_title = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a professional translator. Translate the following German news headline to {target_language}.\n\nIMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline."
+                },
+                {
+                    'role': 'user',
+                    'content': title
+                }
+            ]
+
+            # Make request using chat endpoint
+            translated_title = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.1,  # Low temperature for consistent translations
+                    'num_predict': 100  # Limit response length
+                }
+            )
 
             if not translated_title:
                 return {
@@ -241,6 +256,13 @@ class OllamaClient:
             # Clean the translation output
             translated_title = self._clean_translation(translated_title)
 
+            # Validate translation (if it's same as original, it might have failed)
+            if translated_title.lower() == title.lower() and target_language == 'English':
+                # Retry with more forceful prompt
+                messages[0]['content'] += " If the text is already English, just output it as is."
+                translated_title = self._chat_request(messages, options={'temperature': 0.1})
+                translated_title = self._clean_translation(translated_title)
+
             return {
                 'success': True,
                 'translated_title': translated_title,
@@ -277,19 +299,6 @@ class OllamaClient:
                 'duration': time.time() - start_time
             }
 
-    def _build_translation_prompt(self, title, target_language):
-        """Build prompt for title translation"""
-        prompt = f"""Translate the following German news headline to {target_language}.
-
-IMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline.
-
-German headline:
-{title}
-
-{target_language} translation:"""
-
-        return prompt
-
     def _clean_translation(self, translation):
         """Clean translation output by removing quotes and extra text"""
         # Extract first line only
@@ -335,31 +344,6 @@ German headline:
 
         return text
 
-    def _build_summarization_prompt(self, content, max_words):
-        """Build prompt for article summarization"""
-        # Truncate content if too long (keep first 5000 words)
-        words = content.split()
-        if len(words) > 5000:
-            content = ' '.join(words[:5000]) + '...'
-
-        prompt = f"""You are a skilled journalist writing for The New York Times. Summarize the following article in English in {max_words} words or less.
-
-Write in the clear, engaging, and authoritative style of New York Times Magazine:
-- Lead with the most newsworthy information
-- Use active voice and vivid language
-- Make it accessible and easy to read
-- Focus on what matters to readers
-- Even if the source is in German or another language, write your summary entirely in English
-
-IMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose.
-
-Article:
-{content}
-
-New York Times-style summary (max {max_words} words):"""
-
-        return prompt
-
     def is_available(self):
         """
         Check if Ollama server is reachable
@@ -462,37 +446,24 @@ New York Times-style summary (max {max_words} words):"""
         start_time = time.time()
 
         try:
-            response = requests.post(
-                f"{self.base_url}/api/generate",
-                json={
-                    "model": self.model,
-                    "prompt": prompt,
-                    "stream": False,
-                    "options": {
-                        "num_predict": max_tokens,
-                        "temperature": 0.1  # Low temperature for consistent answers
-                    }
-                },
-                timeout=self.timeout
+            messages = [{'role': 'user', 'content': prompt}]
+
+            text = self._chat_request(
+                messages,
+                options={
+                    "num_predict": max_tokens,
+                    "temperature": 0.1
+                }
             )
 
             duration = time.time() - start_time
 
-            if response.status_code == 200:
-                result = response.json()
-                return {
-                    'text': result.get('response', '').strip(),
-                    'success': True,
-                    'error': None,
-                    'duration': duration
-                }
-            else:
-                return {
-                    'text': '',
-                    'success': False,
-                    'error': f"HTTP {response.status_code}: {response.text}",
-                    'duration': duration
-                }
+            return {
+                'text': text,
+                'success': True,
+                'error': None,
+                'duration': duration
+            }
 
         except requests.exceptions.Timeout:
             return {
@@ -537,47 +508,26 @@ New York Times-style summary (max {max_words} words):"""
         start_time = time.time()
 
        try:
-            # Construct prompt for keyword extraction
-            prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.
-
-Title: {title}
-Summary: {summary}
-
-Return ONLY the keywords separated by commas, nothing else. Focus on:
-- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
-- Locations (e.g., "Marienplatz", "Airport")
-- Events or themes (e.g., "Transportation", "Housing", "Technology")
-
-Keywords:"""
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.3,  # Lower temperature for consistent extraction
-                    'num_predict': 100  # Limit response length
-                }
-            }
-
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
-            )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            keywords_text = result.get('response', '').strip()
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"Extract {max_keywords} key topics or keywords from the article.\n\nReturn ONLY the keywords separated by commas, nothing else. Focus on:\n- Main topics (e.g., 'Bayern Munich', 'Oktoberfest', 'City Council')\n- Locations (e.g., 'Marienplatz', 'Airport')\n- Events or themes (e.g., 'Transportation', 'Housing', 'Technology')"
+                },
+                {
+                    'role': 'user',
+                    'content': f"Title: {title}\nSummary: {summary}"
+                }
+            ]
+
+            # Make request
+            keywords_text = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.2,
+                    'num_predict': 100
+                }
+            )
 
             if not keywords_text:
                 return {
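
For context, a minimal usage sketch of the consolidated _chat_request helper introduced by this change. The module path and constructor arguments below are illustrative assumptions only; neither the import location nor the __init__ signature appears in this diff.

from ollama_client import OllamaClient  # hypothetical module path, not shown in this diff

# Constructor arguments are illustrative; the real __init__ signature is outside this diff.
client = OllamaClient(base_url="http://localhost:11434", model="llama3", timeout=60, enabled=True)

# Each refactored method (summarization, title translation, keyword extraction, generic
# generation) now builds a messages list and delegates to _chat_request, which POSTs to
# /api/chat instead of assembling its own /api/generate payload.
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Say hello in one short sentence.'},
]
reply = client._chat_request(messages, options={'temperature': 0.2, 'num_predict': 50})
print(reply)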