diff --git a/news_crawler/ollama_client.py b/news_crawler/ollama_client.py
index 36c3f46..20df206 100644
--- a/news_crawler/ollama_client.py
+++ b/news_crawler/ollama_client.py
@@ -25,6 +25,43 @@ class OllamaClient:
         self.api_key = api_key
         self.enabled = enabled
         self.timeout = timeout
+
+    def _chat_request(self, messages, options=None):
+        """
+        Helper to make chat requests to Ollama
+
+        Args:
+            messages: List of message dicts [{'role': 'user', 'content': '...'}]
+            options: Optional dict of model parameters
+
+        Returns:
+            str: Generated text content
+        """
+        if options is None:
+            options = {}
+
+        url = f"{self.base_url}/api/chat"
+        headers = {'Content-Type': 'application/json'}
+        if self.api_key:
+            headers['Authorization'] = f'Bearer {self.api_key}'
+
+        payload = {
+            'model': self.model,
+            'messages': messages,
+            'stream': False,
+            'options': options
+        }
+
+        response = requests.post(
+            url,
+            json=payload,
+            headers=headers,
+            timeout=self.timeout
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        return result.get('message', {}).get('content', '').strip()
 
     def summarize_article(self, content, max_words=150):
         """
@@ -70,37 +107,26 @@ class OllamaClient:
         start_time = time.time()
 
         try:
-            # Construct prompt
-            prompt = self._build_summarization_prompt(content, max_words)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.7,
-                    'num_predict': 250  # Limit response length
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a skilled journalist writing for The New York Times. Summarize the provided article in English in {max_words} words or less.\n\nWrite in the clear, engaging, and authoritative style of New York Times Magazine:\n- Lead with the most newsworthy information\n- Use active voice and vivid language\n- Make it accessible and easy to read\n- Focus on what matters to readers\n- Even if the source is in German or another language, write your summary entirely in English\n\nIMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose."
+                },
+                {
+                    'role': 'user',
+                    'content': f"Summarize this article:\n\n{content}"
                 }
-            }
+            ]
 
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
+            # Make request using chat endpoint
+            summary = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.5,
+                    'num_predict': 350
+                }
             )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            summary = result.get('response', '').strip()
 
             if not summary:
                 return {
@@ -198,37 +224,26 @@ class OllamaClient:
         start_time = time.time()
 
         try:
-            # Construct prompt
-            prompt = self._build_translation_prompt(title, target_language)
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.3,  # Lower temperature for consistent translations
-                    'num_predict': 100  # Limit response length for title-length outputs
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"You are a professional translator. Translate the following German news headline to {target_language}.\n\nIMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline."
+                },
+                {
+                    'role': 'user',
+                    'content': title
                 }
-            }
+            ]
 
-            # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
+            # Make request using chat endpoint
+            translated_title = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.1,  # Low temperature for consistent translations
+                    'num_predict': 100  # Limit response length
+                }
             )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            translated_title = result.get('response', '').strip()
 
             if not translated_title:
                 return {
@@ -241,6 +256,13 @@ class OllamaClient:
             # Clean the translation output
            translated_title = self._clean_translation(translated_title)
 
+            # Validate translation (if it's the same as the original, it might have failed)
+            if translated_title.lower() == title.lower() and target_language == 'English':
+                # Retry with a more forceful prompt
+                messages[0]['content'] += " If the text is already English, just output it as is."
+                translated_title = self._chat_request(messages, options={'temperature': 0.1})
+                translated_title = self._clean_translation(translated_title)
+
             return {
                 'success': True,
                 'translated_title': translated_title,
@@ -277,19 +299,6 @@ class OllamaClient:
                 'duration': time.time() - start_time
             }
 
-    def _build_translation_prompt(self, title, target_language):
-        """Build prompt for title translation"""
-        prompt = f"""Translate the following German news headline to {target_language}.
-
-IMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline.
-
-German headline:
-{title}
-
-{target_language} translation:"""
-
-        return prompt
-
     def _clean_translation(self, translation):
         """Clean translation output by removing quotes and extra text"""
         # Extract first line only
@@ -335,31 +344,6 @@ German headline:
 
         return text
 
-    def _build_summarization_prompt(self, content, max_words):
-        """Build prompt for article summarization"""
-        # Truncate content if too long (keep first 5000 words)
-        words = content.split()
-        if len(words) > 5000:
-            content = ' '.join(words[:5000]) + '...'
-
-        prompt = f"""You are a skilled journalist writing for The New York Times. Summarize the following article in English in {max_words} words or less.
-
-Write in the clear, engaging, and authoritative style of New York Times Magazine:
-- Lead with the most newsworthy information
-- Use active voice and vivid language
-- Make it accessible and easy to read
-- Focus on what matters to readers
-- Even if the source is in German or another language, write your summary entirely in English
-
-IMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose.
-
-Article:
-{content}
-
-New York Times-style summary (max {max_words} words):"""
-
-        return prompt
-
     def is_available(self):
         """
         Check if Ollama server is reachable
@@ -462,37 +446,24 @@ New York Times-style summary (max {max_words} words):"""
         start_time = time.time()
 
         try:
-            response = requests.post(
-                f"{self.base_url}/api/generate",
-                json={
-                    "model": self.model,
-                    "prompt": prompt,
-                    "stream": False,
-                    "options": {
-                        "num_predict": max_tokens,
-                        "temperature": 0.1  # Low temperature for consistent answers
-                    }
-                },
-                timeout=self.timeout
+            messages = [{'role': 'user', 'content': prompt}]
+
+            text = self._chat_request(
+                messages,
+                options={
+                    "num_predict": max_tokens,
+                    "temperature": 0.1
+                }
             )
 
             duration = time.time() - start_time
 
-            if response.status_code == 200:
-                result = response.json()
-                return {
-                    'text': result.get('response', '').strip(),
-                    'success': True,
-                    'error': None,
-                    'duration': duration
-                }
-            else:
-                return {
-                    'text': '',
-                    'success': False,
-                    'error': f"HTTP {response.status_code}: {response.text}",
-                    'duration': duration
-                }
+            return {
+                'text': text,
+                'success': True,
+                'error': None,
+                'duration': duration
+            }
 
         except requests.exceptions.Timeout:
             return {
@@ -537,47 +508,26 @@ New York Times-style summary (max {max_words} words):"""
         start_time = time.time()
 
         try:
-            # Construct prompt for keyword extraction
-            prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.
-
-Title: {title}
-Summary: {summary}
-
-Return ONLY the keywords separated by commas, nothing else. Focus on:
-- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
-- Locations (e.g., "Marienplatz", "Airport")
-- Events or themes (e.g., "Transportation", "Housing", "Technology")
-
-Keywords:"""
-
-            # Prepare request
-            url = f"{self.base_url}/api/generate"
-            headers = {'Content-Type': 'application/json'}
-            if self.api_key:
-                headers['Authorization'] = f'Bearer {self.api_key}'
-
-            payload = {
-                'model': self.model,
-                'prompt': prompt,
-                'stream': False,
-                'options': {
-                    'temperature': 0.3,  # Lower temperature for consistent extraction
-                    'num_predict': 100  # Limit response length
+            # Construct messages for chat API
+            messages = [
+                {
+                    'role': 'system',
+                    'content': f"Extract {max_keywords} key topics or keywords from the article.\n\nReturn ONLY the keywords separated by commas, nothing else. Focus on:\n- Main topics (e.g., 'Bayern Munich', 'Oktoberfest', 'City Council')\n- Locations (e.g., 'Marienplatz', 'Airport')\n- Events or themes (e.g., 'Transportation', 'Housing', 'Technology')"
+                },
+                {
+                    'role': 'user',
+                    'content': f"Title: {title}\nSummary: {summary}"
                 }
-            }
+            ]
 
             # Make request
-            response = requests.post(
-                url,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout
+            keywords_text = self._chat_request(
+                messages,
+                options={
+                    'temperature': 0.2,
+                    'num_predict': 100
+                }
             )
-            response.raise_for_status()
-
-            # Parse response
-            result = response.json()
-            keywords_text = result.get('response', '').strip()
 
             if not keywords_text:
                 return {
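
For reference, a minimal sketch of how the refactored client might be exercised. Everything below the import is an assumption rather than something this patch shows: the constructor signature is inferred from the attributes assigned in __init__ (base_url, model, api_key, enabled, timeout), the endpoint and model name are placeholders, and the exact keys of the summarization result dict are not visible in the hunks above. Only the module path, the class name, and the summarize_article(content, max_words=150) signature appear in the diff itself.

from news_crawler.ollama_client import OllamaClient

# Assumed constructor signature; the diff only shows these attributes being set.
client = OllamaClient(
    base_url="http://localhost:11434",  # assumed local Ollama default
    model="llama3",                     # assumed model name
    api_key=None,
    enabled=True,
    timeout=60,
)

# summarize_article() (signature visible in the first hunk) now builds a
# system/user message pair and sends it through the shared _chat_request()
# helper, which POSTs to {base_url}/api/chat with stream=False and returns
# the message.content field of the response.
result = client.summarize_article("<full article text>", max_words=150)
print(result)  # result keys are only partially shown in this diff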