653 lines
22 KiB
Python
653 lines
22 KiB
Python
"""
|
|
Ollama client for AI-powered article summarization
|
|
"""
|
|
import requests
|
|
import time
|
|
from datetime import datetime
|
|
|
|
|
|
class OllamaClient:
    """Client for communicating with Ollama server for text summarization.

    The client talks to two Ollama HTTP endpoints: ``/api/generate`` (used for
    summaries, title translation, free-form generation and keyword
    extraction) and ``/api/tags`` (used for availability checks).

    Public methods never raise on request failures; they return a plain dict
    that carries the outcome together with an ``error`` message.
    """

    def __init__(self, base_url, model, api_key=None, enabled=True, timeout=30):
        """
        Initialize Ollama client

        Args:
            base_url: Ollama server URL (e.g., http://localhost:11434);
                trailing slashes are removed
            model: Model name to use (e.g., phi3:latest)
            api_key: Optional API key, sent as a Bearer token when set
            enabled: Whether Ollama is enabled
            timeout: Request timeout in seconds (default 30)
        """
        self.base_url = base_url.rstrip('/')
        self.model = model
        self.api_key = api_key
        self.enabled = enabled
        self.timeout = timeout

    # ------------------------------------------------------------------
    # Internal request helpers
    # ------------------------------------------------------------------

    def _auth_headers(self, json_body=False):
        """Build request headers, adding the Bearer token when an API key is set.

        Args:
            json_body: When True, also declare a JSON ``Content-Type``.

        Returns:
            dict: Headers to pass to ``requests``.
        """
        headers = {'Content-Type': 'application/json'} if json_body else {}
        if self.api_key:
            headers['Authorization'] = f'Bearer {self.api_key}'
        return headers

    def _post_generate(self, prompt, temperature, num_predict):
        """POST a non-streaming request to ``/api/generate``.

        The status code is deliberately not checked here, because callers
        differ in how they report HTTP failures.

        Args:
            prompt: Prompt text to send
            temperature: Value for the ``temperature`` option
            num_predict: Value for the ``num_predict`` option

        Returns:
            The ``requests`` response object.

        Raises:
            requests.exceptions.RequestException: On transport-level failures.
        """
        payload = {
            'model': self.model,
            'prompt': prompt,
            'stream': False,
            'options': {
                'temperature': temperature,
                'num_predict': num_predict,
            },
        }
        return requests.post(
            f"{self.base_url}/api/generate",
            json=payload,
            headers=self._auth_headers(json_body=True),
            timeout=self.timeout,
        )

    def _describe_request_error(self, exc):
        """Translate an exception into the error message reported to callers.

        Timeout is tested before ConnectionError so that a connect timeout
        (which derives from both) is reported as a timeout.
        """
        if isinstance(exc, requests.exceptions.Timeout):
            return f'Request timed out after {self.timeout} seconds'
        if isinstance(exc, requests.exceptions.ConnectionError):
            return f'Cannot connect to Ollama server at {self.base_url}'
        if isinstance(exc, requests.exceptions.HTTPError) and exc.response is not None:
            return f'HTTP error: {exc.response.status_code} - {exc.response.text[:100]}'
        return f'Unexpected error: {str(exc)}'

    # ------------------------------------------------------------------
    # Summarization
    # ------------------------------------------------------------------

    def summarize_article(self, content, max_words=150):
        """
        Summarize article content using Ollama

        Args:
            content: Full article text
            max_words: Maximum words in summary (default 150); passed to the
                prompt only

        Returns:
            {
                'summary': str,               # AI-generated summary (None on failure)
                'summary_word_count': int,    # Summary word count
                'original_word_count': int,   # Original article word count
                'success': bool,              # Whether summarization succeeded
                'error': str or None,         # Error message if failed
                'duration': float             # Time taken in seconds
            }
        """
        def failure(error, original_word_count=0, duration=0):
            return {
                'summary': None,
                'summary_word_count': 0,
                'original_word_count': original_word_count,
                'success': False,
                'error': error,
                'duration': duration,
            }

        if not self.enabled:
            return failure('Ollama is not enabled')
        if not content or not content.strip():
            return failure('Content is empty')

        original_word_count = len(content.split())
        start_time = time.time()

        try:
            prompt = self._build_summarization_prompt(content, max_words)

            # NOTE(review): num_predict is fixed at 250 tokens regardless of
            # max_words, so large max_words values may be cut short - confirm.
            response = self._post_generate(prompt, temperature=0.7, num_predict=250)
            response.raise_for_status()

            summary = response.json().get('response', '').strip()
            if summary:
                # Clean markdown formatting from summary
                summary = self._clean_markdown(summary)

            # Checked after cleaning too, so markup-only output is not
            # reported as a successful empty summary.
            if not summary:
                return failure(
                    'Ollama returned empty summary',
                    original_word_count,
                    time.time() - start_time,
                )

            return {
                'summary': summary,
                'summary_word_count': len(summary.split()),
                'original_word_count': original_word_count,
                'success': True,
                'error': None,
                'duration': time.time() - start_time,
            }
        except Exception as e:
            return failure(
                self._describe_request_error(e),
                original_word_count,
                time.time() - start_time,
            )

    # ------------------------------------------------------------------
    # Title translation
    # ------------------------------------------------------------------

    def translate_title(self, title, target_language='English'):
        """
        Translate article title to target language

        Args:
            title: Original title (the prompt describes it as a German headline)
            target_language: Target language (default: 'English')

        Returns:
            {
                'success': bool,                   # Whether translation succeeded
                'translated_title': str or None,   # Translated title
                'error': str or None,              # Error message if failed
                'duration': float                  # Time taken in seconds
            }
        """
        def failure(error, duration=0):
            return {
                'success': False,
                'translated_title': None,
                'error': error,
                'duration': duration,
            }

        if not self.enabled:
            return failure('Ollama is not enabled')
        if not title or not title.strip():
            return failure('Title is empty')

        start_time = time.time()

        try:
            prompt = self._build_translation_prompt(title, target_language)

            # Lower temperature for consistent translations; short limit
            # because the output is title-length.
            response = self._post_generate(prompt, temperature=0.3, num_predict=100)
            response.raise_for_status()

            translated_title = response.json().get('response', '').strip()
            if translated_title:
                # Clean the translation output
                translated_title = self._clean_translation(translated_title)

            # Checked after cleaning too, so a reply made only of quotes is
            # not reported as a successful empty translation.
            if not translated_title:
                return failure(
                    'Ollama returned empty translation',
                    time.time() - start_time,
                )

            return {
                'success': True,
                'translated_title': translated_title,
                'error': None,
                'duration': time.time() - start_time,
            }
        except Exception as e:
            return failure(self._describe_request_error(e), time.time() - start_time)

    def _build_translation_prompt(self, title, target_language):
        """Build prompt for title translation"""
        prompt = f"""Translate the following German news headline to {target_language}.

IMPORTANT: Provide ONLY the {target_language} translation. Do not include explanations, quotes, or any other text. Just the translated headline.

German headline:
{title}

{target_language} translation:"""

        return prompt

    def _clean_translation(self, translation):
        """Clean translation output by removing quotes and extra text"""
        # Extract first line only
        translation = translation.split('\n')[0].strip()

        # Remove one pair of surrounding quotes (single or double)
        for quote in ('"', "'"):
            if translation.startswith(quote) and translation.endswith(quote):
                translation = translation[1:-1]
                break

        # Trim whitespace again after quote removal
        return translation.strip()

    # ------------------------------------------------------------------
    # Text post-processing
    # ------------------------------------------------------------------

    def _clean_markdown(self, text):
        """Remove markdown formatting from text

        Handles headers, list markers, bold/italic markers, links and inline
        code. Underscores inside words (``snake_case``) and asterisks
        surrounded by spaces (``2 * 3``) are left untouched.
        """
        import re

        # Remove markdown headers (##, ###, etc.)
        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)

        # Remove bullet points and list markers. Done before the emphasis
        # rules so a leading "* " bullet is not read as an italic marker;
        # [ \t] keeps the match on one line so blank lines before a list
        # survive.
        text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
        text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

        # Remove bold/italic markers (**text**, *text*, __text__, _text_).
        # Underscore markers must not touch a word character on the outside,
        # and single asterisks must hug the text they wrap.
        text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
        text = re.sub(r'(?<!\w)__([^_]+)__(?!\w)', r'\1', text)
        text = re.sub(r'\*(?!\s)([^\*]+?)(?<!\s)\*', r'\1', text)
        text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)

        # Remove markdown links [text](url) -> text
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

        # Remove inline code `text`
        text = re.sub(r'`([^`]+)`', r'\1', text)

        # Clean up extra whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        return text.strip()

    def _build_summarization_prompt(self, content, max_words):
        """Build prompt for article summarization"""
        # Truncate content if too long (keep first 5000 words)
        words = content.split()
        if len(words) > 5000:
            content = ' '.join(words[:5000]) + '...'

        prompt = f"""You are a skilled journalist writing for The New York Times. Summarize the following article in English in {max_words} words or less.

Write in the clear, engaging, and authoritative style of New York Times Magazine:
- Lead with the most newsworthy information
- Use active voice and vivid language
- Make it accessible and easy to read
- Focus on what matters to readers
- Even if the source is in German or another language, write your summary entirely in English

IMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose.

Article:
{content}

New York Times-style summary (max {max_words} words):"""

        return prompt

    # ------------------------------------------------------------------
    # Availability checks
    # ------------------------------------------------------------------

    def is_available(self):
        """
        Check if Ollama server is reachable

        Returns:
            bool: True if server is reachable, False otherwise
        """
        if not self.enabled:
            return False

        try:
            response = requests.get(
                f"{self.base_url}/api/tags",
                headers=self._auth_headers(),
                timeout=5,
            )
            response.raise_for_status()
            return True
        except Exception:
            # Best-effort probe: any failure means "not available".
            # KeyboardInterrupt/SystemExit still propagate.
            return False

    def test_connection(self):
        """
        Test connection and return server info

        Returns:
            {
                'available': bool,
                'models': list,          # Model names reported by /api/tags
                'current_model': str,
                'error': str or None
            }
        """
        def unavailable(error):
            return {
                'available': False,
                'models': [],
                'current_model': self.model,
                'error': error,
            }

        if not self.enabled:
            return unavailable('Ollama is not enabled')

        try:
            response = requests.get(
                f"{self.base_url}/api/tags",
                headers=self._auth_headers(),
                timeout=5,
            )
            response.raise_for_status()

            models = [m.get('name', '') for m in response.json().get('models', [])]
            return {
                'available': True,
                'models': models,
                'current_model': self.model,
                'error': None,
            }
        except requests.exceptions.ConnectionError:
            return unavailable(f'Cannot connect to Ollama server at {self.base_url}')
        except Exception as e:
            return unavailable(str(e))

    # ------------------------------------------------------------------
    # Free-form generation
    # ------------------------------------------------------------------

    def generate(self, prompt, max_tokens=100):
        """
        Generate text using Ollama

        Args:
            prompt: Text prompt
            max_tokens: Maximum tokens to generate

        Returns:
            {
                'text': str,             # Generated text
                'success': bool,         # Whether generation succeeded
                'error': str or None,    # Error message if failed
                'duration': float        # Time taken in seconds
            }
        """
        def failure(error, duration=0):
            return {
                'text': '',
                'success': False,
                'error': error,
                'duration': duration,
            }

        if not self.enabled:
            return failure('Ollama is disabled')

        start_time = time.time()

        try:
            # Low temperature for consistent answers. Goes through the shared
            # helper so the Authorization header is sent like everywhere else.
            response = self._post_generate(
                prompt, temperature=0.1, num_predict=max_tokens
            )
            duration = time.time() - start_time

            if response.status_code != 200:
                return failure(
                    f"HTTP {response.status_code}: {response.text}", duration
                )

            return {
                'text': response.json().get('response', '').strip(),
                'success': True,
                'error': None,
                'duration': duration,
            }
        except requests.exceptions.Timeout:
            return failure(
                f"Request timed out after {self.timeout}s",
                time.time() - start_time,
            )
        except Exception as e:
            return failure(str(e), time.time() - start_time)

    # ------------------------------------------------------------------
    # Keyword extraction
    # ------------------------------------------------------------------

    def _build_keyword_prompt(self, title, summary, max_keywords):
        """Build prompt for keyword extraction"""
        return f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.

Title: {title}
Summary: {summary}

Return ONLY the keywords separated by commas, nothing else. Focus on:
- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
- Locations (e.g., "Marienplatz", "Airport")
- Events or themes (e.g., "Transportation", "Housing", "Technology")

Keywords:"""

    def extract_keywords(self, title, summary, max_keywords=5):
        """
        Extract keywords/topics from article for personalization

        Args:
            title: Article title
            summary: Article summary
            max_keywords: Maximum number of keywords to extract (default 5)

        Returns:
            {
                'keywords': list,        # List of extracted keywords
                'success': bool,         # Whether extraction succeeded
                'error': str or None,    # Error message if failed
                'duration': float        # Time taken in seconds
            }
        """
        def failure(error, duration=0):
            return {
                'keywords': [],
                'success': False,
                'error': error,
                'duration': duration,
            }

        if not self.enabled:
            return failure('Ollama is disabled')

        start_time = time.time()

        try:
            prompt = self._build_keyword_prompt(title, summary, max_keywords)

            # Lower temperature for consistent extraction
            response = self._post_generate(prompt, temperature=0.3, num_predict=100)
            response.raise_for_status()

            keywords_text = response.json().get('response', '').strip()
            if not keywords_text:
                return failure(
                    'Ollama returned empty response', time.time() - start_time
                )

            # Parse the comma-separated reply.
            # NOTE(review): entries of one or two characters are discarded,
            # which also drops keywords such as "AI" or "EU" - confirm intended.
            candidates = (k.strip() for k in keywords_text.split(','))
            keywords = [k for k in candidates if len(k) > 2][:max_keywords]

            return {
                'keywords': keywords,
                'success': True,
                'error': None,
                'duration': time.time() - start_time,
            }
        except requests.exceptions.Timeout:
            return failure(
                f"Request timed out after {self.timeout}s",
                time.time() - start_time,
            )
        except Exception as e:
            return failure(str(e), time.time() - start_time)
|
|
|
|
|
|
if __name__ == '__main__':
    # Quick manual smoke test: checks the connection to the configured Ollama
    # server and, when it is reachable, summarizes a short sample article.
    import os
    from dotenv import load_dotenv

    # NOTE(review): this path is resolved against the current working
    # directory, not this file's location - confirm that is intended.
    load_dotenv(dotenv_path='../.env')

    client = OllamaClient(
        base_url=os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434'),
        model=os.getenv('OLLAMA_MODEL', 'phi3:latest'),
        enabled=True
    )

    print("Testing Ollama connection...")
    result = client.test_connection()
    print(f"Available: {result['available']}")
    print(f"Models: {result['models']}")
    print(f"Current model: {result['current_model']}")

    if not result['available']:
        # Surface the reason instead of stopping silently.
        print(f"Error: {result['error']}")
    else:
        print("\nTesting summarization...")
        test_content = """
        The new U-Bahn line connecting Munich's city center with the airport opened today.
        Mayor Dieter Reiter attended the opening ceremony along with hundreds of residents.
        The line will significantly reduce travel time between the airport and downtown Munich.
        Construction took five years and cost approximately 2 billion euros.
        The new line includes 10 stations and runs every 10 minutes during peak hours.
        """

        summary_result = client.summarize_article(test_content, max_words=50)
        print(f"Success: {summary_result['success']}")
        if not summary_result['success']:
            print(f"Error: {summary_result['error']}")
        print(f"Summary: {summary_result['summary']}")
        print(f"Original word count: {summary_result['original_word_count']}")
        print(f"Summary word count: {summary_result['summary_word_count']}")
        # max(..., 1) avoids dividing by zero when no summary was produced.
        print(f"Compression: {summary_result['original_word_count'] / max(summary_result['summary_word_count'], 1):.1f}x")
        print(f"Duration: {summary_result['duration']:.2f}s")
|