update
@@ -388,6 +388,21 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
                    print(f"    ⚠ Summarization failed: {summary_result['error']}")
                    failed_summaries += 1

            # Extract keywords for personalization
            keywords_result = None
            if Config.OLLAMA_ENABLED and summary_result and summary_result['success']:
                print(f"  🔑 Extracting keywords...")
                keywords_result = ollama_client.extract_keywords(
                    original_title,
                    summary_result['summary'],
                    max_keywords=5
                )

                if keywords_result['success']:
                    print(f"    ✓ Keywords: {', '.join(keywords_result['keywords'])} ({keywords_result['duration']:.1f}s)")
                else:
                    print(f"    ⚠ Keyword extraction failed: {keywords_result['error']}")

            # Prepare document
            article_doc = {
                'title': original_title,
@@ -396,6 +411,7 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
                'link': article_url,
                'content': article_data.get('content', ''),  # Full article content
                'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
                'keywords': keywords_result['keywords'] if keywords_result and keywords_result['success'] else [],
                'word_count': article_data.get('word_count', 0),
                'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
                'source': feed_name,
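For orientation, an assembled article_doc after a successful crawl of one article might look like the sketch below. The field names come from the diff; every value is purely illustrative:

    article_doc = {
        'title': 'City Council approves new tram line',        # illustrative
        'link': 'https://example.com/article',                 # illustrative
        'content': '<full article text>',                      # full article content
        'summary': 'The council voted on Tuesday to fund...',  # from Ollama summarization
        'keywords': ['City Council', 'Transportation', 'Tram'],  # from extract_keywords
        'word_count': 842,
        'summary_word_count': 64,
        'source': 'Munich Local News',                         # illustrative feed_name
    }

Note that 'summary' and 'keywords' degrade gracefully: if Ollama is disabled or a step fails, the conditional expressions above store None and [] rather than raising.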
@@ -508,6 +508,110 @@ New York Times-style summary (max {max_words} words):"""
                'error': str(e),
                'duration': time.time() - start_time
            }

    def extract_keywords(self, title, summary, max_keywords=5):
        """
        Extract keywords/topics from an article for personalization.

        Args:
            title: Article title
            summary: Article summary
            max_keywords: Maximum number of keywords to extract (default 5)

        Returns:
            {
                'keywords': list,      # List of extracted keywords
                'success': bool,       # Whether extraction succeeded
                'error': str or None,  # Error message if failed
                'duration': float      # Time taken in seconds
            }
        """
        if not self.enabled:
            return {
                'keywords': [],
                'success': False,
                'error': 'Ollama is disabled',
                'duration': 0
            }

        start_time = time.time()

        try:
            # Construct prompt for keyword extraction
            prompt = f"""Extract {max_keywords} key topics or keywords from this article. These will be used to understand user interests.

Title: {title}
Summary: {summary}

Return ONLY the keywords separated by commas, nothing else. Focus on:
- Main topics (e.g., "Bayern Munich", "Oktoberfest", "City Council")
- Locations (e.g., "Marienplatz", "Airport")
- Events or themes (e.g., "Transportation", "Housing", "Technology")

Keywords:"""

            # Prepare request
            url = f"{self.base_url}/api/generate"
            headers = {'Content-Type': 'application/json'}
            if self.api_key:
                headers['Authorization'] = f'Bearer {self.api_key}'

            payload = {
                'model': self.model,
                'prompt': prompt,
                'stream': False,
                'options': {
                    'temperature': 0.3,  # Lower temperature for consistent extraction
                    'num_predict': 100   # Limit response length
                }
            }

            # Make request
            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=self.timeout
            )
            response.raise_for_status()

            # Parse response
            result = response.json()
            keywords_text = result.get('response', '').strip()

            if not keywords_text:
                return {
                    'keywords': [],
                    'success': False,
                    'error': 'Ollama returned empty response',
                    'duration': time.time() - start_time
                }

            # Split the comma-separated response, then drop empty or very
            # short tokens and cap the list at max_keywords
            keywords = [k.strip() for k in keywords_text.split(',')]
            keywords = [k for k in keywords if k and len(k) > 2][:max_keywords]

            return {
                'keywords': keywords,
                'success': True,
                'error': None,
                'duration': time.time() - start_time
            }

        except requests.exceptions.Timeout:
            return {
                'keywords': [],
                'success': False,
                'error': f"Request timed out after {self.timeout}s",
                'duration': time.time() - start_time
            }
        except Exception as e:
            return {
                'keywords': [],
                'success': False,
                'error': str(e),
                'duration': time.time() - start_time
            }


if __name__ == '__main__':
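The __main__ block is truncated in this diff. For reference, a minimal smoke test of extract_keywords might look like the following sketch; the OllamaClient constructor and the input values are assumptions, not part of this diff:

    # Hypothetical smoke test -- OllamaClient construction and inputs are
    # assumptions for illustration, not shown in this diff.
    client = OllamaClient()
    result = client.extract_keywords(
        title='Oktoberfest opens with record crowds',
        summary='The festival opened Saturday in Munich with long queues...',
        max_keywords=5
    )
    if result['success']:
        print(f"Keywords: {', '.join(result['keywords'])} ({result['duration']:.1f}s)")
    else:
        print(f"Extraction failed: {result['error']}")

Because every code path returns a result dict instead of raising, callers such as the crawler above can branch on result['success'] without their own try/except.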