2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions


@@ -0,0 +1,246 @@
"""
Article Clustering Module
Detects and groups similar articles from different sources using Ollama AI
"""
from difflib import SequenceMatcher
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from ollama_client import OllamaClient
class ArticleClusterer:
"""
Clusters articles about the same story from different sources using Ollama AI
"""
def __init__(self, ollama_client: OllamaClient, similarity_threshold=0.75, time_window_hours=24):
"""
Initialize clusterer
Args:
ollama_client: OllamaClient instance for AI-based similarity detection
similarity_threshold: Minimum similarity to consider articles as same story (0-1)
time_window_hours: Time window to look for similar articles
"""
self.ollama_client = ollama_client
self.similarity_threshold = similarity_threshold
self.time_window_hours = time_window_hours
def normalize_title(self, title: str) -> str:
"""
Normalize title for comparison
Args:
title: Article title
Returns:
Normalized title (lowercase, stripped)
"""
return title.lower().strip()
def simple_stem(self, word: str) -> str:
"""
Simple German word stemming (remove common suffixes)
Args:
word: Word to stem
Returns:
Stemmed word
"""
# Remove common German suffixes
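# Illustrative examples (added for clarity, not in the original):
#   "Regierung" -> "Regier" (strips "ung"), "Wahlen" -> "Wahl" (strips "en")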
suffixes = ['ungen', 'ung', 'en', 'er', 'e', 'n', 's']
for suffix in suffixes:
if len(word) > 5 and word.endswith(suffix):
return word[:-len(suffix)]
return word
def extract_keywords(self, text: str) -> set:
"""
Extract important keywords from text with simple stemming
Args:
text: Article title or content
Returns:
Set of stemmed keywords
"""
# Common German stop words to ignore
stop_words = {
'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'eines',
'und', 'oder', 'aber', 'in', 'im', 'am', 'um', 'für', 'von', 'zu', 'nach',
'bei', 'mit', 'auf', 'an', 'aus', 'über', 'unter', 'gegen', 'durch',
'ist', 'sind', 'war', 'waren', 'hat', 'haben', 'wird', 'werden', 'wurde', 'wurden',
'neue', 'neuer', 'neues', 'neuen', 'sich', 'auch', 'nicht', 'nur', 'noch',
'mehr', 'als', 'wie', 'beim', 'zum', 'zur', 'vom', 'ins', 'ans'
}
# Normalize and split
words = text.lower().strip().split()
# Filter out stop words, short words, and apply stemming
keywords = set()
for word in words:
# Remove punctuation
word = ''.join(c for c in word if c.isalnum() or c == '-')
if len(word) > 3 and word not in stop_words:
# Apply simple stemming
stemmed = self.simple_stem(word)
keywords.add(stemmed)
return keywords
def check_same_story_with_ai(self, article1: Dict, article2: Dict) -> bool:
"""
Use Ollama AI to determine if two articles are about the same story
Args:
article1: First article
article2: Second article
Returns:
True if same story, False otherwise
"""
if not self.ollama_client.enabled:
# Fallback to keyword-based similarity
return self.calculate_similarity(article1, article2) >= self.similarity_threshold
title1 = article1.get('title', '')
title2 = article2.get('title', '')
content1 = article1.get('content', '')[:300] # First 300 chars
content2 = article2.get('content', '')[:300]
prompt = f"""Compare these two news articles and determine if they are about the SAME story/event.
Article 1:
Title: {title1}
Content: {content1}
Article 2:
Title: {title2}
Content: {content2}
Answer with ONLY "YES" if they are about the same story/event, or "NO" if they are different stories.
Consider them the same story if they report on the same event, even if from different perspectives.
Answer:"""
try:
response = self.ollama_client.generate(prompt, max_tokens=10)
answer = response.get('text', '').strip().upper()
return 'YES' in answer
except Exception as e:
print(f" ⚠ AI clustering failed: {e}, using fallback")
# Fallback to keyword-based similarity
return self.calculate_similarity(article1, article2) >= self.similarity_threshold
def calculate_similarity(self, article1: Dict, article2: Dict) -> float:
"""
Calculate similarity between two articles using title and content
Args:
article1: First article (dict with 'title' and optionally 'content')
article2: Second article (dict with 'title' and optionally 'content')
Returns:
Similarity score (0-1)
"""
title1 = article1.get('title', '')
title2 = article2.get('title', '')
content1 = article1.get('content', '')
content2 = article2.get('content', '')
# Extract keywords from titles
title_keywords1 = self.extract_keywords(title1)
title_keywords2 = self.extract_keywords(title2)
# Calculate title similarity
if title_keywords1 and title_keywords2:
title_intersection = title_keywords1.intersection(title_keywords2)
title_union = title_keywords1.union(title_keywords2)
title_similarity = len(title_intersection) / len(title_union) if title_union else 0
else:
# Fallback to string similarity
t1 = self.normalize_title(title1)
t2 = self.normalize_title(title2)
title_similarity = SequenceMatcher(None, t1, t2).ratio()
# If we have content, use it for better accuracy
if content1 and content2:
# Extract keywords from first 500 chars of content (for performance)
content_keywords1 = self.extract_keywords(content1[:500])
content_keywords2 = self.extract_keywords(content2[:500])
if content_keywords1 and content_keywords2:
content_intersection = content_keywords1.intersection(content_keywords2)
content_union = content_keywords1.union(content_keywords2)
content_similarity = len(content_intersection) / len(content_union) if content_union else 0
# Weighted average: title (40%) + content (60%)
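# e.g. title overlap 0.50 and content overlap 0.33 blend to 0.4*0.50 + 0.6*0.33 ≈ 0.40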
return (title_similarity * 0.4) + (content_similarity * 0.6)
# If no content, use only title similarity
return title_similarity
def find_cluster(self, article: Dict, existing_articles: List[Dict]) -> Optional[str]:
"""
Find if article belongs to an existing cluster using AI
Args:
article: New article to cluster (dict with 'title' and optionally 'content')
existing_articles: List of existing articles
Returns:
cluster_id if found, None otherwise
"""
cutoff_time = datetime.utcnow() - timedelta(hours=self.time_window_hours)
for existing in existing_articles:
# Only compare recent articles
published_at = existing.get('published_at')
if published_at and published_at < cutoff_time:
continue
# Use AI to check if same story
if self.check_same_story_with_ai(article, existing):
return existing.get('cluster_id', str(existing.get('_id')))
return None
def cluster_article(self, article: Dict, existing_articles: List[Dict]) -> Dict:
"""
Cluster a single article
Args:
article: Article to cluster
existing_articles: List of existing articles
Returns:
Article with cluster_id and is_primary fields
"""
cluster_id = self.find_cluster(article, existing_articles)
if cluster_id:
# Add to existing cluster
article['cluster_id'] = cluster_id
article['is_primary'] = False
else:
# Create new cluster
article['cluster_id'] = str(article.get('_id', datetime.utcnow().timestamp()))
article['is_primary'] = True
return article
def get_cluster_articles(self, cluster_id: str, articles_collection) -> List[Dict]:
"""
Get all articles in a cluster
Args:
cluster_id: Cluster ID
articles_collection: MongoDB collection
Returns:
List of articles in the cluster
"""
return list(articles_collection.find({'cluster_id': cluster_id}))
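A minimal usage sketch for the clusterer (editorial addition, not part of the committed file). It passes a stand-in client with enabled = False so the keyword fallback in check_same_story_with_ai is exercised; a real OllamaClient would take the AI prompt path instead. The article dicts and the low threshold are illustrative assumptions.

from article_clustering import ArticleClusterer

class _OfflineClient:
    """Hypothetical stand-in; only the .enabled flag is read on the fallback path."""
    enabled = False

clusterer = ArticleClusterer(_OfflineClient(), similarity_threshold=0.3)  # low threshold for the demo

a1 = {'_id': 'a1', 'title': 'Regierung beschließt neues Klimapaket',
      'content': 'Die Bundesregierung hat ein neues Klimapaket beschlossen.'}
a2 = {'title': 'Neues Klimapaket der Regierung beschlossen',
      'content': 'Berlin: Das Klimapaket der Regierung wurde heute beschlossen.'}

a1 = clusterer.cluster_article(a1, [])    # no match -> new cluster, is_primary=True
a2 = clusterer.cluster_article(a2, [a1])  # keyword overlap -> joins a1's cluster, is_primary=False
print(a1['cluster_id'], a2['cluster_id'], a2['is_primary'])  # a1 a1 False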


@@ -0,0 +1,213 @@
"""
Cluster Summarizer Module
Generates neutral summaries from multiple clustered articles
"""
from typing import List, Dict, Optional
from datetime import datetime
from ollama_client import OllamaClient
class ClusterSummarizer:
"""
Generates neutral summaries by synthesizing multiple articles about the same story
"""
def __init__(self, ollama_client: OllamaClient, max_words=200):
"""
Initialize cluster summarizer
Args:
ollama_client: OllamaClient instance for AI-based summarization
max_words: Maximum words in neutral summary
"""
self.ollama_client = ollama_client
self.max_words = max_words
def generate_neutral_summary(self, articles: List[Dict]) -> Dict:
"""
Generate a neutral summary from multiple articles about the same story
Args:
articles: List of article dicts with 'title', 'content', 'source'
Returns:
{
'neutral_summary': str,
'sources': list,
'article_count': int,
'success': bool,
'error': str or None,
'duration': float
}
"""
if not articles:
return {
'neutral_summary': None,
'sources': [],
'article_count': 0,
'success': False,
'error': 'No articles provided',
'duration': 0
}
# If only one article, return its summary
if len(articles) == 1:
return {
'neutral_summary': articles[0].get('summary', articles[0].get('content', '')[:500]),
'sources': [articles[0].get('source', 'unknown')],
'article_count': 1,
'success': True,
'error': None,
'duration': 0
}
# Build combined context from all articles
combined_context = self._build_combined_context(articles)
# Generate neutral summary using AI
prompt = self._build_neutral_summary_prompt(combined_context, len(articles))
result = self.ollama_client.generate(prompt, max_tokens=300)
if result['success']:
return {
'neutral_summary': result['text'],
'sources': list(set(a.get('source', 'unknown') for a in articles)),
'article_count': len(articles),
'success': True,
'error': None,
'duration': result['duration']
}
else:
return {
'neutral_summary': None,
'sources': list(set(a.get('source', 'unknown') for a in articles)),
'article_count': len(articles),
'success': False,
'error': result['error'],
'duration': result['duration']
}
def _build_combined_context(self, articles: List[Dict]) -> str:
"""Build combined context from multiple articles"""
context_parts = []
for i, article in enumerate(articles, 1):
source = article.get('source', 'Unknown')
title = article.get('title', 'No title')
# Use summary if available, otherwise use first 500 chars of content
content = article.get('summary') or article.get('content', '')[:500]
context_parts.append(f"Source {i} ({source}):\nTitle: {title}\nContent: {content}")
return "\n\n".join(context_parts)
def _build_neutral_summary_prompt(self, combined_context: str, article_count: int) -> str:
"""Build prompt for neutral summary generation"""
prompt = f"""You are a neutral news aggregator. You have {article_count} articles from different sources about the same story. Your task is to create a single, balanced summary that:
1. Combines information from all sources
2. Remains neutral and objective
3. Highlights key facts that all sources agree on
4. Notes any significant differences in perspective (if any)
5. Is written in clear, professional English
6. Is approximately {self.max_words} words
Here are the articles:
{combined_context}
Write a neutral summary in English that synthesizes these perspectives:"""
return prompt
def create_cluster_summaries(db, ollama_client: OllamaClient, cluster_ids: Optional[List[str]] = None):
"""
Create or update neutral summaries for article clusters
Args:
db: MongoDB database instance
ollama_client: OllamaClient instance
cluster_ids: Optional list of specific cluster IDs to process. If None, processes all clusters.
Returns:
{
'processed': int,
'succeeded': int,
'failed': int,
'errors': list
}
"""
summarizer = ClusterSummarizer(ollama_client, max_words=200)
# Find clusters to process
if cluster_ids:
clusters_to_process = cluster_ids
else:
# Get all cluster IDs with multiple articles
pipeline = [
{"$match": {"cluster_id": {"$exists": True}}},
{"$group": {"_id": "$cluster_id", "count": {"$sum": 1}}},
{"$match": {"count": {"$gt": 1}}},
{"$project": {"_id": 1}}
]
clusters_to_process = [c['_id'] for c in db.articles.aggregate(pipeline)]
processed = 0
succeeded = 0
failed = 0
errors = []
for cluster_id in clusters_to_process:
try:
# Get all articles in this cluster
articles = list(db.articles.find({"cluster_id": cluster_id}))
if len(articles) < 2:
continue
print(f"Processing cluster {cluster_id}: {len(articles)} articles")
# Generate neutral summary
result = summarizer.generate_neutral_summary(articles)
processed += 1
if result['success']:
# Save cluster summary
db.cluster_summaries.update_one(
{"cluster_id": cluster_id},
{
"$set": {
"cluster_id": cluster_id,
"neutral_summary": result['neutral_summary'],
"sources": result['sources'],
"article_count": result['article_count'],
"created_at": datetime.utcnow(),
"updated_at": datetime.utcnow()
}
},
upsert=True
)
succeeded += 1
print(f" ✓ Generated neutral summary ({len(result['neutral_summary'])} chars)")
else:
failed += 1
error_msg = f"Cluster {cluster_id}: {result['error']}"
errors.append(error_msg)
print(f" ✗ Failed: {result['error']}")
except Exception as e:
failed += 1
error_msg = f"Cluster {cluster_id}: {str(e)}"
errors.append(error_msg)
print(f" ✗ Error: {e}")
return {
'processed': processed,
'succeeded': succeeded,
'failed': failed,
'errors': errors
}
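A small usage sketch (editorial addition, not part of the committed file). With a single article, generate_neutral_summary short-circuits and never calls Ollama, so a stand-in client suffices for the illustration; with two or more articles the combined prompt is sent through ollama_client.generate().

from cluster_summarizer import ClusterSummarizer

class _OfflineClient:
    """Hypothetical stand-in; real runs pass an OllamaClient instance."""
    enabled = False

summarizer = ClusterSummarizer(_OfflineClient(), max_words=200)
result = summarizer.generate_neutral_summary([
    {'title': 'Klimapaket beschlossen', 'source': 'tagesschau',
     'summary': 'Die Regierung hat ein neues Klimapaket beschlossen.'}
])
print(result['success'], result['article_count'], result['sources'])
# -> True 1 ['tagesschau']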


@@ -13,6 +13,8 @@ from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
from config import Config
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import create_cluster_summaries
# Load environment variables
load_dotenv(dotenv_path='../.env')
@@ -33,6 +35,9 @@ ollama_client = OllamaClient(
timeout=Config.OLLAMA_TIMEOUT
)
# Article Clusterer placeholder (set up later, once ollama_client is ready)
article_clusterer = None
# Print configuration on startup
if __name__ != '__main__':
Config.print_config()
@@ -44,6 +49,14 @@ if __name__ != '__main__':
print("⚠ Warning: Ollama server is not reachable")
else:
print(" Ollama AI summarization: DISABLED")
# Initialize Article Clusterer with ollama_client
article_clusterer = ArticleClusterer(
ollama_client=ollama_client,
similarity_threshold=0.60,  # Fallback threshold, used when the AI check is disabled or fails
time_window_hours=24 # Look back 24 hours
)
print("🔗 Article clustering: ENABLED (AI-powered)")
def get_active_rss_feeds():
@@ -394,6 +407,13 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
'created_at': datetime.utcnow()
}
# Cluster article with existing articles (detect duplicates from other sources)
from datetime import timedelta
recent_articles = list(articles_collection.find({
'published_at': {'$gte': datetime.utcnow() - timedelta(hours=24)}
}))
article_doc = article_clusterer.cluster_article(article_doc, recent_articles)
try:
# Upsert: update if exists, insert if not
articles_collection.update_one(
@@ -434,6 +454,16 @@ def crawl_all_feeds(max_articles_per_feed=10):
Crawl all active RSS feeds
Returns: dict with statistics
"""
global article_clusterer
# Initialize clusterer if not already done
if article_clusterer is None:
article_clusterer = ArticleClusterer(
ollama_client=ollama_client,
similarity_threshold=0.60,
time_window_hours=24
)
print("\n" + "="*60)
print("🚀 Starting RSS Feed Crawler")
print("="*60)
@@ -485,12 +515,29 @@ def crawl_all_feeds(max_articles_per_feed=10):
print(f" Average time per article: {duration/total_crawled:.1f}s")
print("="*60 + "\n")
# Generate neutral summaries for clustered articles
cluster_summary_stats = {'processed': 0, 'succeeded': 0, 'failed': 0}
if Config.OLLAMA_ENABLED and total_crawled > 0:
print("\n" + "="*60)
print("🔄 Generating Neutral Summaries for Clustered Articles")
print("="*60)
cluster_summary_stats = create_cluster_summaries(db, ollama_client)
print("\n" + "="*60)
print(f"✓ Cluster Summarization Complete!")
print(f" Clusters processed: {cluster_summary_stats['processed']}")
print(f" Succeeded: {cluster_summary_stats['succeeded']}")
print(f" Failed: {cluster_summary_stats['failed']}")
print("="*60 + "\n")
return {
'total_feeds': len(feeds),
'total_articles_crawled': total_crawled,
'total_summarized': total_summarized,
'failed_summaries': total_failed,
'duration_seconds': round(duration, 2),
'cluster_summaries': cluster_summary_stats
}
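The neutral summaries generated above are upserted into db.cluster_summaries, keyed by cluster_id. A read-back sketch (editorial addition; the connection URI and database name are assumptions, while the collection and field names come from create_cluster_summaries):

from pymongo import MongoClient

db = MongoClient('mongodb://localhost:27017')['newsdb']  # URI and db name are assumptions
for cs in db.cluster_summaries.find().sort('updated_at', -1).limit(5):
    members = db.articles.find({'cluster_id': cs['cluster_id']}, {'title': 1, 'source': 1})
    print(f"{cs['article_count']} articles from {', '.join(cs['sources'])}")
    print(f"  {cs['neutral_summary'][:120]}...")
    for a in members:
        print(f"  - [{a.get('source', '?')}] {a.get('title', '')}")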


@@ -391,6 +391,80 @@ English Summary (max {max_words} words):"""
'current_model': self.model,
'error': str(e)
}
def generate(self, prompt, max_tokens=100):
"""
Generate text using Ollama
Args:
prompt: Text prompt
max_tokens: Maximum tokens to generate
Returns:
{
'text': str, # Generated text
'success': bool, # Whether generation succeeded
'error': str or None, # Error message if failed
'duration': float # Time taken in seconds
}
"""
if not self.enabled:
return {
'text': '',
'success': False,
'error': 'Ollama is disabled',
'duration': 0
}
start_time = time.time()
try:
response = requests.post(
f"{self.base_url}/api/generate",
json={
"model": self.model,
"prompt": prompt,
"stream": False,
"options": {
"num_predict": max_tokens,
"temperature": 0.1 # Low temperature for consistent answers
}
},
timeout=self.timeout
)
duration = time.time() - start_time
if response.status_code == 200:
result = response.json()
return {
'text': result.get('response', '').strip(),
'success': True,
'error': None,
'duration': duration
}
else:
return {
'text': '',
'success': False,
'error': f"HTTP {response.status_code}: {response.text}",
'duration': duration
}
except requests.exceptions.Timeout:
return {
'text': '',
'success': False,
'error': f"Request timed out after {self.timeout}s",
'duration': time.time() - start_time
}
except Exception as e:
return {
'text': '',
'success': False,
'error': str(e),
'duration': time.time() - start_time
}
if __name__ == '__main__':
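A usage sketch for the new generate() helper (editorial addition). The constructor arguments shown are assumptions inferred from the attributes the method reads (base_url, model, timeout); check the real __init__ signature before copying this.

client = OllamaClient(base_url='http://localhost:11434', model='llama3.2', timeout=60)  # assumed kwargs

result = client.generate("Answer with ONLY YES or NO: are these two headlines about the same event?", max_tokens=10)
if result['success']:
    print(f"Model answered: {result['text']} ({result['duration']:.1f}s)")
else:
    print(f"Generation failed: {result['error']}")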