"""
|
|
Article Clustering Module
|
|
Detects and groups similar articles from different sources using Ollama AI
|
|
"""
|
|
from difflib import SequenceMatcher
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Optional
|
|
from ollama_client import OllamaClient
|
|
|
|
|
|
class ArticleClusterer:
    """
    Clusters articles about the same story from different sources using Ollama AI.
    """

    def __init__(self, ollama_client: OllamaClient, similarity_threshold: float = 0.75, time_window_hours: int = 24):
        """
        Initialize the clusterer.

        Args:
            ollama_client: OllamaClient instance for AI-based similarity detection
            similarity_threshold: Minimum similarity (0-1) for two articles to count as the same story
            time_window_hours: Time window within which to look for similar articles
        """
        self.ollama_client = ollama_client
        self.similarity_threshold = similarity_threshold
        self.time_window_hours = time_window_hours

    def normalize_title(self, title: str) -> str:
        """
        Normalize a title for comparison.

        Args:
            title: Article title

        Returns:
            Normalized title (lowercase, stripped)
        """
        return title.lower().strip()

    def simple_stem(self, word: str) -> str:
        """
        Simple German word stemming (remove common suffixes).

        Args:
            word: Word to stem

        Returns:
            Stemmed word
        """
        # Strip common German suffixes; longer suffixes are listed first so
        # they match before their shorter substrings ('ungen' before 'ung').
        suffixes = ['ungen', 'ung', 'en', 'er', 'e', 'n', 's']
        for suffix in suffixes:
            if len(word) > 5 and word.endswith(suffix):
                return word[:-len(suffix)]
        return word

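    # A rough illustration of the stemmer's behavior, traced against the
    # suffix list above: simple_stem('regierungen') -> 'regier' ('ungen'
    # stripped), simple_stem('wahlen') -> 'wahl' ('en' stripped), while short
    # words like 'plan' pass through unchanged due to the len(word) > 5 guard.
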
    def extract_keywords(self, text: str) -> set:
        """
        Extract important keywords from text, applying simple stemming.

        Args:
            text: Article title or content

        Returns:
            Set of stemmed keywords
        """
        # Common German stop words to ignore
        stop_words = {
            'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'eines',
            'und', 'oder', 'aber', 'in', 'im', 'am', 'um', 'für', 'von', 'zu', 'nach',
            'bei', 'mit', 'auf', 'an', 'aus', 'über', 'unter', 'gegen', 'durch',
            'ist', 'sind', 'war', 'waren', 'hat', 'haben', 'wird', 'werden', 'wurde', 'wurden',
            'neue', 'neuer', 'neues', 'neuen', 'sich', 'auch', 'nicht', 'nur', 'noch',
            'mehr', 'als', 'wie', 'beim', 'zum', 'zur', 'vom', 'ins', 'ans'
        }

        # Normalize and split
        words = text.lower().strip().split()

        # Filter out stop words and short words, then apply stemming
        keywords = set()
        for word in words:
            # Remove punctuation (keep hyphens, common in German compounds)
            word = ''.join(c for c in word if c.isalnum() or c == '-')

            if len(word) > 3 and word not in stop_words:
                # Apply simple stemming
                stemmed = self.simple_stem(word)
                keywords.add(stemmed)

        return keywords

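    # For example, with a made-up headline, extract_keywords('Neue Regierung
    # plant Steuerreform') drops the stop word 'neue', keeps words longer than
    # three characters, and stems them: {'regier', 'plant', 'steuerreform'}.
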
    def check_same_story_with_ai(self, article1: Dict, article2: Dict) -> bool:
        """
        Use Ollama AI to determine whether two articles cover the same story.

        Args:
            article1: First article
            article2: Second article

        Returns:
            True if they cover the same story, False otherwise
        """
        if not self.ollama_client.enabled:
            # Fall back to keyword-based similarity
            return self.calculate_similarity(article1, article2) >= self.similarity_threshold

        title1 = article1.get('title', '')
        title2 = article2.get('title', '')
        content1 = article1.get('content', '')[:300]  # First 300 chars
        content2 = article2.get('content', '')[:300]

        prompt = f"""Compare these two news articles and determine if they are about the SAME story/event.

Article 1:
Title: {title1}
Content: {content1}

Article 2:
Title: {title2}
Content: {content2}

Answer with ONLY "YES" if they are about the same story/event, or "NO" if they are different stories.
Consider them the same story if they report on the same event, even if from different perspectives.

Answer:"""

        try:
            response = self.ollama_client.generate(prompt, max_tokens=10)
            answer = response.get('text', '').strip().upper()
            return 'YES' in answer
        except Exception as e:
            print(f" ⚠ AI clustering failed: {e}, using fallback")
            # Fall back to keyword-based similarity
            return self.calculate_similarity(article1, article2) >= self.similarity_threshold

    def calculate_similarity(self, article1: Dict, article2: Dict) -> float:
        """
        Calculate the similarity between two articles using title and content.

        Args:
            article1: First article (dict with 'title' and optionally 'content')
            article2: Second article (dict with 'title' and optionally 'content')

        Returns:
            Similarity score (0-1)
        """
        title1 = article1.get('title', '')
        title2 = article2.get('title', '')
        content1 = article1.get('content', '')
        content2 = article2.get('content', '')

        # Extract keywords from titles
        title_keywords1 = self.extract_keywords(title1)
        title_keywords2 = self.extract_keywords(title2)

        # Calculate title similarity (Jaccard index over keyword sets)
        if title_keywords1 and title_keywords2:
            title_intersection = title_keywords1.intersection(title_keywords2)
            title_union = title_keywords1.union(title_keywords2)
            title_similarity = len(title_intersection) / len(title_union) if title_union else 0
        else:
            # Fall back to character-level string similarity
            t1 = self.normalize_title(title1)
            t2 = self.normalize_title(title2)
            title_similarity = SequenceMatcher(None, t1, t2).ratio()

        # If both articles have content, use it for better accuracy
        if content1 and content2:
            # Extract keywords from the first 500 chars of content (for performance)
            content_keywords1 = self.extract_keywords(content1[:500])
            content_keywords2 = self.extract_keywords(content2[:500])

            if content_keywords1 and content_keywords2:
                content_intersection = content_keywords1.intersection(content_keywords2)
                content_union = content_keywords1.union(content_keywords2)
                content_similarity = len(content_intersection) / len(content_union) if content_union else 0

                # Weighted average: title (40%) + content (60%)
                return (title_similarity * 0.4) + (content_similarity * 0.6)

        # Without usable content keywords, use only the title similarity
        return title_similarity

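    # In other words, with J(A, B) = |A ∩ B| / |A ∪ B| as the Jaccard index
    # over keyword sets, the score is 0.4 * J(title1, title2)
    # + 0.6 * J(content1, content2) when both articles yield content keywords,
    # and J(title1, title2) (or a SequenceMatcher ratio) otherwise.
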
    def find_cluster(self, article: Dict, existing_articles: List[Dict]) -> Optional[str]:
        """
        Find whether an article belongs to an existing cluster, using AI.

        Args:
            article: New article to cluster (dict with 'title' and optionally 'content')
            existing_articles: List of existing articles

        Returns:
            The cluster_id if a matching cluster is found, None otherwise
        """
        cutoff_time = datetime.utcnow() - timedelta(hours=self.time_window_hours)

        for existing in existing_articles:
            # Only compare against recent articles
            published_at = existing.get('published_at')
            if published_at and published_at < cutoff_time:
                continue

            # Use AI to check whether both cover the same story
            if self.check_same_story_with_ai(article, existing):
                return existing.get('cluster_id', str(existing.get('_id')))

        return None

    def cluster_article(self, article: Dict, existing_articles: List[Dict]) -> Dict:
        """
        Cluster a single article.

        Args:
            article: Article to cluster
            existing_articles: List of existing articles

        Returns:
            The article with 'cluster_id' and 'is_primary' fields set
        """
        cluster_id = self.find_cluster(article, existing_articles)

        if cluster_id:
            # Add to the existing cluster
            article['cluster_id'] = cluster_id
            article['is_primary'] = False
        else:
            # Start a new cluster; its first article is the primary one
            article['cluster_id'] = str(article.get('_id', datetime.utcnow().timestamp()))
            article['is_primary'] = True

        return article

    def get_cluster_articles(self, cluster_id: str, articles_collection) -> List[Dict]:
        """
        Get all articles in a cluster.

        Args:
            cluster_id: Cluster ID
            articles_collection: MongoDB collection

        Returns:
            List of articles in the cluster
        """
        return list(articles_collection.find({'cluster_id': cluster_id}))