update
news_crawler/article_clustering.py (new file, 246 lines)
@@ -0,0 +1,246 @@
"""
Article Clustering Module
Detects and groups similar articles from different sources using Ollama AI
"""
from difflib import SequenceMatcher
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from ollama_client import OllamaClient


class ArticleClusterer:
    """
    Clusters articles about the same story from different sources using Ollama AI
    """

    def __init__(self, ollama_client: OllamaClient, similarity_threshold=0.75, time_window_hours=24):
        """
        Initialize clusterer

        Args:
            ollama_client: OllamaClient instance for AI-based similarity detection
            similarity_threshold: Minimum similarity (0-1) to consider two articles the same story
            time_window_hours: Time window in which to look for similar articles
        """
        self.ollama_client = ollama_client
        self.similarity_threshold = similarity_threshold
        self.time_window_hours = time_window_hours

    def normalize_title(self, title: str) -> str:
        """
        Normalize title for comparison

        Args:
            title: Article title

        Returns:
            Normalized title (lowercase, stripped)
        """
        return title.lower().strip()

    def simple_stem(self, word: str) -> str:
        """
        Simple German word stemming (remove common suffixes)

        Args:
            word: Word to stem

        Returns:
            Stemmed word
        """
        # Remove common German suffixes (longest first, so 'ungen' wins over 'ung')
        suffixes = ['ungen', 'ung', 'en', 'er', 'e', 'n', 's']
        for suffix in suffixes:
            if len(word) > 5 and word.endswith(suffix):
                return word[:-len(suffix)]
        return word

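    # Illustrative stemming examples, derived from the suffix list above:
    #   simple_stem('regierung') -> 'regier'  (strips 'ung')
    #   simple_stem('zeitungen') -> 'zeit'    (strips 'ungen')
    #   simple_stem('wahlen')    -> 'wahl'    (strips 'en')
    #   simple_stem('haus')      -> 'haus'    (len <= 5, left unchanged)
    # Only the first matching suffix is removed, so the stemming is
    # deliberately shallow.
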
    def extract_keywords(self, text: str) -> set:
        """
        Extract important keywords from text with simple stemming

        Args:
            text: Article title or content

        Returns:
            Set of stemmed keywords
        """
        # Common German stop words to ignore
        stop_words = {
            'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer', 'eines',
            'und', 'oder', 'aber', 'in', 'im', 'am', 'um', 'für', 'von', 'zu', 'nach',
            'bei', 'mit', 'auf', 'an', 'aus', 'über', 'unter', 'gegen', 'durch',
            'ist', 'sind', 'war', 'waren', 'hat', 'haben', 'wird', 'werden', 'wurde', 'wurden',
            'neue', 'neuer', 'neues', 'neuen', 'sich', 'auch', 'nicht', 'nur', 'noch',
            'mehr', 'als', 'wie', 'beim', 'zum', 'zur', 'vom', 'ins', 'ans'
        }

        # Normalize and split
        words = text.lower().strip().split()

        # Filter out stop words and short words, then apply stemming
        keywords = set()
        for word in words:
            # Remove punctuation (keep hyphens)
            word = ''.join(c for c in word if c.isalnum() or c == '-')

            if len(word) > 3 and word not in stop_words:
                # Apply simple stemming
                stemmed = self.simple_stem(word)
                keywords.add(stemmed)

        return keywords

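    # Worked example (illustrative): for the text
    # 'Die Regierung plant neue Steuern.' the pipeline lowercases, strips
    # punctuation, drops stop words ('neue') and short words ('die'), then
    # stems the rest, yielding {'regier', 'plant', 'steuer'}.
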
    def check_same_story_with_ai(self, article1: Dict, article2: Dict) -> bool:
        """
        Use Ollama AI to determine if two articles are about the same story

        Args:
            article1: First article
            article2: Second article

        Returns:
            True if same story, False otherwise
        """
        if not self.ollama_client.enabled:
            # Fall back to keyword-based similarity
            return self.calculate_similarity(article1, article2) >= self.similarity_threshold

        title1 = article1.get('title', '')
        title2 = article2.get('title', '')
        content1 = article1.get('content', '')[:300]  # First 300 chars
        content2 = article2.get('content', '')[:300]

        prompt = f"""Compare these two news articles and determine if they are about the SAME story/event.

Article 1:
Title: {title1}
Content: {content1}

Article 2:
Title: {title2}
Content: {content2}

Answer with ONLY "YES" if they are about the same story/event, or "NO" if they are different stories.
Consider them the same story if they report on the same event, even if from different perspectives.

Answer:"""

        try:
            response = self.ollama_client.generate(prompt, max_tokens=10)
            answer = response.get('text', '').strip().upper()
            return 'YES' in answer
        except Exception as e:
            print(f" ⚠ AI clustering failed: {e}, using fallback")
            # Fall back to keyword-based similarity
            return self.calculate_similarity(article1, article2) >= self.similarity_threshold

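    # Note on the answer parsing above: the substring check is deliberately
    # lenient, so model replies like 'Yes.' or 'YES, same event' all count as
    # a match ('YES' in 'YES.' -> True), while any other reply, including an
    # empty response, falls through to False.
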
    def calculate_similarity(self, article1: Dict, article2: Dict) -> float:
        """
        Calculate similarity between two articles using title and content

        Args:
            article1: First article (dict with 'title' and optionally 'content')
            article2: Second article (dict with 'title' and optionally 'content')

        Returns:
            Similarity score (0-1)
        """
        title1 = article1.get('title', '')
        title2 = article2.get('title', '')
        content1 = article1.get('content', '')
        content2 = article2.get('content', '')

        # Extract keywords from titles
        title_keywords1 = self.extract_keywords(title1)
        title_keywords2 = self.extract_keywords(title2)

        # Calculate title similarity (Jaccard index over keyword sets)
        if title_keywords1 and title_keywords2:
            title_intersection = title_keywords1.intersection(title_keywords2)
            title_union = title_keywords1.union(title_keywords2)
            title_similarity = len(title_intersection) / len(title_union) if title_union else 0
        else:
            # Fall back to string similarity
            t1 = self.normalize_title(title1)
            t2 = self.normalize_title(title2)
            title_similarity = SequenceMatcher(None, t1, t2).ratio()

        # If we have content, use it for better accuracy
        if content1 and content2:
            # Extract keywords from the first 500 chars of content (for performance)
            content_keywords1 = self.extract_keywords(content1[:500])
            content_keywords2 = self.extract_keywords(content2[:500])

            if content_keywords1 and content_keywords2:
                content_intersection = content_keywords1.intersection(content_keywords2)
                content_union = content_keywords1.union(content_keywords2)
                content_similarity = len(content_intersection) / len(content_union) if content_union else 0

                # Weighted average: title (40%) + content (60%)
                return (title_similarity * 0.4) + (content_similarity * 0.6)

        # If no content, use only title similarity
        return title_similarity

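    # Worked example (illustrative): the titles 'Regierung plant neue Steuern'
    # and 'Regierung beschließt Steuern' reduce to the keyword sets
    # {'regier', 'plant', 'steuer'} and {'regier', 'beschließt', 'steuer'}.
    # Jaccard similarity = |intersection| / |union| = 2 / 4 = 0.5. With a
    # content similarity of, say, 0.6, the combined score would be
    # 0.5 * 0.4 + 0.6 * 0.6 = 0.56.
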
    def find_cluster(self, article: Dict, existing_articles: List[Dict]) -> Optional[str]:
        """
        Find whether an article belongs to an existing cluster, using AI

        Args:
            article: New article to cluster (dict with 'title' and optionally 'content')
            existing_articles: List of existing articles

        Returns:
            cluster_id if found, None otherwise
        """
        cutoff_time = datetime.utcnow() - timedelta(hours=self.time_window_hours)

        for existing in existing_articles:
            # Only compare against recent articles
            published_at = existing.get('published_at')
            if published_at and published_at < cutoff_time:
                continue

            # Use AI to check whether it is the same story
            if self.check_same_story_with_ai(article, existing):
                return existing.get('cluster_id', str(existing.get('_id')))

        return None

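    # Note: published_at is compared directly against datetime.utcnow(), so
    # stored timestamps are expected to be naive UTC datetimes; a timezone-
    # aware value would raise a TypeError on comparison.
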
    def cluster_article(self, article: Dict, existing_articles: List[Dict]) -> Dict:
        """
        Cluster a single article

        Args:
            article: Article to cluster
            existing_articles: List of existing articles

        Returns:
            Article with cluster_id and is_primary fields
        """
        cluster_id = self.find_cluster(article, existing_articles)

        if cluster_id:
            # Add to existing cluster
            article['cluster_id'] = cluster_id
            article['is_primary'] = False
        else:
            # Create new cluster
            article['cluster_id'] = str(article.get('_id', datetime.utcnow().timestamp()))
            article['is_primary'] = True

        return article

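    # Example outcome (illustrative): a new article matching an existing one
    # inherits that article's cluster_id and gets is_primary=False; an
    # unmatched article with _id 'a2' starts cluster 'a2' with
    # is_primary=True. If it has no _id yet, the current UTC timestamp
    # string is used as the cluster id.
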
    def get_cluster_articles(self, cluster_id: str, articles_collection) -> List[Dict]:
        """
        Get all articles in a cluster

        Args:
            cluster_id: Cluster ID
            articles_collection: MongoDB collection

        Returns:
            List of articles in the cluster
        """
        return list(articles_collection.find({'cluster_id': cluster_id}))

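
# A minimal usage sketch, assuming OllamaClient() can be constructed with
# defaults (its real signature lives in ollama_client.py) and that articles
# are dicts with 'title', 'content', and 'published_at' keys, as the methods
# above expect. With a running Ollama instance the comparison is AI-based;
# otherwise it falls back to keyword similarity.
if __name__ == '__main__':
    clusterer = ArticleClusterer(OllamaClient(), similarity_threshold=0.75)

    existing = [{
        '_id': 'a1',
        'cluster_id': 'a1',
        'title': 'Regierung plant neue Steuern',
        'content': 'Die Regierung plant neue Steuern für das kommende Jahr.',
        'published_at': datetime.utcnow(),
    }]
    new_article = {
        '_id': 'a2',
        'title': 'Regierung beschließt Steuern',
        'content': 'Die Regierung beschließt neue Steuern.',
        'published_at': datetime.utcnow(),
    }

    clustered = clusterer.cluster_article(new_article, existing)
    print(clustered['cluster_id'], clustered['is_primary'])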