2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions


@@ -13,6 +13,8 @@ from dotenv import load_dotenv
 from rss_utils import extract_article_url, extract_article_summary, extract_published_date
 from config import Config
 from ollama_client import OllamaClient
+from article_clustering import ArticleClusterer
+from cluster_summarizer import create_cluster_summaries
 
 # Load environment variables
 load_dotenv(dotenv_path='../.env')
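Note: article_clustering.py and cluster_summarizer.py are not shown in this diff. Judging from the call sites in the hunks below, the crawler assumes roughly the following interface. This is an illustrative sketch, not the actual implementation:

# Hypothetical sketch of the interface this diff relies on -- the real
# implementations live in article_clustering.py and cluster_summarizer.py.
class ArticleClusterer:
    def __init__(self, ollama_client, similarity_threshold=0.60, time_window_hours=24):
        self.ollama_client = ollama_client
        self.similarity_threshold = similarity_threshold
        self.time_window_hours = time_window_hours

    def cluster_article(self, article_doc, recent_articles):
        """Attach cluster metadata (e.g. a cluster id) to article_doc and return it."""
        ...

def create_cluster_summaries(db, ollama_client):
    """Summarize each cluster; returns {'processed': int, 'succeeded': int, 'failed': int}."""
    ...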
@@ -33,6 +35,9 @@ ollama_client = OllamaClient(
     timeout=Config.OLLAMA_TIMEOUT
 )
 
+# Article Clusterer placeholder (created once ollama_client is ready)
+article_clusterer = None
+
 # Print configuration on startup
 if __name__ != '__main__':
     Config.print_config()
@@ -44,6 +49,14 @@ if __name__ != '__main__':
         print("⚠ Warning: Ollama server is not reachable")
     else:
         print("  Ollama AI summarization: DISABLED")
 
+    # Initialize Article Clusterer with ollama_client
+    article_clusterer = ArticleClusterer(
+        ollama_client=ollama_client,
+        similarity_threshold=0.60,  # fallback threshold; not used when AI is enabled
+        time_window_hours=24        # only cluster against articles from the last 24 hours
+    )
+    print("🔗 Article clustering: ENABLED (AI-powered)")
+
 
 def get_active_rss_feeds():
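The similarity_threshold only matters on the non-AI fallback path, per the inline comment. As a purely illustrative example (not taken from article_clustering.py), a 0.60 threshold could be applied with TF-IDF cosine similarity over title plus summary text:

# Illustrative only: one way a 0.60 similarity threshold could be used
# for duplicate detection when AI clustering is disabled.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def is_duplicate(new_text, recent_texts, threshold=0.60):
    # Compare the new article's text against all recent articles;
    # treat anything at or above the threshold as a duplicate.
    if not recent_texts:
        return False
    vectors = TfidfVectorizer().fit_transform([new_text] + recent_texts)
    scores = cosine_similarity(vectors[0:1], vectors[1:])[0]
    return scores.max() >= threshold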
@@ -394,6 +407,13 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
                 'created_at': datetime.utcnow()
             }
 
+            # Cluster article with existing articles (detect duplicates from other sources)
+            from datetime import timedelta
+            recent_articles = list(articles_collection.find({
+                'published_at': {'$gte': datetime.utcnow() - timedelta(hours=24)}
+            }))
+            article_doc = article_clusterer.cluster_article(article_doc, recent_articles)
+
             try:
                 # Upsert: update if exists, insert if not
                 articles_collection.update_one(
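One thing to watch in this hunk: the 24-hour lookup runs once per crawled article, and the `from datetime import timedelta` is re-executed inside the loop (harmless, but it belongs at the top of the module). An index on published_at keeps the repeated range query cheap; with pymongo that is a one-liner:

# Run once at startup (suggestion, not part of this diff):
articles_collection.create_index([('published_at', -1)])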
@@ -434,6 +454,16 @@ def crawl_all_feeds(max_articles_per_feed=10):
     Crawl all active RSS feeds
     Returns: dict with statistics
     """
+    global article_clusterer
+
+    # Initialize clusterer if not already done (e.g. when running as a script,
+    # where the `if __name__ != '__main__'` block above never fires)
+    if article_clusterer is None:
+        article_clusterer = ArticleClusterer(
+            ollama_client=ollama_client,
+            similarity_threshold=0.60,
+            time_window_hours=24
+        )
+
     print("\n" + "="*60)
     print("🚀 Starting RSS Feed Crawler")
     print("="*60)
@@ -485,12 +515,29 @@ def crawl_all_feeds(max_articles_per_feed=10):
         print(f"   Average time per article: {duration/total_crawled:.1f}s")
     print("="*60 + "\n")
 
+    # Generate neutral summaries for clustered articles
+    cluster_summary_stats = {'processed': 0, 'succeeded': 0, 'failed': 0}
+    if Config.OLLAMA_ENABLED and total_crawled > 0:
+        print("\n" + "="*60)
+        print("🔄 Generating Neutral Summaries for Clustered Articles")
+        print("="*60)
+
+        cluster_summary_stats = create_cluster_summaries(db, ollama_client)
+
+        print("\n" + "="*60)
+        print("✓ Cluster Summarization Complete!")
+        print(f"   Clusters processed: {cluster_summary_stats['processed']}")
+        print(f"   Succeeded: {cluster_summary_stats['succeeded']}")
+        print(f"   Failed: {cluster_summary_stats['failed']}")
+        print("="*60 + "\n")
+
     return {
         'total_feeds': len(feeds),
         'total_articles_crawled': total_crawled,
         'total_summarized': total_summarized,
         'failed_summaries': total_failed,
-        'duration_seconds': round(duration, 2)
+        'duration_seconds': round(duration, 2),
+        'cluster_summaries': cluster_summary_stats
     }
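Callers now receive the cluster summarization stats alongside the crawl totals. For example, reading the dict exactly as returned above (the calling context itself is hypothetical):

stats = crawl_all_feeds(max_articles_per_feed=10)
print(f"Crawled {stats['total_articles_crawled']} articles "
      f"from {stats['total_feeds']} feeds in {stats['duration_seconds']}s")
print(f"Cluster summaries: {stats['cluster_summaries']['succeeded']} succeeded, "
      f"{stats['cluster_summaries']['failed']} failed")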