update
This commit is contained in:
@@ -13,6 +13,8 @@ from dotenv import load_dotenv
|
||||
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
from config import Config
|
||||
from ollama_client import OllamaClient
|
||||
from article_clustering import ArticleClusterer
|
||||
from cluster_summarizer import create_cluster_summaries
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
@@ -33,6 +35,9 @@ ollama_client = OllamaClient(
|
||||
timeout=Config.OLLAMA_TIMEOUT
|
||||
)
|
||||
|
||||
# Article Clusterer placeholder (the real instance is created later, once ollama_client is available)
|
||||
article_clusterer = None
|
||||
|
||||
# Print configuration on startup
|
||||
if __name__ != '__main__':
|
||||
Config.print_config()
|
||||
@@ -44,6 +49,14 @@ if __name__ != '__main__':
|
||||
print("⚠ Warning: Ollama server is not reachable")
|
||||
else:
|
||||
print("ℹ Ollama AI summarization: DISABLED")
|
||||
|
||||
# Initialize Article Clusterer with ollama_client
|
||||
article_clusterer = ArticleClusterer(
|
||||
ollama_client=ollama_client,
|
||||
similarity_threshold=0.60, # Not used when AI is enabled
|
||||
time_window_hours=24 # Look back 24 hours
|
||||
)
|
||||
print("🔗 Article clustering: ENABLED (AI-powered)")
|
||||
|
||||
|
||||
def get_active_rss_feeds():
|
||||
@@ -394,6 +407,13 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
# Cluster article with existing articles (detect duplicates from other sources)
|
||||
from datetime import timedelta
|
||||
recent_articles = list(articles_collection.find({
|
||||
'published_at': {'$gte': datetime.utcnow() - timedelta(hours=24)}
|
||||
}))
|
||||
article_doc = article_clusterer.cluster_article(article_doc, recent_articles)
|
||||
|
||||
try:
|
||||
# Upsert: update if exists, insert if not
|
||||
articles_collection.update_one(
|
||||
@@ -434,6 +454,16 @@ def crawl_all_feeds(max_articles_per_feed=10):
|
||||
Crawl all active RSS feeds
|
||||
Returns: dict with statistics
|
||||
"""
|
||||
global article_clusterer
|
||||
|
||||
# Initialize clusterer if not already done
|
||||
if article_clusterer is None:
|
||||
article_clusterer = ArticleClusterer(
|
||||
ollama_client=ollama_client,
|
||||
similarity_threshold=0.60,
|
||||
time_window_hours=24
|
||||
)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("🚀 Starting RSS Feed Crawler")
|
||||
print("="*60)
|
||||
@@ -485,12 +515,29 @@ def crawl_all_feeds(max_articles_per_feed=10):
|
||||
print(f" Average time per article: {duration/total_crawled:.1f}s")
|
||||
print("="*60 + "\n")
|
||||
|
||||
# Generate neutral summaries for clustered articles
|
||||
cluster_summary_stats = {'processed': 0, 'succeeded': 0, 'failed': 0}
|
||||
if Config.OLLAMA_ENABLED and total_crawled > 0:
|
||||
print("\n" + "="*60)
|
||||
print("🔄 Generating Neutral Summaries for Clustered Articles")
|
||||
print("="*60)
|
||||
|
||||
cluster_summary_stats = create_cluster_summaries(db, ollama_client)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print(f"✓ Cluster Summarization Complete!")
|
||||
print(f" Clusters processed: {cluster_summary_stats['processed']}")
|
||||
print(f" Succeeded: {cluster_summary_stats['succeeded']}")
|
||||
print(f" Failed: {cluster_summary_stats['failed']}")
|
||||
print("="*60 + "\n")
|
||||
|
||||
return {
|
||||
'total_feeds': len(feeds),
|
||||
'total_articles_crawled': total_crawled,
|
||||
'total_summarized': total_summarized,
|
||||
'failed_summaries': total_failed,
|
||||
'duration_seconds': round(duration, 2)
|
||||
'duration_seconds': round(duration, 2),
|
||||
'cluster_summaries': cluster_summary_stats
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user