diff --git a/news_sender/sender_service.py b/news_sender/sender_service.py index cfe732d..8a43ad5 100644 --- a/news_sender/sender_service.py +++ b/news_sender/sender_service.py @@ -176,6 +176,7 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3, # Fetch articles for each category separately for category in categories: # Query for articles in this category from today + # Fetch more than needed to allow for source diversity filtering cursor = articles_collection.find({ 'summary': {'$exists': True, '$ne': None}, 'category': category, @@ -183,9 +184,10 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3, {'published_at': {'$gte': today_start}}, {'created_at': {'$gte': today_start}} ] - }).sort('created_at', -1).limit(articles_per_category) + }).sort('created_at', -1).limit(articles_per_category * 3) # Fetch 3x to allow diversity category_articles = [] + source_count = {} # Track how many articles from each source for doc in cursor: # Double-check the date to ensure it's from today @@ -268,15 +270,70 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3, 'is_clustered': False }) + # Diversify sources: prioritize articles from different sources + # Sort by: clustered first, then by source diversity (fewer articles from same source) + diversified_articles = [] + source_usage = {} + + # First pass: add clustered articles (they represent multiple sources) + for article in category_articles: + if article.get('is_clustered'): + diversified_articles.append(article) + + # Second pass: add non-clustered articles with source diversity + for article in category_articles: + if not article.get('is_clustered'): + source = article.get('source', 'unknown') + # Prefer sources we haven't used much yet + if source not in source_usage: + source_usage[source] = 0 + + # Add article and track source usage + diversified_articles.append(article) + source_usage[source] += 1 + + # Sort by source diversity: clustered first, then by how many times we've used this source + diversified_articles.sort(key=lambda x: ( + 0 if x.get('is_clustered') else 1, # Clustered first + -x.get('article_count', 1), # More sources in cluster = higher priority + source_usage.get(x.get('source', 'unknown'), 0) # Fewer from same source = higher priority + )) + + # Take only the requested number per category + category_articles = diversified_articles[:articles_per_category] + # Add this category's articles to the main list articles.extend(category_articles) - # Sort articles: clustered articles first (by source count), then by recency - # This prioritizes stories covered by multiple sources - articles.sort(key=lambda x: ( - -1 if x.get('is_clustered') else 0, # Clustered first - -x.get('article_count', 1), # More sources = higher priority - ), reverse=True) + # Final sort with source diversity across all categories + # Prioritize: 1) Clustered articles, 2) Source diversity, 3) Recency + import random + + # Group by clustered vs non-clustered + clustered = [a for a in articles if a.get('is_clustered')] + non_clustered = [a for a in articles if not a.get('is_clustered')] + + # Sort clustered by article count (more sources = more important) + clustered.sort(key=lambda x: -x.get('article_count', 1)) + + # For non-clustered, shuffle within each category to add variety + # This prevents the same sources from always appearing first + from collections import defaultdict + by_category = defaultdict(list) + for article in non_clustered: + by_category[article.get('category', 'general')].append(article) + + # Shuffle each category's articles to mix sources + for cat_articles in by_category.values(): + random.shuffle(cat_articles) + + # Reconstruct non-clustered list with shuffled articles + non_clustered = [] + for cat in ['general', 'local', 'sports', 'science']: + non_clustered.extend(by_category[cat]) + + # Combine: clustered first, then shuffled non-clustered + articles = clustered + non_clustered return articles