This commit is contained in:
2025-11-20 12:20:46 +01:00
parent 2034d96c9e
commit 4415e895e2

View File

@@ -176,6 +176,7 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
# Fetch articles for each category separately
for category in categories:
# Query for articles in this category from today
# Fetch more than needed to allow for source diversity filtering
cursor = articles_collection.find({
'summary': {'$exists': True, '$ne': None},
'category': category,
@@ -183,9 +184,10 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
{'published_at': {'$gte': today_start}},
{'created_at': {'$gte': today_start}}
]
}).sort('created_at', -1).limit(articles_per_category)
}).sort('created_at', -1).limit(articles_per_category * 3) # Fetch 3x to allow diversity
category_articles = []
source_count = {} # Track how many articles from each source
for doc in cursor:
# Double-check the date to ensure it's from today
@@ -268,15 +270,70 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
'is_clustered': False
})
# Diversify sources: prioritize articles from different sources
# Sort by: clustered first, then by source diversity (fewer articles from same source)
diversified_articles = []
source_usage = {}
# First pass: add clustered articles (they represent multiple sources)
for article in category_articles:
if article.get('is_clustered'):
diversified_articles.append(article)
# Second pass: add non-clustered articles with source diversity
for article in category_articles:
if not article.get('is_clustered'):
source = article.get('source', 'unknown')
# Prefer sources we haven't used much yet
if source not in source_usage:
source_usage[source] = 0
# Add article and track source usage
diversified_articles.append(article)
source_usage[source] += 1
# Sort by source diversity: clustered first, then by how many times we've used this source
diversified_articles.sort(key=lambda x: (
0 if x.get('is_clustered') else 1, # Clustered first
-x.get('article_count', 1), # More sources in cluster = higher priority
source_usage.get(x.get('source', 'unknown'), 0) # Fewer from same source = higher priority
))
# Take only the requested number per category
category_articles = diversified_articles[:articles_per_category]
# Add this category's articles to the main list
articles.extend(category_articles)
# Sort articles: clustered articles first (by source count), then by recency
# This prioritizes stories covered by multiple sources
articles.sort(key=lambda x: (
-1 if x.get('is_clustered') else 0, # Clustered first
-x.get('article_count', 1), # More sources = higher priority
), reverse=True)
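# Final ordering across all categories
# Order: 1) clustered articles, highest article_count first; 2) non-clustered articles, grouped by category and randomly shuffled within each category (no recency ordering is applied here)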
import random
# Group by clustered vs non-clustered
clustered = [a for a in articles if a.get('is_clustered')]
non_clustered = [a for a in articles if not a.get('is_clustered')]
# Sort clustered by article count (more sources = more important)
clustered.sort(key=lambda x: -x.get('article_count', 1))
# For non-clustered, shuffle within each category to add variety
# This prevents the same sources from always appearing first
from collections import defaultdict
by_category = defaultdict(list)
for article in non_clustered:
by_category[article.get('category', 'general')].append(article)
# Shuffle each category's articles to mix sources
for cat_articles in by_category.values():
random.shuffle(cat_articles)
# Rebuild the non-clustered list in a fixed category order; NOTE: articles whose category is not in the list below are left out of the result
non_clustered = []
for cat in ['general', 'local', 'sports', 'science']:
non_clustered.extend(by_category[cat])
# Combine: clustered first, then shuffled non-clustered
articles = clustered + non_clustered
return articles