update
This commit is contained in:
@@ -176,6 +176,7 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
|
||||
# Fetch articles for each category separately
|
||||
for category in categories:
|
||||
# Query for articles in this category from today
|
||||
# Fetch more than needed to allow for source diversity filtering
|
||||
cursor = articles_collection.find({
|
||||
'summary': {'$exists': True, '$ne': None},
|
||||
'category': category,
|
||||
@@ -183,9 +184,10 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
|
||||
{'published_at': {'$gte': today_start}},
|
||||
{'created_at': {'$gte': today_start}}
|
||||
]
|
||||
}).sort('created_at', -1).limit(articles_per_category)
|
||||
}).sort('created_at', -1).limit(articles_per_category * 3) # Fetch 3x to allow diversity
|
||||
|
||||
category_articles = []
|
||||
source_count = {} # Track how many articles from each source
|
||||
|
||||
for doc in cursor:
|
||||
# Double-check the date to ensure it's from today
|
||||
@@ -268,15 +270,70 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
|
||||
'is_clustered': False
|
||||
})
|
||||
|
||||
# Diversify sources: prioritize articles from different sources
|
||||
# Sort by: clustered first, then by source diversity (fewer articles from same source)
|
||||
diversified_articles = []
|
||||
source_usage = {}
|
||||
|
||||
# First pass: add clustered articles (they represent multiple sources)
|
||||
for article in category_articles:
|
||||
if article.get('is_clustered'):
|
||||
diversified_articles.append(article)
|
||||
|
||||
# Second pass: add non-clustered articles with source diversity
|
||||
for article in category_articles:
|
||||
if not article.get('is_clustered'):
|
||||
source = article.get('source', 'unknown')
|
||||
# Start a usage counter the first time a source is seen
|
||||
if source not in source_usage:
|
||||
source_usage[source] = 0
|
||||
|
||||
# Add article and track source usage
|
||||
diversified_articles.append(article)
|
||||
source_usage[source] += 1
|
||||
|
||||
# Sort: clustered first, then larger article_count, then fewer articles from the same source
|
||||
diversified_articles.sort(key=lambda x: (
|
||||
0 if x.get('is_clustered') else 1, # Clustered first
|
||||
-x.get('article_count', 1), # More sources in cluster = higher priority
|
||||
source_usage.get(x.get('source', 'unknown'), 0) # Fewer from same source = higher priority
|
||||
))
|
||||
|
||||
# Take only the requested number per category
|
||||
category_articles = diversified_articles[:articles_per_category]
|
||||
|
||||
# Add this category's articles to the main list
|
||||
articles.extend(category_articles)
|
||||
|
||||
# Sort articles: clustered articles first (by source count), then by recency
|
||||
# This prioritizes stories covered by multiple sources
|
||||
articles.sort(key=lambda x: (
|
||||
-1 if x.get('is_clustered') else 0, # Clustered first
|
||||
-x.get('article_count', 1), # More sources = higher priority
|
||||
), reverse=True)
|
||||
# Final sort with source diversity across all categories
|
||||
# Order: 1) clustered articles by article count, 2) non-clustered articles shuffled within each category
|
||||
import random
|
||||
|
||||
# Group by clustered vs non-clustered
|
||||
clustered = [a for a in articles if a.get('is_clustered')]
|
||||
non_clustered = [a for a in articles if not a.get('is_clustered')]
|
||||
|
||||
# Sort clustered by article count (more sources = more important)
|
||||
clustered.sort(key=lambda x: -x.get('article_count', 1))
|
||||
|
||||
# For non-clustered, shuffle within each category to add variety
|
||||
# This prevents the same sources from always appearing first
|
||||
from collections import defaultdict
|
||||
by_category = defaultdict(list)
|
||||
for article in non_clustered:
|
||||
by_category[article.get('category', 'general')].append(article)
|
||||
|
||||
# Shuffle each category's articles to mix sources
|
||||
for cat_articles in by_category.values():
|
||||
random.shuffle(cat_articles)
|
||||
|
||||
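# Rebuild the non-clustered list in a fixed category order; NOTE(review): articles whose category is not in the list below are not re-added - confirm this is intended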
|
||||
non_clustered = []
|
||||
for cat in ['general', 'local', 'sports', 'science']:
|
||||
non_clustered.extend(by_category[cat])
|
||||
|
||||
# Combine: clustered first, then shuffled non-clustered
|
||||
articles = clustered + non_clustered
|
||||
|
||||
return articles
|
||||
|
||||
|
||||
Reference in New Issue
Block a user