update
This commit is contained in:
@@ -176,6 +176,7 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
|
|||||||
# Fetch articles for each category separately
|
# Fetch articles for each category separately
|
||||||
for category in categories:
|
for category in categories:
|
||||||
# Query for articles in this category from today
|
# Query for articles in this category from today
|
||||||
|
# Fetch more than needed to allow for source diversity filtering
|
||||||
cursor = articles_collection.find({
|
cursor = articles_collection.find({
|
||||||
'summary': {'$exists': True, '$ne': None},
|
'summary': {'$exists': True, '$ne': None},
|
||||||
'category': category,
|
'category': category,
|
||||||
@@ -183,9 +184,10 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
|
|||||||
{'published_at': {'$gte': today_start}},
|
{'published_at': {'$gte': today_start}},
|
||||||
{'created_at': {'$gte': today_start}}
|
{'created_at': {'$gte': today_start}}
|
||||||
]
|
]
|
||||||
}).sort('created_at', -1).limit(articles_per_category)
|
}).sort('created_at', -1).limit(articles_per_category * 3) # Fetch 3x to allow diversity
|
||||||
|
|
||||||
category_articles = []
|
category_articles = []
|
||||||
|
source_count = {} # Track how many articles from each source
|
||||||
|
|
||||||
for doc in cursor:
|
for doc in cursor:
|
||||||
# Double-check the date to ensure it's from today
|
# Double-check the date to ensure it's from today
|
||||||
@@ -268,15 +270,70 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
|
|||||||
'is_clustered': False
|
'is_clustered': False
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Diversify sources: prioritize articles from different sources
|
||||||
|
# Sort by: clustered first, then by source diversity (fewer articles from same source)
|
||||||
|
diversified_articles = []
|
||||||
|
source_usage = {}
|
||||||
|
|
||||||
|
# First pass: add clustered articles (they represent multiple sources)
|
||||||
|
for article in category_articles:
|
||||||
|
if article.get('is_clustered'):
|
||||||
|
diversified_articles.append(article)
|
||||||
|
|
||||||
|
# Second pass: add every non-clustered article and count how many come from each source
|
||||||
|
for article in category_articles:
|
||||||
|
if not article.get('is_clustered'):
|
||||||
|
source = article.get('source', 'unknown')
|
||||||
|
# Start the usage counter at zero the first time a source is seen
|
||||||
|
if source not in source_usage:
|
||||||
|
source_usage[source] = 0
|
||||||
|
|
||||||
|
# Add article and track source usage
|
||||||
|
diversified_articles.append(article)
|
||||||
|
source_usage[source] += 1
|
||||||
|
|
||||||
|
# Sort: clustered first, then larger clusters (higher article_count), then sources used fewer times
|
||||||
|
diversified_articles.sort(key=lambda x: (
|
||||||
|
0 if x.get('is_clustered') else 1, # Clustered first
|
||||||
|
-x.get('article_count', 1), # More sources in cluster = higher priority
|
||||||
|
source_usage.get(x.get('source', 'unknown'), 0) # Fewer from same source = higher priority
|
||||||
|
))
|
||||||
|
|
||||||
|
# Take only the requested number per category
|
||||||
|
category_articles = diversified_articles[:articles_per_category]
|
||||||
|
|
||||||
# Add this category's articles to the main list
|
# Add this category's articles to the main list
|
||||||
articles.extend(category_articles)
|
articles.extend(category_articles)
|
||||||
|
|
||||||
# Sort articles: clustered articles first (by source count), then by recency
|
# Final sort with source diversity across all categories
|
||||||
# This prioritizes stories covered by multiple sources
|
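# Order: 1) clustered articles by article_count (descending), 2) non-clustered articles grouped by category and shuffled within each category (no recency ordering is applied here)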
|
||||||
articles.sort(key=lambda x: (
|
import random
|
||||||
-1 if x.get('is_clustered') else 0, # Clustered first
|
|
||||||
-x.get('article_count', 1), # More sources = higher priority
|
# Group by clustered vs non-clustered
|
||||||
), reverse=True)
|
clustered = [a for a in articles if a.get('is_clustered')]
|
||||||
|
non_clustered = [a for a in articles if not a.get('is_clustered')]
|
||||||
|
|
||||||
|
# Sort clustered by article count (more sources = more important)
|
||||||
|
clustered.sort(key=lambda x: -x.get('article_count', 1))
|
||||||
|
|
||||||
|
# For non-clustered, shuffle within each category to add variety
|
||||||
|
# This prevents the same sources from always appearing first
|
||||||
|
from collections import defaultdict
|
||||||
|
by_category = defaultdict(list)
|
||||||
|
for article in non_clustered:
|
||||||
|
by_category[article.get('category', 'general')].append(article)
|
||||||
|
|
||||||
|
# Shuffle each category's articles to mix sources
|
||||||
|
for cat_articles in by_category.values():
|
||||||
|
random.shuffle(cat_articles)
|
||||||
|
|
||||||
|
# Reconstruct non-clustered list in a fixed category order; NOTE(review): articles whose category is not in the list below are omitted from the result
|
||||||
|
non_clustered = []
|
||||||
|
for cat in ['general', 'local', 'sports', 'science']:
|
||||||
|
non_clustered.extend(by_category[cat])
|
||||||
|
|
||||||
|
# Combine: clustered first, then shuffled non-clustered
|
||||||
|
articles = clustered + non_clustered
|
||||||
|
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user