update

2025-11-20 12:20:46 +01:00
parent 2034d96c9e
commit 4415e895e2
1 changed file with 64 additions and 7 deletions
--- a/news_sender/sender_service.py
+++ b/news_sender/sender_service.py
@@ -176,6 +176,7 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
    # Fetch articles for each category separately
    for category in categories:
        # Query for articles in this category from today
        # Fetch more than needed to allow for source diversity filtering
        cursor = articles_collection.find({
            'summary': {'$exists': True, '$ne': None},
            'category': category,
@@ -183,9 +184,10 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
                {'published_at': {'$gte': today_start}},
                {'created_at': {'$gte': today_start}}
            ]
-        }).sort('created_at', -1).limit(articles_per_category)
+        }).sort('created_at', -1).limit(articles_per_category * 3)  # Fetch 3x to allow diversity
        category_articles = []
        source_count = {}  # Track how many articles from each source
        for doc in cursor:
            # Double-check the date to ensure it's from today
@@ -268,15 +270,70 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
                    'is_clustered': False
                })
        # Diversify sources: prioritize articles from different sources
        # Sort by: clustered first, then by source diversity (fewer articles from same source)
        diversified_articles = []
        source_usage = {}
        # First pass: add clustered articles (they represent multiple sources)
        for article in category_articles:
            if article.get('is_clustered'):
                diversified_articles.append(article)
        # Second pass: add non-clustered articles with source diversity
        for article in category_articles:
            if not article.get('is_clustered'):
                source = article.get('source', 'unknown')
                # Prefer sources we haven't used much yet
                if source not in source_usage:
                    source_usage[source] = 0
                # Add article and track source usage
                diversified_articles.append(article)
                source_usage[source] += 1
        # Sort by source diversity: clustered first, then by how many times we've used this source
        diversified_articles.sort(key=lambda x: (
            0 if x.get('is_clustered') else 1,  # Clustered first
            -x.get('article_count', 1),  # More sources in cluster = higher priority
            source_usage.get(x.get('source', 'unknown'), 0)  # Fewer from same source = higher priority
        ))
        # Take only the requested number per category
        category_articles = diversified_articles[:articles_per_category]
        # Add this category's articles to the main list
        articles.extend(category_articles)
-    # Sort articles: clustered articles first (by source count), then by recency
+    # Final sort with source diversity across all categories
-    # This prioritizes stories covered by multiple sources
+    # Prioritize: 1) Clustered articles (most sources first), 2) Non-clustered articles, shuffled within each category
-    articles.sort(key=lambda x: (
+    import random
-        -1 if x.get('is_clustered') else 0,  # Clustered first
+    
-        -x.get('article_count', 1),  # More sources = higher priority
+    # Group by clustered vs non-clustered
-    ), reverse=True)
+    clustered = [a for a in articles if a.get('is_clustered')]
    non_clustered = [a for a in articles if not a.get('is_clustered')]
    # Sort clustered by article count (more sources = more important)
    clustered.sort(key=lambda x: -x.get('article_count', 1))
    # For non-clustered, shuffle within each category to add variety
    # This prevents the same sources from always appearing first
    from collections import defaultdict
    by_category = defaultdict(list)
    for article in non_clustered:
        by_category[article.get('category', 'general')].append(article)
    # Shuffle each category's articles to mix sources
    for cat_articles in by_category.values():
        random.shuffle(cat_articles)
    # Reconstruct non-clustered list with shuffled articles
    non_clustered = []
    for cat in ['general', 'local', 'sports', 'science']:
        non_clustered.extend(by_category[cat])
    # Combine: clustered first, then shuffled non-clustered
    articles = clustered + non_clustered
    return articles