update

2025-11-20 12:20:46 +01:00
parent 2034d96c9e
commit 4415e895e2
1 changed files with 64 additions and 7 deletions
--- a/news_sender/sender_service.py
+++ b/news_sender/sender_service.py
@@ -176,6 +176,7 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
    # Fetch articles for each category separately
    for category in categories:
        # Query for articles in this category from today
+        # Fetch more than needed to allow for source diversity filtering
        cursor = articles_collection.find({
            'summary': {'$exists': True, '$ne': None},
            'category': category,
@@ -183,9 +184,10 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
                {'published_at': {'$gte': today_start}},
                {'created_at': {'$gte': today_start}}
            ]
-        }).sort('created_at', -1).limit(articles_per_category)
+        }).sort('created_at', -1).limit(articles_per_category * 3)  # Fetch 3x to allow diversity
        
        category_articles = []
+        source_count = {}  # Track how many articles from each source
        
        for doc in cursor:
            # Double-check the date to ensure it's from today
@@ -268,15 +270,70 @@ def get_latest_articles_by_categories(categories=None, articles_per_category=3,
                    'is_clustered': False
                })
        
+        # Diversify sources: prioritize articles from different sources
+        # Sort by: clustered first, then by source diversity (fewer articles from same source)
+        diversified_articles = []
+        source_usage = {}
+        
+        # First pass: add clustered articles (they represent multiple sources)
+        for article in category_articles:
+            if article.get('is_clustered'):
+                diversified_articles.append(article)
+        
+        # Second pass: add non-clustered articles with source diversity
+        for article in category_articles:
+            if not article.get('is_clustered'):
+                source = article.get('source', 'unknown')
+                # Start this source's counter the first time it appears
+                if source not in source_usage:
+                    source_usage[source] = 0
+                
+                # Add article and track source usage
+                diversified_articles.append(article)
+                source_usage[source] += 1
+        
+        # Sort by source diversity: clustered first, then by how many times we've used this source
+        diversified_articles.sort(key=lambda x: (
+            0 if x.get('is_clustered') else 1,  # Clustered first
+            -x.get('article_count', 1),  # More sources in cluster = higher priority
+            source_usage.get(x.get('source', 'unknown'), 0)  # Fewer from same source = higher priority
+        ))
+        
+        # Take only the requested number per category
+        category_articles = diversified_articles[:articles_per_category]
+        
        # Add this category's articles to the main list
        articles.extend(category_articles)
    
-    # Sort articles: clustered articles first (by source count), then by recency
-    # This prioritizes stories covered by multiple sources
-    articles.sort(key=lambda x: (
-        -1 if x.get('is_clustered') else 0,  # Clustered first
-        -x.get('article_count', 1),  # More sources = higher priority
-    ), reverse=True)
+    # Final ordering across all categories
+    # Order: 1) Clustered articles by article count, 2) Non-clustered articles grouped by category, shuffled within each
+    import random
+    
+    # Group by clustered vs non-clustered
+    clustered = [a for a in articles if a.get('is_clustered')]
+    non_clustered = [a for a in articles if not a.get('is_clustered')]
+    
+    # Sort clustered by article count (more sources = more important)
+    clustered.sort(key=lambda x: -x.get('article_count', 1))
+    
+    # For non-clustered, shuffle within each category to add variety
+    # This prevents the same sources from always appearing first
+    from collections import defaultdict
+    by_category = defaultdict(list)
+    for article in non_clustered:
+        by_category[article.get('category', 'general')].append(article)
+    
+    # Shuffle each category's articles to mix sources
+    for cat_articles in by_category.values():
+        random.shuffle(cat_articles)
+    
+    # Reconstruct non-clustered list with shuffled articles
+    non_clustered = []
+    for cat in ['general', 'local', 'sports', 'science']:
+        non_clustered.extend(by_category[cat])
+    
+    # Combine: clustered first, then shuffled non-clustered
+    articles = clustered + non_clustered
    
    return articles