Fetch newsletter articles per category (3 per category) instead of a single global max_articles limit

2025-11-12 23:28:51 +01:00
parent 5dcbf9002c
commit 029befba96
1 changed files with 94 additions and 88 deletions
--- a/news_sender/sender_service.py
+++ b/news_sender/sender_service.py
@@ -79,99 +79,117 @@ articles_collection = db['articles']
 subscribers_collection = db['subscribers']


-def get_latest_articles(max_articles=10, hours=24):
+def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
    """
-    Get latest articles with AI summaries from database (from today only)
+    Get latest articles with AI summaries from database, fetched per category
    Includes cluster information for articles with multiple sources
    
    Args:
-        max_articles: Maximum number of articles to return
+        categories: List of categories to fetch (None = all categories)
+        articles_per_category: Maximum number of articles per category (default 3)
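        hours: Number of hours to look back (default 24). NOTE(review): the query shown here filters from today's 00:00 UTC and does not appear to use this value - confirm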
    
    Returns:
-        list: Articles with summaries published today, including cluster info
+        list: Flat list of today's articles with summaries, at most articles_per_category fetched per category
    """
    from datetime import timedelta
    
-    # Calculate cutoff time (e.g., 24 hours ago)
-    cutoff_time = datetime.utcnow() - timedelta(hours=hours)
-    
    # Get start of today (00:00:00 UTC)
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    
    # Get cluster summaries collection
    cluster_summaries_collection = db['cluster_summaries']
    
-    # Query for articles with summaries published today OR created today
-    # This ensures we only get fresh articles from today
-    cursor = articles_collection.find({
-        'summary': {'$exists': True, '$ne': None},
-        '$or': [
-            # Articles published today (if published_at is available)
-            {'published_at': {'$gte': today_start}},
-            # Articles created today (fallback if published_at is missing)
-            {'created_at': {'$gte': today_start}}
-        ]
-    }).sort('created_at', -1).limit(max_articles)
+    # If no categories specified, get all available categories
+    if categories is None:
+        categories = ['general', 'local', 'sports', 'science']
    
    articles = []
    processed_clusters = set()
    
-    for doc in cursor:
-        # Double-check the date to ensure it's from today
-        published_at = doc.get('published_at')
-        created_at = doc.get('created_at')
+    # Fetch articles for each category separately
+    for category in categories:
+        # Query for articles in this category from today
+        cursor = articles_collection.find({
+            'summary': {'$exists': True, '$ne': None},
+            'category': category,
+            '$or': [
+                {'published_at': {'$gte': today_start}},
+                {'created_at': {'$gte': today_start}}
+            ]
+        }).sort('created_at', -1).limit(articles_per_category)
        
-        # Skip if both dates are old (extra safety check)
-        if published_at and isinstance(published_at, datetime):
-            if published_at < today_start:
-                continue
-        elif created_at and isinstance(created_at, datetime):
-            if created_at < today_start:
-                continue
+        category_articles = []
        
-        cluster_id = doc.get('cluster_id')
-        
-        # Check if this article is part of a cluster
-        if cluster_id and cluster_id not in processed_clusters:
-            # Get cluster summary
-            cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
+        for doc in cursor:
+            # Double-check the date to ensure it's from today
+            published_at = doc.get('published_at')
+            created_at = doc.get('created_at')
            
-            if cluster and cluster.get('article_count', 0) > 1:
-                # This is a clustered article - get all source links
-                processed_clusters.add(cluster_id)
+            # Skip if published_at predates today; created_at is only checked when published_at is missing or not a datetime
+            if published_at and isinstance(published_at, datetime):
+                if published_at < today_start:
+                    continue
+            elif created_at and isinstance(created_at, datetime):
+                if created_at < today_start:
+                    continue
+            
+            cluster_id = doc.get('cluster_id')
+            
+            # Check if this article is part of a cluster
+            if cluster_id and cluster_id not in processed_clusters:
+                # Get cluster summary
+                cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
                
-                # Get all articles in this cluster
-                cluster_articles = list(articles_collection.find({
-                    'cluster_id': cluster_id
-                }))
-                
-                # Build sources list with links
-                sources = []
-                for art in cluster_articles:
-                    sources.append({
-                        'name': art.get('source', ''),
-                        'link': art.get('link', ''),
-                        'title': art.get('title', '')
+                if cluster and cluster.get('article_count', 0) > 1:
+                    # This is a clustered article - get all source links
+                    processed_clusters.add(cluster_id)
+                    
+                    # Get all articles in this cluster
+                    cluster_articles = list(articles_collection.find({
+                        'cluster_id': cluster_id
+                    }))
+                    
+                    # Build sources list with links
+                    sources = []
+                    for art in cluster_articles:
+                        sources.append({
+                            'name': art.get('source', ''),
+                            'link': art.get('link', ''),
+                            'title': art.get('title', '')
+                        })
+                    
+                    category_articles.append({
+                        'title': doc.get('title', ''),
+                        'title_en': doc.get('title_en'),
+                        'translated_at': doc.get('translated_at'),
+                        'author': doc.get('author'),
+                        'link': doc.get('link', ''),
+                        'summary': cluster.get('neutral_summary', doc.get('summary', '')),
+                        'source': doc.get('source', ''),
+                        'category': doc.get('category', 'general'),
+                        'published_at': doc.get('published_at', ''),
+                        'is_clustered': True,
+                        'sources': sources,
+                        'article_count': len(sources)
                    })
-                
-                articles.append({
-                    'title': doc.get('title', ''),
-                    'title_en': doc.get('title_en'),
-                    'translated_at': doc.get('translated_at'),
-                    'author': doc.get('author'),
-                    'link': doc.get('link', ''),
-                    'summary': cluster.get('neutral_summary', doc.get('summary', '')),
-                    'source': doc.get('source', ''),
-                    'category': doc.get('category', 'general'),
-                    'published_at': doc.get('published_at', ''),
-                    'is_clustered': True,
-                    'sources': sources,
-                    'article_count': len(sources)
-                })
-            else:
-                # Single article (no cluster or cluster with only 1 article)
-                articles.append({
+                else:
+                    # Single article (no cluster or cluster with only 1 article)
+                    category_articles.append({
+                        'title': doc.get('title', ''),
+                        'title_en': doc.get('title_en'),
+                        'translated_at': doc.get('translated_at'),
+                        'author': doc.get('author'),
+                        'link': doc.get('link', ''),
+                        'summary': doc.get('summary', ''),
+                        'source': doc.get('source', ''),
+                        'category': doc.get('category', 'general'),
+                        'published_at': doc.get('published_at', ''),
+                        'is_clustered': False
+                    })
+            elif not cluster_id or cluster_id not in processed_clusters:
+                # No cluster - single article
+                category_articles.append({
                    'title': doc.get('title', ''),
                    'title_en': doc.get('title_en'),
                    'translated_at': doc.get('translated_at'),
@@ -183,20 +201,9 @@ def get_latest_articles(max_articles=10, hours=24):
                    'published_at': doc.get('published_at', ''),
                    'is_clustered': False
                })
-        elif not cluster_id or cluster_id not in processed_clusters:
-            # No cluster - single article
-            articles.append({
-                'title': doc.get('title', ''),
-                'title_en': doc.get('title_en'),
-                'translated_at': doc.get('translated_at'),
-                'author': doc.get('author'),
-                'link': doc.get('link', ''),
-                'summary': doc.get('summary', ''),
-                'source': doc.get('source', ''),
-                'category': doc.get('category', 'general'),
-                'published_at': doc.get('published_at', ''),
-                'is_clustered': False
-            })
+        
+        # Add this category's articles to the main list
+        articles.extend(category_articles)
    
    # Sort articles: clustered articles first (by source count), then by recency
    # This prioritizes stories covered by multiple sources
@@ -383,11 +390,11 @@ def send_newsletter(max_articles=None, test_email=None):
        }
    
    # Get articles from today only
-    max_articles = max_articles or Config.MAX_ARTICLES
    today_date = datetime.now().strftime('%B %d, %Y')
    print(f"\nFetching articles published TODAY ({today_date})...")
-    print(f"  Max articles: {max_articles}")
-    articles = get_latest_articles(max_articles, hours=Config.HOURS_LOOKBACK)
+    print(f"  Articles per category: 3")
+    # Fetch all categories - filtering per subscriber happens later
+    articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=Config.HOURS_LOOKBACK)
    
    if not articles:
        print("❌ No articles from today with summaries found")
@@ -509,15 +516,14 @@ def preview_newsletter(max_articles=None, hours=None):
    Generate newsletter HTML for preview (doesn't send)
    
    Args:
-        max_articles: Maximum number of articles to include
+        max_articles: Maximum number of articles to include (ignored, uses 3 per category)
        hours: Hours to look back (default from config)
        
    Returns:
        str: HTML content
    """
-    max_articles = max_articles or Config.MAX_ARTICLES
    hours = hours or Config.HOURS_LOOKBACK
-    articles = get_latest_articles(max_articles, hours=hours)
+    articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=hours)
    
    if not articles:
        today_date = datetime.now().strftime('%B %d, %Y')