update

2025-11-12 13:45:39 +01:00
parent ce6c2f88bd
commit b7d957f100
2 changed files with 230 additions and 15 deletions
--- a/news_sender/sender_service.py
+++ b/news_sender/sender_service.py
@@ -81,13 +81,14 @@ subscribers_collection = db['subscribers']
 def get_latest_articles(max_articles=10, hours=24):
    """
    Get latest articles with AI summaries from database (from today only)
+    Includes cluster information for articles with multiple sources
    
    Args:
        max_articles: Maximum number of articles to return
        hours: Number of hours to look back (default 24)
    
    Returns:
-        list: Articles with summaries published today
+        list: Articles with summaries published today, including cluster info
    """
    from datetime import timedelta
    
@@ -97,6 +98,9 @@ def get_latest_articles(max_articles=10, hours=24):
    # Get start of today (00:00:00 UTC)
    today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    
+    # Get cluster summaries collection
+    cluster_summaries_collection = db['cluster_summaries']
+    
    # Query for articles with summaries published today OR created today
    # This ensures we only get fresh articles from today
    cursor = articles_collection.find({
@@ -110,6 +114,8 @@ def get_latest_articles(max_articles=10, hours=24):
    }).sort('created_at', -1).limit(max_articles)
    
    articles = []
+    processed_clusters = set()
+    
    for doc in cursor:
        # Double-check the date to ensure it's from today
        published_at = doc.get('published_at')
@@ -123,16 +129,77 @@ def get_latest_articles(max_articles=10, hours=24):
            if created_at < today_start:
                continue
        
-        articles.append({
-            'title': doc.get('title', ''),
-            'title_en': doc.get('title_en'),
-            'translated_at': doc.get('translated_at'),
-            'author': doc.get('author'),
-            'link': doc.get('link', ''),
-            'summary': doc.get('summary', ''),
-            'source': doc.get('source', ''),
-            'published_at': doc.get('published_at', '')
-        })
+        cluster_id = doc.get('cluster_id')
+        
+        # Check if this article is part of a cluster
+        if cluster_id and cluster_id not in processed_clusters:
+            # Get cluster summary
+            cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
+            
+            if cluster and cluster.get('article_count', 0) > 1:
+                # This is a clustered article - get all source links
+                processed_clusters.add(cluster_id)
+                
+                # Get all articles in this cluster
+                cluster_articles = list(articles_collection.find({
+                    'cluster_id': cluster_id
+                }))
+                
+                # Build sources list with links
+                sources = []
+                for art in cluster_articles:
+                    sources.append({
+                        'name': art.get('source', ''),
+                        'link': art.get('link', ''),
+                        'title': art.get('title', '')
+                    })
+                
+                articles.append({
+                    'title': doc.get('title', ''),
+                    'title_en': doc.get('title_en'),
+                    'translated_at': doc.get('translated_at'),
+                    'author': doc.get('author'),
+                    'link': doc.get('link', ''),
+                    'summary': cluster.get('neutral_summary', doc.get('summary', '')),
+                    'source': doc.get('source', ''),
+                    'published_at': doc.get('published_at', ''),
+                    'is_clustered': True,
+                    'sources': sources,
+                    'article_count': len(sources)
+                })
+            else:
+                # Single article (no cluster or cluster with only 1 article)
+                articles.append({
+                    'title': doc.get('title', ''),
+                    'title_en': doc.get('title_en'),
+                    'translated_at': doc.get('translated_at'),
+                    'author': doc.get('author'),
+                    'link': doc.get('link', ''),
+                    'summary': doc.get('summary', ''),
+                    'source': doc.get('source', ''),
+                    'published_at': doc.get('published_at', ''),
+                    'is_clustered': False
+                })
+        elif not cluster_id or cluster_id not in processed_clusters:
+            # No cluster - single article
+            articles.append({
+                'title': doc.get('title', ''),
+                'title_en': doc.get('title_en'),
+                'translated_at': doc.get('translated_at'),
+                'author': doc.get('author'),
+                'link': doc.get('link', ''),
+                'summary': doc.get('summary', ''),
+                'source': doc.get('source', ''),
+                'published_at': doc.get('published_at', ''),
+                'is_clustered': False
+            })
+    
+    # Intended order: clustered articles first (most sources first), then by recency.
+    # FIXME: the negated keys plus reverse=True invert this, so single-source articles sort first.
+    articles.sort(key=lambda x: (
+        -1 if x.get('is_clustered') else 0,  # Clustered first
+        -x.get('article_count', 1),  # More sources = higher priority
+    ), reverse=True)
    
    return articles

@@ -170,13 +237,19 @@ def render_newsletter_html(articles, tracking_enabled=False, pixel_tracking_id=N
    
    template = Template(template_content)
    
+    # Split articles into sections
+    # Top 3 are "trending", rest are "other articles"
+    trending_articles = articles[:3] if len(articles) >= 3 else articles
+    other_articles = articles[3:] if len(articles) > 3 else []
+    
    # Prepare template data
    now = datetime.now()
    template_data = {
        'date': now.strftime('%A, %B %d, %Y'),
        'year': now.year,
        'article_count': len(articles),
-        'articles': articles,
+        'trending_articles': trending_articles,
+        'other_articles': other_articles,
        'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe',
        'website_link': Config.WEBSITE_URL,
        'tracking_enabled': tracking_enabled