diff --git a/news_sender/sender_service.py b/news_sender/sender_service.py index e7dd84f..7cc883a 100644 --- a/news_sender/sender_service.py +++ b/news_sender/sender_service.py @@ -79,99 +79,117 @@ articles_collection = db['articles'] subscribers_collection = db['subscribers'] -def get_latest_articles(max_articles=10, hours=24): +def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24): """ - Get latest articles with AI summaries from database (from today only) + Get latest articles with AI summaries from database, fetched per category Includes cluster information for articles with multiple sources Args: - max_articles: Maximum number of articles to return + categories: List of categories to fetch (None = all categories) + articles_per_category: Maximum number of articles per category (default 3) hours: Number of hours to look back (default 24) Returns: - list: Articles with summaries published today, including cluster info + list: Articles with summaries published today, grouped by category """ from datetime import timedelta - # Calculate cutoff time (e.g., 24 hours ago) - cutoff_time = datetime.utcnow() - timedelta(hours=hours) - # Get start of today (00:00:00 UTC) today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0) # Get cluster summaries collection cluster_summaries_collection = db['cluster_summaries'] - # Query for articles with summaries published today OR created today - # This ensures we only get fresh articles from today - cursor = articles_collection.find({ - 'summary': {'$exists': True, '$ne': None}, - '$or': [ - # Articles published today (if published_at is available) - {'published_at': {'$gte': today_start}}, - # Articles created today (fallback if published_at is missing) - {'created_at': {'$gte': today_start}} - ] - }).sort('created_at', -1).limit(max_articles) + # If no categories specified, get all available categories + if categories is None: + categories = ['general', 'local', 'sports', 'science'] articles = [] processed_clusters = set() - for doc in cursor: - # Double-check the date to ensure it's from today - published_at = doc.get('published_at') - created_at = doc.get('created_at') + # Fetch articles for each category separately + for category in categories: + # Query for articles in this category from today + cursor = articles_collection.find({ + 'summary': {'$exists': True, '$ne': None}, + 'category': category, + '$or': [ + {'published_at': {'$gte': today_start}}, + {'created_at': {'$gte': today_start}} + ] + }).sort('created_at', -1).limit(articles_per_category) - # Skip if both dates are old (extra safety check) - if published_at and isinstance(published_at, datetime): - if published_at < today_start: - continue - elif created_at and isinstance(created_at, datetime): - if created_at < today_start: - continue + category_articles = [] - cluster_id = doc.get('cluster_id') - - # Check if this article is part of a cluster - if cluster_id and cluster_id not in processed_clusters: - # Get cluster summary - cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id}) + for doc in cursor: + # Double-check the date to ensure it's from today + published_at = doc.get('published_at') + created_at = doc.get('created_at') - if cluster and cluster.get('article_count', 0) > 1: - # This is a clustered article - get all source links - processed_clusters.add(cluster_id) + # Skip if both dates are old (extra safety check) + if published_at and isinstance(published_at, datetime): + if published_at < today_start: + continue + elif created_at and isinstance(created_at, datetime): + if created_at < today_start: + continue + + cluster_id = doc.get('cluster_id') + + # Check if this article is part of a cluster + if cluster_id and cluster_id not in processed_clusters: + # Get cluster summary + cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id}) - # Get all articles in this cluster - cluster_articles = list(articles_collection.find({ - 'cluster_id': cluster_id - })) - - # Build sources list with links - sources = [] - for art in cluster_articles: - sources.append({ - 'name': art.get('source', ''), - 'link': art.get('link', ''), - 'title': art.get('title', '') + if cluster and cluster.get('article_count', 0) > 1: + # This is a clustered article - get all source links + processed_clusters.add(cluster_id) + + # Get all articles in this cluster + cluster_articles = list(articles_collection.find({ + 'cluster_id': cluster_id + })) + + # Build sources list with links + sources = [] + for art in cluster_articles: + sources.append({ + 'name': art.get('source', ''), + 'link': art.get('link', ''), + 'title': art.get('title', '') + }) + + category_articles.append({ + 'title': doc.get('title', ''), + 'title_en': doc.get('title_en'), + 'translated_at': doc.get('translated_at'), + 'author': doc.get('author'), + 'link': doc.get('link', ''), + 'summary': cluster.get('neutral_summary', doc.get('summary', '')), + 'source': doc.get('source', ''), + 'category': doc.get('category', 'general'), + 'published_at': doc.get('published_at', ''), + 'is_clustered': True, + 'sources': sources, + 'article_count': len(sources) }) - - articles.append({ - 'title': doc.get('title', ''), - 'title_en': doc.get('title_en'), - 'translated_at': doc.get('translated_at'), - 'author': doc.get('author'), - 'link': doc.get('link', ''), - 'summary': cluster.get('neutral_summary', doc.get('summary', '')), - 'source': doc.get('source', ''), - 'category': doc.get('category', 'general'), - 'published_at': doc.get('published_at', ''), - 'is_clustered': True, - 'sources': sources, - 'article_count': len(sources) - }) - else: - # Single article (no cluster or cluster with only 1 article) - articles.append({ + else: + # Single article (no cluster or cluster with only 1 article) + category_articles.append({ + 'title': doc.get('title', ''), + 'title_en': doc.get('title_en'), + 'translated_at': doc.get('translated_at'), + 'author': doc.get('author'), + 'link': doc.get('link', ''), + 'summary': doc.get('summary', ''), + 'source': doc.get('source', ''), + 'category': doc.get('category', 'general'), + 'published_at': doc.get('published_at', ''), + 'is_clustered': False + }) + elif not cluster_id or cluster_id not in processed_clusters: + # No cluster - single article + category_articles.append({ 'title': doc.get('title', ''), 'title_en': doc.get('title_en'), 'translated_at': doc.get('translated_at'), @@ -183,20 +201,9 @@ def get_latest_articles(max_articles=10, hours=24): 'published_at': doc.get('published_at', ''), 'is_clustered': False }) - elif not cluster_id or cluster_id not in processed_clusters: - # No cluster - single article - articles.append({ - 'title': doc.get('title', ''), - 'title_en': doc.get('title_en'), - 'translated_at': doc.get('translated_at'), - 'author': doc.get('author'), - 'link': doc.get('link', ''), - 'summary': doc.get('summary', ''), - 'source': doc.get('source', ''), - 'category': doc.get('category', 'general'), - 'published_at': doc.get('published_at', ''), - 'is_clustered': False - }) + + # Add this category's articles to the main list + articles.extend(category_articles) # Sort articles: clustered articles first (by source count), then by recency # This prioritizes stories covered by multiple sources @@ -383,11 +390,11 @@ def send_newsletter(max_articles=None, test_email=None): } # Get articles from today only - max_articles = max_articles or Config.MAX_ARTICLES today_date = datetime.now().strftime('%B %d, %Y') print(f"\nFetching articles published TODAY ({today_date})...") - print(f" Max articles: {max_articles}") - articles = get_latest_articles(max_articles, hours=Config.HOURS_LOOKBACK) + print(f" Articles per category: 3") + # Fetch all categories - filtering per subscriber happens later + articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=Config.HOURS_LOOKBACK) if not articles: print("❌ No articles from today with summaries found") @@ -509,15 +516,14 @@ def preview_newsletter(max_articles=None, hours=None): Generate newsletter HTML for preview (doesn't send) Args: - max_articles: Maximum number of articles to include + max_articles: Maximum number of articles to include (ignored, uses 3 per category) hours: Hours to look back (default from config) Returns: str: HTML content """ - max_articles = max_articles or Config.MAX_ARTICLES hours = hours or Config.HOURS_LOOKBACK - articles = get_latest_articles(max_articles, hours=hours) + articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=hours) if not articles: today_date = datetime.now().strftime('%B %d, %Y')