This commit is contained in:
2025-11-12 23:28:51 +01:00
parent 5dcbf9002c
commit 029befba96

View File

@@ -79,44 +79,48 @@ articles_collection = db['articles']
subscribers_collection = db['subscribers']
def get_latest_articles(max_articles=10, hours=24):
def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
"""
Get latest articles with AI summaries from database (from today only)
Get latest articles with AI summaries from database, fetched per category
Includes cluster information for articles with multiple sources
Args:
max_articles: Maximum number of articles to return
categories: List of categories to fetch (None = the built-in default list: general, local, sports, science)
articles_per_category: Maximum number of articles per category (default 3)
hours: Number of hours to look back (default 24)
Returns:
list: Articles with summaries published today, including cluster info
list: Flat list of articles with summaries from today, at most articles_per_category per category (merged and re-sorted, not grouped by category)
"""
from datetime import timedelta
# Calculate cutoff time (e.g., 24 hours ago)
cutoff_time = datetime.utcnow() - timedelta(hours=hours)
# Get start of today (00:00:00 UTC)
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
# Get cluster summaries collection
cluster_summaries_collection = db['cluster_summaries']
# Query for articles with summaries published today OR created today
# This ensures we only get fresh articles from today
cursor = articles_collection.find({
'summary': {'$exists': True, '$ne': None},
'$or': [
# Articles published today (if published_at is available)
{'published_at': {'$gte': today_start}},
# Articles created today (fallback if published_at is missing)
{'created_at': {'$gte': today_start}}
]
}).sort('created_at', -1).limit(max_articles)
# If no categories specified, get all available categories
if categories is None:
categories = ['general', 'local', 'sports', 'science']
articles = []
processed_clusters = set()
# Fetch articles for each category separately
for category in categories:
# Query for articles in this category from today
cursor = articles_collection.find({
'summary': {'$exists': True, '$ne': None},
'category': category,
'$or': [
{'published_at': {'$gte': today_start}},
{'created_at': {'$gte': today_start}}
]
}).sort('created_at', -1).limit(articles_per_category)
category_articles = []
for doc in cursor:
# Double-check the date to ensure it's from today
published_at = doc.get('published_at')
@@ -155,7 +159,7 @@ def get_latest_articles(max_articles=10, hours=24):
'title': art.get('title', '')
})
articles.append({
category_articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
@@ -171,7 +175,7 @@ def get_latest_articles(max_articles=10, hours=24):
})
else:
# Single article (no cluster or cluster with only 1 article)
articles.append({
category_articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
@@ -185,7 +189,7 @@ def get_latest_articles(max_articles=10, hours=24):
})
elif not cluster_id or cluster_id not in processed_clusters:
# No cluster - single article
articles.append({
category_articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
@@ -198,6 +202,9 @@ def get_latest_articles(max_articles=10, hours=24):
'is_clustered': False
})
# Add this category's articles to the main list
articles.extend(category_articles)
# Sort articles: clustered articles first (by source count), then by recency
# This prioritizes stories covered by multiple sources
articles.sort(key=lambda x: (
@@ -383,11 +390,11 @@ def send_newsletter(max_articles=None, test_email=None):
}
# Get articles from today only
max_articles = max_articles or Config.MAX_ARTICLES
today_date = datetime.now().strftime('%B %d, %Y')
print(f"\nFetching articles published TODAY ({today_date})...")
print(f" Max articles: {max_articles}")
articles = get_latest_articles(max_articles, hours=Config.HOURS_LOOKBACK)
print(f" Articles per category: 3")
# Fetch all categories - filtering per subscriber happens later
articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=Config.HOURS_LOOKBACK)
if not articles:
print("❌ No articles from today with summaries found")
@@ -509,15 +516,14 @@ def preview_newsletter(max_articles=None, hours=None):
Generate newsletter HTML for preview (doesn't send)
Args:
max_articles: Maximum number of articles to include
max_articles: Maximum number of articles to include (ignored, uses 3 per category)
hours: Hours to look back (default from config)
Returns:
str: HTML content
"""
max_articles = max_articles or Config.MAX_ARTICLES
hours = hours or Config.HOURS_LOOKBACK
articles = get_latest_articles(max_articles, hours=hours)
articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=hours)
if not articles:
today_date = datetime.now().strftime('%B %d, %Y')