This commit is contained in:
2025-11-12 23:28:51 +01:00
parent 5dcbf9002c
commit 029befba96

View File

@@ -79,99 +79,117 @@ articles_collection = db['articles']
subscribers_collection = db['subscribers']
def get_latest_articles(max_articles=10, hours=24):
def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
"""
Get latest articles with AI summaries from database (from today only)
Get latest articles with AI summaries from database, fetched per category
Includes cluster information for articles with multiple sources
Args:
max_articles: Maximum number of articles to return
categories: List of categories to fetch (None = all categories)
articles_per_category: Maximum number of articles per category (default 3)
hours: Number of hours to look back (default 24)
Returns:
list: Articles with summaries published today, including cluster info
list: Articles with summaries published today, grouped by category
"""
from datetime import timedelta
# Calculate cutoff time (e.g., 24 hours ago)
cutoff_time = datetime.utcnow() - timedelta(hours=hours)
# Get start of today (00:00:00 UTC)
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
# Get cluster summaries collection
cluster_summaries_collection = db['cluster_summaries']
# Query for articles with summaries published today OR created today
# This ensures we only get fresh articles from today
cursor = articles_collection.find({
'summary': {'$exists': True, '$ne': None},
'$or': [
# Articles published today (if published_at is available)
{'published_at': {'$gte': today_start}},
# Articles created today (fallback if published_at is missing)
{'created_at': {'$gte': today_start}}
]
}).sort('created_at', -1).limit(max_articles)
# If no categories specified, get all available categories
if categories is None:
categories = ['general', 'local', 'sports', 'science']
articles = []
processed_clusters = set()
for doc in cursor:
# Double-check the date to ensure it's from today
published_at = doc.get('published_at')
created_at = doc.get('created_at')
# Fetch articles for each category separately
for category in categories:
# Query for articles in this category from today
cursor = articles_collection.find({
'summary': {'$exists': True, '$ne': None},
'category': category,
'$or': [
{'published_at': {'$gte': today_start}},
{'created_at': {'$gte': today_start}}
]
}).sort('created_at', -1).limit(articles_per_category)
# Skip if both dates are old (extra safety check)
if published_at and isinstance(published_at, datetime):
if published_at < today_start:
continue
elif created_at and isinstance(created_at, datetime):
if created_at < today_start:
continue
category_articles = []
cluster_id = doc.get('cluster_id')
# Check if this article is part of a cluster
if cluster_id and cluster_id not in processed_clusters:
# Get cluster summary
cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
for doc in cursor:
# Double-check the date to ensure it's from today
published_at = doc.get('published_at')
created_at = doc.get('created_at')
if cluster and cluster.get('article_count', 0) > 1:
# This is a clustered article - get all source links
processed_clusters.add(cluster_id)
# Skip if both dates are old (extra safety check)
if published_at and isinstance(published_at, datetime):
if published_at < today_start:
continue
elif created_at and isinstance(created_at, datetime):
if created_at < today_start:
continue
cluster_id = doc.get('cluster_id')
# Check if this article is part of a cluster
if cluster_id and cluster_id not in processed_clusters:
# Get cluster summary
cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
# Get all articles in this cluster
cluster_articles = list(articles_collection.find({
'cluster_id': cluster_id
}))
# Build sources list with links
sources = []
for art in cluster_articles:
sources.append({
'name': art.get('source', ''),
'link': art.get('link', ''),
'title': art.get('title', '')
if cluster and cluster.get('article_count', 0) > 1:
# This is a clustered article - get all source links
processed_clusters.add(cluster_id)
# Get all articles in this cluster
cluster_articles = list(articles_collection.find({
'cluster_id': cluster_id
}))
# Build sources list with links
sources = []
for art in cluster_articles:
sources.append({
'name': art.get('source', ''),
'link': art.get('link', ''),
'title': art.get('title', '')
})
category_articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': cluster.get('neutral_summary', doc.get('summary', '')),
'source': doc.get('source', ''),
'category': doc.get('category', 'general'),
'published_at': doc.get('published_at', ''),
'is_clustered': True,
'sources': sources,
'article_count': len(sources)
})
articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': cluster.get('neutral_summary', doc.get('summary', '')),
'source': doc.get('source', ''),
'category': doc.get('category', 'general'),
'published_at': doc.get('published_at', ''),
'is_clustered': True,
'sources': sources,
'article_count': len(sources)
})
else:
# Single article (no cluster or cluster with only 1 article)
articles.append({
else:
# Single article (no cluster or cluster with only 1 article)
category_articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'category': doc.get('category', 'general'),
'published_at': doc.get('published_at', ''),
'is_clustered': False
})
elif not cluster_id or cluster_id not in processed_clusters:
# No cluster - single article
category_articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
@@ -183,20 +201,9 @@ def get_latest_articles(max_articles=10, hours=24):
'published_at': doc.get('published_at', ''),
'is_clustered': False
})
elif not cluster_id or cluster_id not in processed_clusters:
# No cluster - single article
articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'category': doc.get('category', 'general'),
'published_at': doc.get('published_at', ''),
'is_clustered': False
})
# Add this category's articles to the main list
articles.extend(category_articles)
# Sort articles: clustered articles first (by source count), then by recency
# This prioritizes stories covered by multiple sources
@@ -383,11 +390,11 @@ def send_newsletter(max_articles=None, test_email=None):
}
# Get articles from today only
max_articles = max_articles or Config.MAX_ARTICLES
today_date = datetime.now().strftime('%B %d, %Y')
print(f"\nFetching articles published TODAY ({today_date})...")
print(f" Max articles: {max_articles}")
articles = get_latest_articles(max_articles, hours=Config.HOURS_LOOKBACK)
print(f" Articles per category: 3")
# Fetch all categories - filtering per subscriber happens later
articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=Config.HOURS_LOOKBACK)
if not articles:
print("❌ No articles from today with summaries found")
@@ -509,15 +516,14 @@ def preview_newsletter(max_articles=None, hours=None):
Generate newsletter HTML for preview (doesn't send)
Args:
max_articles: Maximum number of articles to include
max_articles: Maximum number of articles to include (ignored, uses 3 per category)
hours: Hours to look back (default from config)
Returns:
str: HTML content
"""
max_articles = max_articles or Config.MAX_ARTICLES
hours = hours or Config.HOURS_LOOKBACK
articles = get_latest_articles(max_articles, hours=hours)
articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=hours)
if not articles:
today_date = datetime.now().strftime('%B %d, %Y')