This commit is contained in:
2025-11-12 13:45:39 +01:00
parent ce6c2f88bd
commit b7d957f100
2 changed files with 230 additions and 15 deletions

View File

@@ -81,13 +81,14 @@ subscribers_collection = db['subscribers']
def get_latest_articles(max_articles=10, hours=24):
"""
Get latest articles with AI summaries from database (from today only)
Includes cluster information for articles with multiple sources
Args:
max_articles: Maximum number of articles to return
hours: Number of hours to look back (default 24)
Returns:
list: Articles with summaries published today
list: Articles with summaries published today, including cluster info
"""
from datetime import timedelta
@@ -97,6 +98,9 @@ def get_latest_articles(max_articles=10, hours=24):
# Get start of today (00:00:00 UTC)
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
# Get cluster summaries collection
cluster_summaries_collection = db['cluster_summaries']
# Query for articles with summaries published today OR created today
# This ensures we only get fresh articles from today
cursor = articles_collection.find({
@@ -110,6 +114,8 @@ def get_latest_articles(max_articles=10, hours=24):
}).sort('created_at', -1).limit(max_articles)
articles = []
processed_clusters = set()
for doc in cursor:
# Double-check the date to ensure it's from today
published_at = doc.get('published_at')
@@ -123,16 +129,77 @@ def get_latest_articles(max_articles=10, hours=24):
if created_at < today_start:
continue
articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'published_at': doc.get('published_at', '')
})
cluster_id = doc.get('cluster_id')
# Check if this article is part of a cluster
if cluster_id and cluster_id not in processed_clusters:
# Get cluster summary
cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
if cluster and cluster.get('article_count', 0) > 1:
# This is a clustered article - get all source links
processed_clusters.add(cluster_id)
# Get all articles in this cluster
cluster_articles = list(articles_collection.find({
'cluster_id': cluster_id
}))
# Build sources list with links
sources = []
for art in cluster_articles:
sources.append({
'name': art.get('source', ''),
'link': art.get('link', ''),
'title': art.get('title', '')
})
articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': cluster.get('neutral_summary', doc.get('summary', '')),
'source': doc.get('source', ''),
'published_at': doc.get('published_at', ''),
'is_clustered': True,
'sources': sources,
'article_count': len(sources)
})
else:
# Single article (no cluster or cluster with only 1 article)
articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'published_at': doc.get('published_at', ''),
'is_clustered': False
})
elif not cluster_id or cluster_id not in processed_clusters:
# No cluster - single article
articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'published_at': doc.get('published_at', ''),
'is_clustered': False
})
# Sort articles: clustered stories first (most sources first), then singles.
# The key is designed for an ascending sort: clustered entries get -1 (which
# sorts ahead of 0 for single articles) and the negated source count places
# larger clusters earlier. Pairing this key with reverse=True inverted both
# criteria and pushed multi-source stories to the end of the list, so the
# sort is now ascending. list.sort() is stable: entries with equal keys keep
# the order in which they were appended.
articles.sort(key=lambda x: (
    -1 if x.get('is_clustered') else 0,  # Clustered first
    -x.get('article_count', 1),  # More sources = higher priority
))
return articles
@@ -170,13 +237,19 @@ def render_newsletter_html(articles, tracking_enabled=False, pixel_tracking_id=N
template = Template(template_content)
# Split articles into sections
# Top 3 are "trending", rest are "other articles"
trending_articles = articles[:3] if len(articles) >= 3 else articles
other_articles = articles[3:] if len(articles) > 3 else []
# Prepare template data
now = datetime.now()
template_data = {
'date': now.strftime('%A, %B %d, %Y'),
'year': now.year,
'article_count': len(articles),
'articles': articles,
'trending_articles': trending_articles,
'other_articles': other_articles,
'unsubscribe_link': f'{Config.WEBSITE_URL}/unsubscribe',
'website_link': Config.WEBSITE_URL,
'tracking_enabled': tracking_enabled