This commit is contained in:
2025-11-12 23:28:51 +01:00
parent 5dcbf9002c
commit 029befba96

View File

@@ -79,99 +79,117 @@ articles_collection = db['articles']
subscribers_collection = db['subscribers'] subscribers_collection = db['subscribers']
def get_latest_articles(max_articles=10, hours=24): def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
""" """
Get latest articles with AI summaries from database (from today only) Get latest articles with AI summaries from database, fetched per category
Includes cluster information for articles with multiple sources Includes cluster information for articles with multiple sources
Args: Args:
max_articles: Maximum number of articles to return categories: List of categories to fetch (None = all categories)
articles_per_category: Maximum number of articles per category (default 3)
hours: Number of hours to look back (default 24) hours: Number of hours to look back (default 24)
Returns: Returns:
list: Articles with summaries published today, including cluster info list: Articles with summaries published today, grouped by category
""" """
from datetime import timedelta from datetime import timedelta
# Calculate cutoff time (e.g., 24 hours ago)
cutoff_time = datetime.utcnow() - timedelta(hours=hours)
# Get start of today (00:00:00 UTC) # Get start of today (00:00:00 UTC)
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0) today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
# Get cluster summaries collection # Get cluster summaries collection
cluster_summaries_collection = db['cluster_summaries'] cluster_summaries_collection = db['cluster_summaries']
# Query for articles with summaries published today OR created today # If no categories specified, get all available categories
# This ensures we only get fresh articles from today if categories is None:
cursor = articles_collection.find({ categories = ['general', 'local', 'sports', 'science']
'summary': {'$exists': True, '$ne': None},
'$or': [
# Articles published today (if published_at is available)
{'published_at': {'$gte': today_start}},
# Articles created today (fallback if published_at is missing)
{'created_at': {'$gte': today_start}}
]
}).sort('created_at', -1).limit(max_articles)
articles = [] articles = []
processed_clusters = set() processed_clusters = set()
for doc in cursor: # Fetch articles for each category separately
# Double-check the date to ensure it's from today for category in categories:
published_at = doc.get('published_at') # Query for articles in this category from today
created_at = doc.get('created_at') cursor = articles_collection.find({
'summary': {'$exists': True, '$ne': None},
'category': category,
'$or': [
{'published_at': {'$gte': today_start}},
{'created_at': {'$gte': today_start}}
]
}).sort('created_at', -1).limit(articles_per_category)
# Skip if both dates are old (extra safety check) category_articles = []
if published_at and isinstance(published_at, datetime):
if published_at < today_start:
continue
elif created_at and isinstance(created_at, datetime):
if created_at < today_start:
continue
cluster_id = doc.get('cluster_id') for doc in cursor:
# Double-check the date to ensure it's from today
# Check if this article is part of a cluster published_at = doc.get('published_at')
if cluster_id and cluster_id not in processed_clusters: created_at = doc.get('created_at')
# Get cluster summary
cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
if cluster and cluster.get('article_count', 0) > 1: # Skip if both dates are old (extra safety check)
# This is a clustered article - get all source links if published_at and isinstance(published_at, datetime):
processed_clusters.add(cluster_id) if published_at < today_start:
continue
elif created_at and isinstance(created_at, datetime):
if created_at < today_start:
continue
cluster_id = doc.get('cluster_id')
# Check if this article is part of a cluster
if cluster_id and cluster_id not in processed_clusters:
# Get cluster summary
cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
# Get all articles in this cluster if cluster and cluster.get('article_count', 0) > 1:
cluster_articles = list(articles_collection.find({ # This is a clustered article - get all source links
'cluster_id': cluster_id processed_clusters.add(cluster_id)
}))
# Get all articles in this cluster
# Build sources list with links cluster_articles = list(articles_collection.find({
sources = [] 'cluster_id': cluster_id
for art in cluster_articles: }))
sources.append({
'name': art.get('source', ''), # Build sources list with links
'link': art.get('link', ''), sources = []
'title': art.get('title', '') for art in cluster_articles:
sources.append({
'name': art.get('source', ''),
'link': art.get('link', ''),
'title': art.get('title', '')
})
category_articles.append({
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': cluster.get('neutral_summary', doc.get('summary', '')),
'source': doc.get('source', ''),
'category': doc.get('category', 'general'),
'published_at': doc.get('published_at', ''),
'is_clustered': True,
'sources': sources,
'article_count': len(sources)
}) })
else:
articles.append({ # Single article (no cluster or cluster with only 1 article)
'title': doc.get('title', ''), category_articles.append({
'title_en': doc.get('title_en'), 'title': doc.get('title', ''),
'translated_at': doc.get('translated_at'), 'title_en': doc.get('title_en'),
'author': doc.get('author'), 'translated_at': doc.get('translated_at'),
'link': doc.get('link', ''), 'author': doc.get('author'),
'summary': cluster.get('neutral_summary', doc.get('summary', '')), 'link': doc.get('link', ''),
'source': doc.get('source', ''), 'summary': doc.get('summary', ''),
'category': doc.get('category', 'general'), 'source': doc.get('source', ''),
'published_at': doc.get('published_at', ''), 'category': doc.get('category', 'general'),
'is_clustered': True, 'published_at': doc.get('published_at', ''),
'sources': sources, 'is_clustered': False
'article_count': len(sources) })
}) elif not cluster_id or cluster_id not in processed_clusters:
else: # No cluster - single article
# Single article (no cluster or cluster with only 1 article) category_articles.append({
articles.append({
'title': doc.get('title', ''), 'title': doc.get('title', ''),
'title_en': doc.get('title_en'), 'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'), 'translated_at': doc.get('translated_at'),
@@ -183,20 +201,9 @@ def get_latest_articles(max_articles=10, hours=24):
'published_at': doc.get('published_at', ''), 'published_at': doc.get('published_at', ''),
'is_clustered': False 'is_clustered': False
}) })
elif not cluster_id or cluster_id not in processed_clusters:
# No cluster - single article # Add this category's articles to the main list
articles.append({ articles.extend(category_articles)
'title': doc.get('title', ''),
'title_en': doc.get('title_en'),
'translated_at': doc.get('translated_at'),
'author': doc.get('author'),
'link': doc.get('link', ''),
'summary': doc.get('summary', ''),
'source': doc.get('source', ''),
'category': doc.get('category', 'general'),
'published_at': doc.get('published_at', ''),
'is_clustered': False
})
# Sort articles: clustered articles first (by source count), then by recency # Sort articles: clustered articles first (by source count), then by recency
# This prioritizes stories covered by multiple sources # This prioritizes stories covered by multiple sources
@@ -383,11 +390,11 @@ def send_newsletter(max_articles=None, test_email=None):
} }
# Get articles from today only # Get articles from today only
max_articles = max_articles or Config.MAX_ARTICLES
today_date = datetime.now().strftime('%B %d, %Y') today_date = datetime.now().strftime('%B %d, %Y')
print(f"\nFetching articles published TODAY ({today_date})...") print(f"\nFetching articles published TODAY ({today_date})...")
print(f" Max articles: {max_articles}") print(f" Articles per category: 3")
articles = get_latest_articles(max_articles, hours=Config.HOURS_LOOKBACK) # Fetch all categories - filtering per subscriber happens later
articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=Config.HOURS_LOOKBACK)
if not articles: if not articles:
print("❌ No articles from today with summaries found") print("❌ No articles from today with summaries found")
@@ -509,15 +516,14 @@ def preview_newsletter(max_articles=None, hours=None):
Generate newsletter HTML for preview (doesn't send) Generate newsletter HTML for preview (doesn't send)
Args: Args:
max_articles: Maximum number of articles to include max_articles: Maximum number of articles to include (ignored, uses 3 per category)
hours: Hours to look back (default from config) hours: Hours to look back (default from config)
Returns: Returns:
str: HTML content str: HTML content
""" """
max_articles = max_articles or Config.MAX_ARTICLES
hours = hours or Config.HOURS_LOOKBACK hours = hours or Config.HOURS_LOOKBACK
articles = get_latest_articles(max_articles, hours=hours) articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=hours)
if not articles: if not articles:
today_date = datetime.now().strftime('%B %d, %Y') today_date = datetime.now().strftime('%B %d, %Y')