update
This commit is contained in:
@@ -79,99 +79,117 @@ articles_collection = db['articles']
|
|||||||
subscribers_collection = db['subscribers']
|
subscribers_collection = db['subscribers']
|
||||||
|
|
||||||
|
|
||||||
def get_latest_articles(max_articles=10, hours=24):
|
def get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=24):
|
||||||
"""
|
"""
|
||||||
Get latest articles with AI summaries from database (from today only)
|
Get latest articles with AI summaries from database, fetched per category
|
||||||
Includes cluster information for articles with multiple sources
|
Includes cluster information for articles with multiple sources
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
max_articles: Maximum number of articles to return
|
categories: List of categories to fetch (None = all categories)
|
||||||
|
articles_per_category: Maximum number of articles per category (default 3)
|
||||||
hours: Number of hours to look back (default 24)
|
hours: Number of hours to look back (default 24)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: Articles with summaries published today, including cluster info
|
list: Articles with summaries published today, grouped by category
|
||||||
"""
|
"""
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
||||||
# Calculate cutoff time (e.g., 24 hours ago)
|
|
||||||
cutoff_time = datetime.utcnow() - timedelta(hours=hours)
|
|
||||||
|
|
||||||
# Get start of today (00:00:00 UTC)
|
# Get start of today (00:00:00 UTC)
|
||||||
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
|
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
|
||||||
|
|
||||||
# Get cluster summaries collection
|
# Get cluster summaries collection
|
||||||
cluster_summaries_collection = db['cluster_summaries']
|
cluster_summaries_collection = db['cluster_summaries']
|
||||||
|
|
||||||
# Query for articles with summaries published today OR created today
|
# If no categories specified, get all available categories
|
||||||
# This ensures we only get fresh articles from today
|
if categories is None:
|
||||||
cursor = articles_collection.find({
|
categories = ['general', 'local', 'sports', 'science']
|
||||||
'summary': {'$exists': True, '$ne': None},
|
|
||||||
'$or': [
|
|
||||||
# Articles published today (if published_at is available)
|
|
||||||
{'published_at': {'$gte': today_start}},
|
|
||||||
# Articles created today (fallback if published_at is missing)
|
|
||||||
{'created_at': {'$gte': today_start}}
|
|
||||||
]
|
|
||||||
}).sort('created_at', -1).limit(max_articles)
|
|
||||||
|
|
||||||
articles = []
|
articles = []
|
||||||
processed_clusters = set()
|
processed_clusters = set()
|
||||||
|
|
||||||
for doc in cursor:
|
# Fetch articles for each category separately
|
||||||
# Double-check the date to ensure it's from today
|
for category in categories:
|
||||||
published_at = doc.get('published_at')
|
# Query for articles in this category from today
|
||||||
created_at = doc.get('created_at')
|
cursor = articles_collection.find({
|
||||||
|
'summary': {'$exists': True, '$ne': None},
|
||||||
|
'category': category,
|
||||||
|
'$or': [
|
||||||
|
{'published_at': {'$gte': today_start}},
|
||||||
|
{'created_at': {'$gte': today_start}}
|
||||||
|
]
|
||||||
|
}).sort('created_at', -1).limit(articles_per_category)
|
||||||
|
|
||||||
# Skip if both dates are old (extra safety check)
|
category_articles = []
|
||||||
if published_at and isinstance(published_at, datetime):
|
|
||||||
if published_at < today_start:
|
|
||||||
continue
|
|
||||||
elif created_at and isinstance(created_at, datetime):
|
|
||||||
if created_at < today_start:
|
|
||||||
continue
|
|
||||||
|
|
||||||
cluster_id = doc.get('cluster_id')
|
for doc in cursor:
|
||||||
|
# Double-check the date to ensure it's from today
|
||||||
|
published_at = doc.get('published_at')
|
||||||
|
created_at = doc.get('created_at')
|
||||||
|
|
||||||
# Check if this article is part of a cluster
|
# Skip if both dates are old (extra safety check)
|
||||||
if cluster_id and cluster_id not in processed_clusters:
|
if published_at and isinstance(published_at, datetime):
|
||||||
# Get cluster summary
|
if published_at < today_start:
|
||||||
cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
|
continue
|
||||||
|
elif created_at and isinstance(created_at, datetime):
|
||||||
|
if created_at < today_start:
|
||||||
|
continue
|
||||||
|
|
||||||
if cluster and cluster.get('article_count', 0) > 1:
|
cluster_id = doc.get('cluster_id')
|
||||||
# This is a clustered article - get all source links
|
|
||||||
processed_clusters.add(cluster_id)
|
|
||||||
|
|
||||||
# Get all articles in this cluster
|
# Check if this article is part of a cluster
|
||||||
cluster_articles = list(articles_collection.find({
|
if cluster_id and cluster_id not in processed_clusters:
|
||||||
'cluster_id': cluster_id
|
# Get cluster summary
|
||||||
}))
|
cluster = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
|
||||||
|
|
||||||
# Build sources list with links
|
if cluster and cluster.get('article_count', 0) > 1:
|
||||||
sources = []
|
# This is a clustered article - get all source links
|
||||||
for art in cluster_articles:
|
processed_clusters.add(cluster_id)
|
||||||
sources.append({
|
|
||||||
'name': art.get('source', ''),
|
# Get all articles in this cluster
|
||||||
'link': art.get('link', ''),
|
cluster_articles = list(articles_collection.find({
|
||||||
'title': art.get('title', '')
|
'cluster_id': cluster_id
|
||||||
|
}))
|
||||||
|
|
||||||
|
# Build sources list with links
|
||||||
|
sources = []
|
||||||
|
for art in cluster_articles:
|
||||||
|
sources.append({
|
||||||
|
'name': art.get('source', ''),
|
||||||
|
'link': art.get('link', ''),
|
||||||
|
'title': art.get('title', '')
|
||||||
|
})
|
||||||
|
|
||||||
|
category_articles.append({
|
||||||
|
'title': doc.get('title', ''),
|
||||||
|
'title_en': doc.get('title_en'),
|
||||||
|
'translated_at': doc.get('translated_at'),
|
||||||
|
'author': doc.get('author'),
|
||||||
|
'link': doc.get('link', ''),
|
||||||
|
'summary': cluster.get('neutral_summary', doc.get('summary', '')),
|
||||||
|
'source': doc.get('source', ''),
|
||||||
|
'category': doc.get('category', 'general'),
|
||||||
|
'published_at': doc.get('published_at', ''),
|
||||||
|
'is_clustered': True,
|
||||||
|
'sources': sources,
|
||||||
|
'article_count': len(sources)
|
||||||
})
|
})
|
||||||
|
else:
|
||||||
articles.append({
|
# Single article (no cluster or cluster with only 1 article)
|
||||||
'title': doc.get('title', ''),
|
category_articles.append({
|
||||||
'title_en': doc.get('title_en'),
|
'title': doc.get('title', ''),
|
||||||
'translated_at': doc.get('translated_at'),
|
'title_en': doc.get('title_en'),
|
||||||
'author': doc.get('author'),
|
'translated_at': doc.get('translated_at'),
|
||||||
'link': doc.get('link', ''),
|
'author': doc.get('author'),
|
||||||
'summary': cluster.get('neutral_summary', doc.get('summary', '')),
|
'link': doc.get('link', ''),
|
||||||
'source': doc.get('source', ''),
|
'summary': doc.get('summary', ''),
|
||||||
'category': doc.get('category', 'general'),
|
'source': doc.get('source', ''),
|
||||||
'published_at': doc.get('published_at', ''),
|
'category': doc.get('category', 'general'),
|
||||||
'is_clustered': True,
|
'published_at': doc.get('published_at', ''),
|
||||||
'sources': sources,
|
'is_clustered': False
|
||||||
'article_count': len(sources)
|
})
|
||||||
})
|
elif not cluster_id or cluster_id not in processed_clusters:
|
||||||
else:
|
# No cluster - single article
|
||||||
# Single article (no cluster or cluster with only 1 article)
|
category_articles.append({
|
||||||
articles.append({
|
|
||||||
'title': doc.get('title', ''),
|
'title': doc.get('title', ''),
|
||||||
'title_en': doc.get('title_en'),
|
'title_en': doc.get('title_en'),
|
||||||
'translated_at': doc.get('translated_at'),
|
'translated_at': doc.get('translated_at'),
|
||||||
@@ -183,20 +201,9 @@ def get_latest_articles(max_articles=10, hours=24):
|
|||||||
'published_at': doc.get('published_at', ''),
|
'published_at': doc.get('published_at', ''),
|
||||||
'is_clustered': False
|
'is_clustered': False
|
||||||
})
|
})
|
||||||
elif not cluster_id or cluster_id not in processed_clusters:
|
|
||||||
# No cluster - single article
|
# Add this category's articles to the main list
|
||||||
articles.append({
|
articles.extend(category_articles)
|
||||||
'title': doc.get('title', ''),
|
|
||||||
'title_en': doc.get('title_en'),
|
|
||||||
'translated_at': doc.get('translated_at'),
|
|
||||||
'author': doc.get('author'),
|
|
||||||
'link': doc.get('link', ''),
|
|
||||||
'summary': doc.get('summary', ''),
|
|
||||||
'source': doc.get('source', ''),
|
|
||||||
'category': doc.get('category', 'general'),
|
|
||||||
'published_at': doc.get('published_at', ''),
|
|
||||||
'is_clustered': False
|
|
||||||
})
|
|
||||||
|
|
||||||
# Sort articles: clustered articles first (by source count), then by recency
|
# Sort articles: clustered articles first (by source count), then by recency
|
||||||
# This prioritizes stories covered by multiple sources
|
# This prioritizes stories covered by multiple sources
|
||||||
@@ -383,11 +390,11 @@ def send_newsletter(max_articles=None, test_email=None):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Get articles from today only
|
# Get articles from today only
|
||||||
max_articles = max_articles or Config.MAX_ARTICLES
|
|
||||||
today_date = datetime.now().strftime('%B %d, %Y')
|
today_date = datetime.now().strftime('%B %d, %Y')
|
||||||
print(f"\nFetching articles published TODAY ({today_date})...")
|
print(f"\nFetching articles published TODAY ({today_date})...")
|
||||||
print(f" Max articles: {max_articles}")
|
print(f" Articles per category: 3")
|
||||||
articles = get_latest_articles(max_articles, hours=Config.HOURS_LOOKBACK)
|
# Fetch all categories - filtering per subscriber happens later
|
||||||
|
articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=Config.HOURS_LOOKBACK)
|
||||||
|
|
||||||
if not articles:
|
if not articles:
|
||||||
print("❌ No articles from today with summaries found")
|
print("❌ No articles from today with summaries found")
|
||||||
@@ -509,15 +516,14 @@ def preview_newsletter(max_articles=None, hours=None):
|
|||||||
Generate newsletter HTML for preview (doesn't send)
|
Generate newsletter HTML for preview (doesn't send)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
max_articles: Maximum number of articles to include
|
max_articles: Maximum number of articles to include (ignored, uses 3 per category)
|
||||||
hours: Hours to look back (default from config)
|
hours: Hours to look back (default from config)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: HTML content
|
str: HTML content
|
||||||
"""
|
"""
|
||||||
max_articles = max_articles or Config.MAX_ARTICLES
|
|
||||||
hours = hours or Config.HOURS_LOOKBACK
|
hours = hours or Config.HOURS_LOOKBACK
|
||||||
articles = get_latest_articles(max_articles, hours=hours)
|
articles = get_latest_articles_by_categories(categories=None, articles_per_category=3, hours=hours)
|
||||||
|
|
||||||
if not articles:
|
if not articles:
|
||||||
today_date = datetime.now().strftime('%B %d, %Y')
|
today_date = datetime.now().strftime('%B %d, %Y')
|
||||||
|
|||||||
Reference in New Issue
Block a user