Files
Munich-news/backend/routes/news_routes.py
2025-12-10 15:50:11 +00:00

233 lines
9.2 KiB
Python

from flask import Blueprint, jsonify, request
from database import articles_collection, db
from services.news_service import fetch_munich_news, save_articles_to_db
# Blueprint grouping all news-related API endpoints; registered by the app factory.
news_bp = Blueprint('news', __name__)
@news_bp.route('/api/news', methods=['GET'])
def get_news():
    """Get the latest Munich news.

    Query params:
        mode: 'clustered' delegates to the clustered view; any other value
              (default 'all') returns the flat, deduplicated article list.
        limit: maximum number of articles to return (default 20, minimum 1).

    Returns:
        200 with {'articles': [...]}, 400 on a non-numeric limit,
        or 500 with {'error': ...} on unexpected failure.
    """
    try:
        # Delegate to the clustered view when requested.
        if request.args.get('mode', 'all') == 'clustered':
            return get_clustered_news_internal()
        # Validate the limit up front (consistent with the clustered route):
        # a non-numeric value is a client error, not a 500.
        try:
            limit = max(1, int(request.args.get('limit', 20)))
        except ValueError:
            return jsonify({'error': 'limit must be an integer'}), 400
        # Fetch fresh news and persist it so the DB query below sees it too.
        articles = fetch_munich_news()
        save_articles_to_db(articles)
        # Stored articles, newest first.
        cursor = articles_collection.find().sort('created_at', -1).limit(limit)
        db_articles = []
        for doc in cursor:
            # Prefer the English title when a translation exists.
            title = doc.get('title_en') or doc.get('title', '')
            article = {
                'title': title,
                'author': doc.get('author'),
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
                'word_count': doc.get('word_count'),
                'has_full_content': bool(doc.get('content')),
                'has_summary': bool(doc.get('summary'))
            }
            if doc.get('summary'):
                # Include AI summary metadata when available.
                article['summary'] = doc.get('summary', '')
                article['summary_word_count'] = doc.get('summary_word_count')
                summarized_at = doc.get('summarized_at')
                # NOTE(review): assumes summarized_at is a datetime — confirm upstream.
                article['summarized_at'] = summarized_at.isoformat() if summarized_at else None
            elif doc.get('content'):
                # Fallback: preview of the first 200 chars when no summary exists.
                article['preview'] = doc.get('content', '')[:200] + '...'
            db_articles.append(article)
        # Merge fresh and stored articles, deduplicating by link.
        # Fresh articles are added first because they are the most recent.
        seen_links = set()
        combined = []
        for batch in (articles, db_articles):
            for article in batch:
                link = article.get('link', '')
                if link and link not in seen_links:
                    seen_links.add(link)
                    combined.append(article)
        return jsonify({'articles': combined[:limit]}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def get_clustered_news_internal():
    """
    Get news with neutral summaries for clustered articles.

    Returns only primary articles with their neutral summaries and
    prioritizes stories covered by multiple sources (more popular/important).

    Query params:
        limit: maximum number of clusters to return (default 20, minimum 1).

    Returns:
        200 with the clustered article payload, 400 on a non-numeric limit,
        or 500 with {'error': ...} on unexpected failure.
    """
    try:
        # Validate the limit: a non-numeric value previously bubbled up as a
        # 500, and a zero/negative value is rejected by MongoDB's $limit stage.
        try:
            limit = max(1, int(request.args.get('limit', 20)))
        except ValueError:
            return jsonify({'error': 'limit must be an integer'}), 400
        # Aggregate primary articles together with their cluster size so
        # multi-source stories can be ranked first.
        pipeline = [
            {"$match": {"is_primary": True}},
            {"$lookup": {
                "from": "articles",
                "localField": "cluster_id",
                "foreignField": "cluster_id",
                "as": "cluster_articles"
            }},
            {"$addFields": {
                "article_count": {"$size": "$cluster_articles"},
                "sources_list": {"$setUnion": ["$cluster_articles.source", []]}
            }},
            {"$addFields": {
                "source_count": {"$size": "$sources_list"}
            }},
            # Sort by: 1) distinct source count (desc), 2) published date (desc)
            {"$sort": {"source_count": -1, "published_at": -1}},
            {"$limit": limit}
        ]
        cursor = articles_collection.aggregate(pipeline)
        result = []
        cluster_summaries_collection = db['cluster_summaries']
        for doc in cursor:
            cluster_id = doc.get('cluster_id')
            # Neutral (multi-source) summary for this cluster, if one exists.
            cluster_summary = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
            # cluster_articles were already fetched by the $lookup stage.
            cluster_articles = doc.get('cluster_articles', [])
            # Prefer the English title when a translation exists.
            title = doc.get('title_en') or doc.get('title', '')
            article = {
                'title': title,
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
                'category': doc.get('category', 'general'),
                'cluster_id': cluster_id,
                'article_count': doc.get('article_count', 1),
                'source_count': doc.get('source_count', 1),
                'sources': list(doc.get('sources_list', [doc.get('source', '')]))
            }
            # Use the neutral summary only for genuine multi-article clusters;
            # otherwise fall back to the article's own summary.
            if cluster_summary and doc.get('article_count', 1) > 1:
                article['summary'] = cluster_summary.get('neutral_summary', '')
                article['summary_type'] = 'neutral'
                article['is_clustered'] = True
            else:
                article['summary'] = doc.get('summary', '')
                article['summary_type'] = 'individual'
                article['is_clustered'] = False
            # Attach sibling articles (everything in the cluster except this one).
            if doc.get('article_count', 1) > 1:
                article['related_articles'] = [
                    {
                        'source': a.get('source', ''),
                        'title': a.get('title', ''),
                        'link': a.get('link', '')
                    }
                    for a in cluster_articles if a.get('_id') != doc.get('_id')
                ]
            result.append(article)
        return jsonify({
            'articles': result,
            'mode': 'clustered',
            'description': 'Shows one article per story with neutral summaries'
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
def get_article_by_url(article_url):
    """Get full article content by URL"""
    try:
        from urllib.parse import unquote

        # Percent-decode the captured path segment before looking it up.
        decoded_url = unquote(article_url)
        doc = articles_collection.find_one({'link': decoded_url})
        if doc is None:
            return jsonify({'error': 'Article not found'}), 404

        def _iso(value):
            # Serialize a timestamp field; missing/empty values stay None.
            # assumes stored values are datetimes — TODO confirm
            return value.isoformat() if value else None

        payload = {
            'title': doc.get('title_en') or doc.get('title', ''),
            'author': doc.get('author'),
            'link': doc.get('link', ''),
            'content': doc.get('content', ''),
            'summary': doc.get('summary'),
            'word_count': doc.get('word_count', 0),
            'summary_word_count': doc.get('summary_word_count'),
            'source': doc.get('source', ''),
            'published_at': doc.get('published_at', ''),
            'crawled_at': _iso(doc.get('crawled_at')),
            'summarized_at': _iso(doc.get('summarized_at')),
            'created_at': _iso(doc.get('created_at'))
        }
        return jsonify(payload), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/stats', methods=['GET'])
def get_stats():
    """Get subscription statistics"""
    try:
        from database import subscribers_collection

        # Filter matching documents whose field exists and is non-empty.
        non_empty = {'$exists': True, '$ne': ''}
        stats = {
            # Only active subscribers are counted.
            'subscribers': subscribers_collection.count_documents({'status': 'active'}),
            'articles': articles_collection.count_documents({}),
            'crawled_articles': articles_collection.count_documents({'content': non_empty}),
            'summarized_articles': articles_collection.count_documents({'summary': non_empty}),
            'clustered_articles': articles_collection.count_documents({'cluster_id': {'$exists': True}}),
            'neutral_summaries': db['cluster_summaries'].count_documents({})
        }
        return jsonify(stats), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500