228 lines
8.9 KiB
Python
228 lines
8.9 KiB
Python
from flask import Blueprint, jsonify, request
|
|
from database import articles_collection, db
|
|
from services.news_service import fetch_munich_news, save_articles_to_db
|
|
|
|
# Blueprint grouping the news API endpoints (/api/news, /api/stats).
news_bp = Blueprint('news', __name__)
|
|
|
|
|
|
@news_bp.route('/api/news', methods=['GET'])
def get_news():
    """Get latest Munich news.

    Query params:
        mode: 'clustered' delegates to the clustered view; anything else
              (default 'all') returns the flat, deduplicated article list.

    Returns:
        JSON payload with up to 20 articles (200), or an error body (500).
    """
    try:
        # Delegate immediately when the clustered view is requested.
        if request.args.get('mode', 'all') == 'clustered':
            return get_clustered_news_internal()

        # Pull fresh news and persist it before reading back from Mongo.
        fresh_articles = fetch_munich_news()
        save_articles_to_db(fresh_articles)

        # Newest-first page of stored articles.
        stored = []
        for doc in articles_collection.find().sort('created_at', -1).limit(20):
            entry = {
                'title': doc.get('title', ''),
                'author': doc.get('author'),
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
                'word_count': doc.get('word_count'),
                'has_full_content': bool(doc.get('content')),
                'has_summary': bool(doc.get('summary')),
            }

            if doc.get('summary'):
                # Prefer the AI summary when one has been generated.
                entry['summary'] = doc.get('summary', '')
                entry['summary_word_count'] = doc.get('summary_word_count')
                entry['summarized_at'] = doc.get('summarized_at', '').isoformat() if doc.get('summarized_at') else None
            elif doc.get('content'):
                # Fallback: first 200 characters of the crawled content.
                entry['preview'] = doc.get('content', '')[:200] + '...'

            stored.append(entry)

        # Merge fresh + stored, keeping only the first occurrence of each
        # link (fresh articles win since they are iterated first).
        seen_links = set()
        combined = []
        for batch in (fresh_articles, stored):
            for entry in batch:
                link = entry.get('link', '')
                if link and link not in seen_links:
                    seen_links.add(link)
                    combined.append(entry)

        return jsonify({'articles': combined[:20]}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def get_clustered_news_internal():
    """
    Get news with neutral summaries for clustered articles.

    Returns only primary articles with their neutral summaries.
    Prioritizes stories covered by multiple sources (more popular/important).

    Query params:
        limit: max number of stories to return (default 20; invalid or
               non-positive values fall back to the default).

    Returns:
        JSON payload with the clustered articles (200), or an error body (500).
    """
    try:
        # type=int makes Flask fall back to the default instead of raising
        # ValueError (-> opaque 500) when ?limit= is not a valid integer.
        limit = request.args.get('limit', default=20, type=int)
        if limit is None or limit < 1:
            # Guard against non-positive values, which would break the
            # $limit stage below.
            limit = 20

        # Use aggregation to get each primary article together with its
        # cluster size, so multi-source stories can be ranked first.
        pipeline = [
            {"$match": {"is_primary": True}},
            # Self-join: pull every article sharing this cluster_id.
            {"$lookup": {
                "from": "articles",
                "localField": "cluster_id",
                "foreignField": "cluster_id",
                "as": "cluster_articles"
            }},
            {"$addFields": {
                "article_count": {"$size": "$cluster_articles"},
                # Distinct sources covering the story.
                "sources_list": {"$setUnion": ["$cluster_articles.source", []]}
            }},
            {"$addFields": {
                "source_count": {"$size": "$sources_list"}
            }},
            # Sort by: 1) source count (desc), 2) published date (desc)
            {"$sort": {"source_count": -1, "published_at": -1}},
            {"$limit": limit}
        ]

        cursor = articles_collection.aggregate(pipeline)

        result = []
        cluster_summaries_collection = db['cluster_summaries']

        for doc in cursor:
            cluster_id = doc.get('cluster_id')

            # Get neutral summary if available
            cluster_summary = cluster_summaries_collection.find_one({'cluster_id': cluster_id})

            # Use cluster_articles from aggregation (already fetched)
            cluster_articles = doc.get('cluster_articles', [])

            article = {
                'title': doc.get('title', ''),
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
                'category': doc.get('category', 'general'),
                'cluster_id': cluster_id,
                'article_count': doc.get('article_count', 1),
                'source_count': doc.get('source_count', 1),
                'sources': list(doc.get('sources_list', [doc.get('source', '')]))
            }

            # Use the neutral summary only for genuine multi-article clusters;
            # otherwise fall back to the article's own summary.
            if cluster_summary and doc.get('article_count', 1) > 1:
                article['summary'] = cluster_summary.get('neutral_summary', '')
                article['summary_type'] = 'neutral'
                article['is_clustered'] = True
            else:
                article['summary'] = doc.get('summary', '')
                article['summary_type'] = 'individual'
                article['is_clustered'] = False

            # Add related articles info (every cluster member except this one).
            if doc.get('article_count', 1) > 1:
                article['related_articles'] = [
                    {
                        'source': a.get('source', ''),
                        'title': a.get('title', ''),
                        'link': a.get('link', '')
                    }
                    for a in cluster_articles if a.get('_id') != doc.get('_id')
                ]

            result.append(article)

        return jsonify({
            'articles': result,
            'mode': 'clustered',
            'description': 'Shows one article per story with neutral summaries'
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
def get_article_by_url(article_url):
    """Get full article content by URL"""
    try:
        from urllib.parse import unquote

        # Links are stored decoded, so undo the URL-encoding first.
        target_link = unquote(article_url)

        # Find article by link
        doc = articles_collection.find_one({'link': target_link})
        if not doc:
            return jsonify({'error': 'Article not found'}), 404

        def _iso(field):
            # Serialize an optional datetime field; None when absent.
            value = doc.get(field)
            return value.isoformat() if value else None

        payload = {
            'title': doc.get('title', ''),
            'author': doc.get('author'),
            'link': doc.get('link', ''),
            'content': doc.get('content', ''),
            'summary': doc.get('summary'),
            'word_count': doc.get('word_count', 0),
            'summary_word_count': doc.get('summary_word_count'),
            'source': doc.get('source', ''),
            'published_at': doc.get('published_at', ''),
            'crawled_at': _iso('crawled_at'),
            'summarized_at': _iso('summarized_at'),
            'created_at': _iso('created_at'),
        }
        return jsonify(payload), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/stats', methods=['GET'])
def get_stats():
    """Get subscription statistics"""
    try:
        from database import subscribers_collection

        cluster_summaries_collection = db['cluster_summaries']

        # Filters for articles that completed each pipeline stage.
        has_content = {'content': {'$exists': True, '$ne': ''}}
        has_summary = {'summary': {'$exists': True, '$ne': ''}}
        has_cluster = {'cluster_id': {'$exists': True}}

        stats = {
            # Count only active subscribers.
            'subscribers': subscribers_collection.count_documents({'status': 'active'}),
            'articles': articles_collection.count_documents({}),
            'crawled_articles': articles_collection.count_documents(has_content),
            'summarized_articles': articles_collection.count_documents(has_summary),
            'clustered_articles': articles_collection.count_documents(has_cluster),
            'neutral_summaries': cluster_summaries_collection.count_documents({}),
        }
        return jsonify(stats), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500