This commit is contained in:
2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions

View File

@@ -1,5 +1,5 @@
from flask import Blueprint, jsonify
from database import articles_collection
from flask import Blueprint, jsonify, request
from database import articles_collection, db
from services.news_service import fetch_munich_news, save_articles_to_db
news_bp = Blueprint('news', __name__)
@@ -9,6 +9,12 @@ news_bp = Blueprint('news', __name__)
def get_news():
"""Get latest Munich news"""
try:
# Check if clustered mode is requested
mode = request.args.get('mode', 'all')
if mode == 'clustered':
return get_clustered_news_internal()
# Fetch fresh news and save to database
articles = fetch_munich_news()
save_articles_to_db(articles)
@@ -63,6 +69,95 @@ def get_news():
return jsonify({'error': str(e)}), 500
def _parse_limit(default=20):
    """Return the ``limit`` query parameter as a positive integer.

    A missing or non-integer value falls back to ``default`` instead of
    raising, and values below 1 are clamped to 1 because the aggregation
    ``$limit`` stage only accepts a positive number.
    """
    limit = request.args.get('limit', default, type=int)
    return max(1, limit)


def _build_clustered_article(doc, cluster_summary):
    """Shape one aggregated primary-article document for the API response.

    Args:
        doc: A document produced by the aggregation pipeline in
            ``get_clustered_news_internal`` (carries ``cluster_articles``,
            ``article_count``, ``source_count`` and ``sources_list``).
        cluster_summary: The matching ``cluster_summaries`` document, or
            ``None`` when the cluster has no stored summary.

    Returns:
        A dict ready to be serialized in the ``articles`` list.
    """
    article_count = doc.get('article_count', 1)
    has_related = article_count > 1

    article = {
        'title': doc.get('title', ''),
        'link': doc.get('link', ''),
        'source': doc.get('source', ''),
        'published': doc.get('published_at', ''),
        'category': doc.get('category', 'general'),
        'cluster_id': doc.get('cluster_id'),
        'article_count': article_count,
        'source_count': doc.get('source_count', 1),
        'sources': list(doc.get('sources_list', [doc.get('source', '')])),
    }

    # Only advertise a neutral summary when the cluster really has several
    # articles AND the stored summary text is non-empty; otherwise fall back
    # to the article's own summary so the client never gets a blank one
    # labelled as "neutral".
    neutral_summary = ''
    if cluster_summary:
        neutral_summary = cluster_summary.get('neutral_summary', '')

    if has_related and neutral_summary:
        article['summary'] = neutral_summary
        article['summary_type'] = 'neutral'
        article['is_clustered'] = True
    else:
        article['summary'] = doc.get('summary', '')
        article['summary_type'] = 'individual'
        article['is_clustered'] = False

    if has_related:
        # The $lookup result includes the primary article itself; drop it.
        primary_id = doc.get('_id')
        article['related_articles'] = [
            {
                'source': other.get('source', ''),
                'title': other.get('title', ''),
                'link': other.get('link', ''),
            }
            for other in doc.get('cluster_articles', [])
            if other.get('_id') != primary_id
        ]

    return article


def get_clustered_news_internal():
    """Return primary articles with neutral summaries for clustered stories.

    Reads the optional ``limit`` query parameter (default 20). Primary
    articles are joined with the other articles sharing their ``cluster_id``
    and sorted by number of distinct sources (descending), then by
    ``published_at`` (descending), so stories covered by several sources
    come first.

    Returns:
        A ``(response, status)`` tuple: on success a JSON object with
        ``articles``, ``mode`` and ``description`` and status 200; on any
        exception a JSON object with ``error`` and status 500.
    """
    try:
        limit = _parse_limit()

        # NOTE(review): per the MongoDB $lookup documentation, a document
        # without the localField is matched as if the field were null, so a
        # primary article lacking ``cluster_id`` would be joined with every
        # article that also lacks one. Confirm that primary articles always
        # carry a cluster_id.
        pipeline = [
            {"$match": {"is_primary": True}},
            {"$lookup": {
                "from": "articles",
                "localField": "cluster_id",
                "foreignField": "cluster_id",
                "as": "cluster_articles"
            }},
            {"$addFields": {
                "article_count": {"$size": "$cluster_articles"},
                "sources_list": {"$setUnion": ["$cluster_articles.source", []]}
            }},
            {"$addFields": {
                "source_count": {"$size": "$sources_list"}
            }},
            # Sort by: 1) source count (desc), 2) published date (desc)
            {"$sort": {"source_count": -1, "published_at": -1}},
            {"$limit": limit}
        ]
        primary_docs = list(articles_collection.aggregate(pipeline))

        # Fetch all needed cluster summaries in one query instead of issuing
        # one find_one per result row.
        cluster_ids = [
            doc['cluster_id']
            for doc in primary_docs
            if doc.get('cluster_id') is not None
        ]
        summaries_by_cluster = {}
        if cluster_ids:
            summary_cursor = db['cluster_summaries'].find(
                {'cluster_id': {'$in': cluster_ids}}
            )
            for summary in summary_cursor:
                # Keep the first summary seen for a cluster.
                summaries_by_cluster.setdefault(
                    summary.get('cluster_id'), summary
                )

        result = [
            _build_clustered_article(
                doc, summaries_by_cluster.get(doc.get('cluster_id'))
            )
            for doc in primary_docs
        ]

        return jsonify({
            'articles': result,
            'mode': 'clustered',
            'description': 'Shows one article per story with neutral summaries'
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
def get_article_by_url(article_url):
"""Get full article content by URL"""
@@ -113,11 +208,20 @@ def get_stats():
# Count summarized articles
summarized_count = articles_collection.count_documents({'summary': {'$exists': True, '$ne': ''}})
# Count clustered articles
clustered_count = articles_collection.count_documents({'cluster_id': {'$exists': True}})
# Count cluster summaries
cluster_summaries_collection = db['cluster_summaries']
neutral_summaries_count = cluster_summaries_collection.count_documents({})
return jsonify({
'subscribers': subscriber_count,
'articles': article_count,
'crawled_articles': crawled_count,
'summarized_articles': summarized_count
'summarized_articles': summarized_count,
'clustered_articles': clustered_count,
'neutral_summaries': neutral_summaries_count
}), 200
except Exception as e:
return jsonify({'error': str(e)}), 500