update

2025-11-12 11:34:33 +01:00
parent f35f8eef8a
commit 94c89589af
32 changed files with 3272 additions and 3805 deletions
--- a/backend/routes/news_routes.py
+++ b/backend/routes/news_routes.py
@@ -1,5 +1,5 @@
-from flask import Blueprint, jsonify
-from database import articles_collection
+from flask import Blueprint, jsonify, request
+from database import articles_collection, db
 from services.news_service import fetch_munich_news, save_articles_to_db

 news_bp = Blueprint('news', __name__)
@@ -9,6 +9,12 @@ news_bp = Blueprint('news', __name__)
 def get_news():
    """Get latest Munich news"""
    try:
+        # Check if clustered mode is requested
+        mode = request.args.get('mode', 'all')
+        
+        if mode == 'clustered':
+            return get_clustered_news_internal()
+        
        # Fetch fresh news and save to database
        articles = fetch_munich_news()
        save_articles_to_db(articles)
@@ -63,6 +69,95 @@ def get_news():
        return jsonify({'error': str(e)}), 500


+def get_clustered_news_internal():
+    """
+    Get news with neutral summaries for clustered articles.
+
+    Returns only primary articles (``is_primary: True``), one per story.
+    Stories covered by more distinct sources are listed first; ties are
+    broken by ``published_at`` (newest first).
+
+    Query parameters:
+        limit: maximum number of stories to return. Defaults to 20; a
+            non-integer value falls back to 20 and values below 1 are
+            raised to 1.
+
+    Returns:
+        A ``(response, status)`` tuple: a JSON body with ``articles``,
+        ``mode`` and ``description`` and status 200, or ``{'error': ...}``
+        and status 500 if anything raises.
+    """
+    try:
+        # type=int makes Werkzeug return the default instead of raising when
+        # ?limit= is not an integer. MongoDB's $limit stage only accepts a
+        # positive integer, so clamp 0/negative values to 1.
+        limit = max(request.args.get('limit', 20, type=int), 1)
+
+        # Use aggregation to get articles with their cluster size
+        # This allows us to prioritize multi-source stories
+        # NOTE(review): $lookup on cluster_id would also join primaries that
+        # have no cluster_id with every other article lacking one - confirm
+        # that primary articles always carry a cluster_id.
+        pipeline = [
+            {"$match": {"is_primary": True}},
+            {"$lookup": {
+                "from": "articles",
+                "localField": "cluster_id",
+                "foreignField": "cluster_id",
+                "as": "cluster_articles"
+            }},
+            {"$addFields": {
+                "article_count": {"$size": "$cluster_articles"},
+                # $setUnion with an empty array de-duplicates the source names
+                "sources_list": {"$setUnion": ["$cluster_articles.source", []]}
+            }},
+            {"$addFields": {
+                "source_count": {"$size": "$sources_list"}
+            }},
+            # Sort by: 1) source count (desc), 2) published date (desc)
+            {"$sort": {"source_count": -1, "published_at": -1}},
+            {"$limit": limit}
+        ]
+
+        cursor = articles_collection.aggregate(pipeline)
+
+        result = []
+        cluster_summaries_collection = db['cluster_summaries']
+
+        for doc in cursor:
+            cluster_id = doc.get('cluster_id')
+            article_count = doc.get('article_count', 1)
+            has_related = article_count > 1
+
+            # A neutral summary is only used for multi-article clusters, so
+            # skip the extra database round trip for single-article stories.
+            cluster_summary = None
+            if has_related:
+                cluster_summary = cluster_summaries_collection.find_one(
+                    {'cluster_id': cluster_id}
+                )
+
+            # Use cluster_articles from aggregation (already fetched)
+            cluster_articles = doc.get('cluster_articles', [])
+
+            article = {
+                'title': doc.get('title', ''),
+                'link': doc.get('link', ''),
+                'source': doc.get('source', ''),
+                'published': doc.get('published_at', ''),
+                'category': doc.get('category', 'general'),
+                'cluster_id': cluster_id,
+                'article_count': article_count,
+                'source_count': doc.get('source_count', 1),
+                'sources': list(doc.get('sources_list', [doc.get('source', '')]))
+            }
+
+            # Use neutral summary if available, otherwise use article's own summary
+            if cluster_summary:
+                article['summary'] = cluster_summary.get('neutral_summary', '')
+                article['summary_type'] = 'neutral'
+                article['is_clustered'] = True
+            else:
+                article['summary'] = doc.get('summary', '')
+                article['summary_type'] = 'individual'
+                article['is_clustered'] = False
+
+            # Add related articles info (every cluster member except the primary)
+            if has_related:
+                article['related_articles'] = [
+                    {
+                        'source': a.get('source', ''),
+                        'title': a.get('title', ''),
+                        'link': a.get('link', '')
+                    }
+                    for a in cluster_articles if a.get('_id') != doc.get('_id')
+                ]
+
+            result.append(article)
+
+        return jsonify({
+            'articles': result,
+            'mode': 'clustered',
+            'description': 'Shows one article per story with neutral summaries'
+        }), 200
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
 def get_article_by_url(article_url):
    """Get full article content by URL"""
@@ -113,11 +208,20 @@ def get_stats():
        # Count summarized articles
        summarized_count = articles_collection.count_documents({'summary': {'$exists': True, '$ne': ''}})
        
+        # Count clustered articles
+        clustered_count = articles_collection.count_documents({'cluster_id': {'$exists': True}})
+        
+        # Count cluster summaries
+        cluster_summaries_collection = db['cluster_summaries']
+        neutral_summaries_count = cluster_summaries_collection.count_documents({})
+        
        return jsonify({
            'subscribers': subscriber_count,
            'articles': article_count,
            'crawled_articles': crawled_count,
-            'summarized_articles': summarized_count
+            'summarized_articles': summarized_count,
+            'clustered_articles': clustered_count,
+            'neutral_summaries': neutral_summaries_count
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500