Files
Munich-news/backend/routes/news_routes.py
2025-12-10 15:50:11 +00:00

233 lines
9.2 KiB
Python

from flask import Blueprint, jsonify, request
from database import articles_collection, db
from services.news_service import fetch_munich_news, save_articles_to_db
# Blueprint grouping all news-related API endpoints; registered by the app factory.
news_bp = Blueprint('news', __name__)
@news_bp.route('/api/news', methods=['GET'])
def get_news():
    """Get the latest Munich news.

    Query params:
        mode: 'clustered' delegates to the clustered view; any other value
              (default 'all') returns the flat, deduplicated article list.
        limit: maximum number of articles to return (default 20, minimum 1).

    Returns:
        200 with {'articles': [...]}, 400 on a non-numeric limit,
        or 500 with {'error': ...} on unexpected failure.
    """
    try:
        # Delegate to the clustered view when requested.
        if request.args.get('mode', 'all') == 'clustered':
            return get_clustered_news_internal()
        # Validate the limit up front (consistent with the clustered route):
        # a non-numeric value is a client error, not a 500.
        try:
            limit = max(1, int(request.args.get('limit', 20)))
        except ValueError:
            return jsonify({'error': 'limit must be an integer'}), 400
        # Fetch fresh news and persist it so the DB query below sees it too.
        articles = fetch_munich_news()
        save_articles_to_db(articles)
        # Stored articles, newest first.
        cursor = articles_collection.find().sort('created_at', -1).limit(limit)
        db_articles = []
        for doc in cursor:
            # Prefer the English title when a translation exists.
            title = doc.get('title_en') or doc.get('title', '')
            article = {
                'title': title,
                'author': doc.get('author'),
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
                'word_count': doc.get('word_count'),
                'has_full_content': bool(doc.get('content')),
                'has_summary': bool(doc.get('summary'))
            }
            if doc.get('summary'):
                # Include AI summary metadata when available.
                article['summary'] = doc.get('summary', '')
                article['summary_word_count'] = doc.get('summary_word_count')
                summarized_at = doc.get('summarized_at')
                # NOTE(review): assumes summarized_at is a datetime — confirm upstream.
                article['summarized_at'] = summarized_at.isoformat() if summarized_at else None
            elif doc.get('content'):
                # Fallback: preview of the first 200 chars when no summary exists.
                article['preview'] = doc.get('content', '')[:200] + '...'
            db_articles.append(article)
        # Merge fresh and stored articles, deduplicating by link.
        # Fresh articles are added first because they are the most recent.
        seen_links = set()
        combined = []
        for batch in (articles, db_articles):
            for article in batch:
                link = article.get('link', '')
                if link and link not in seen_links:
                    seen_links.add(link)
                    combined.append(article)
        return jsonify({'articles': combined[:limit]}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def get_clustered_news_internal():
    """
    Get news with neutral summaries for clustered articles.

    Returns only primary articles with their neutral summaries and
    prioritizes stories covered by multiple sources (more popular/important).

    Query params:
        limit: maximum number of clusters to return (default 20, minimum 1).

    Returns:
        200 with the clustered article payload, 400 on a non-numeric limit,
        or 500 with {'error': ...} on unexpected failure.
    """
    try:
        # Validate the limit: a non-numeric value previously bubbled up as a
        # 500, and a zero/negative value is rejected by MongoDB's $limit stage.
        try:
            limit = max(1, int(request.args.get('limit', 20)))
        except ValueError:
            return jsonify({'error': 'limit must be an integer'}), 400
        # Aggregate primary articles together with their cluster size so
        # multi-source stories can be ranked first.
        pipeline = [
            {"$match": {"is_primary": True}},
            {"$lookup": {
                "from": "articles",
                "localField": "cluster_id",
                "foreignField": "cluster_id",
                "as": "cluster_articles"
            }},
            {"$addFields": {
                "article_count": {"$size": "$cluster_articles"},
                "sources_list": {"$setUnion": ["$cluster_articles.source", []]}
            }},
            {"$addFields": {
                "source_count": {"$size": "$sources_list"}
            }},
            # Sort by: 1) distinct source count (desc), 2) published date (desc)
            {"$sort": {"source_count": -1, "published_at": -1}},
            {"$limit": limit}
        ]
        cursor = articles_collection.aggregate(pipeline)
        result = []
        cluster_summaries_collection = db['cluster_summaries']
        for doc in cursor:
            cluster_id = doc.get('cluster_id')
            # Neutral (multi-source) summary for this cluster, if one exists.
            cluster_summary = cluster_summaries_collection.find_one({'cluster_id': cluster_id})
            # cluster_articles were already fetched by the $lookup stage.
            cluster_articles = doc.get('cluster_articles', [])
            # Prefer the English title when a translation exists.
            title = doc.get('title_en') or doc.get('title', '')
            article = {
                'title': title,
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
                'category': doc.get('category', 'general'),
                'cluster_id': cluster_id,
                'article_count': doc.get('article_count', 1),
                'source_count': doc.get('source_count', 1),
                'sources': list(doc.get('sources_list', [doc.get('source', '')]))
            }
            # Use the neutral summary only for genuine multi-article clusters;
            # otherwise fall back to the article's own summary.
            if cluster_summary and doc.get('article_count', 1) > 1:
                article['summary'] = cluster_summary.get('neutral_summary', '')
                article['summary_type'] = 'neutral'
                article['is_clustered'] = True
            else:
                article['summary'] = doc.get('summary', '')
                article['summary_type'] = 'individual'
                article['is_clustered'] = False
            # Attach sibling articles (everything in the cluster except this one).
            if doc.get('article_count', 1) > 1:
                article['related_articles'] = [
                    {
                        'source': a.get('source', ''),
                        'title': a.get('title', ''),
                        'link': a.get('link', '')
                    }
                    for a in cluster_articles if a.get('_id') != doc.get('_id')
                ]
            result.append(article)
        return jsonify({
            'articles': result,
            'mode': 'clustered',
            'description': 'Shows one article per story with neutral summaries'
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/news/<path:article_url>', methods=['GET'])
def get_article_by_url(article_url):
    """Get full article content by URL"""
    try:
        from urllib.parse import unquote

        # Percent-decode the captured path segment before looking it up.
        decoded_url = unquote(article_url)
        doc = articles_collection.find_one({'link': decoded_url})
        if doc is None:
            return jsonify({'error': 'Article not found'}), 404

        def _iso(value):
            # Serialize a timestamp field; missing/empty values stay None.
            # assumes stored values are datetimes — TODO confirm
            return value.isoformat() if value else None

        payload = {
            'title': doc.get('title_en') or doc.get('title', ''),
            'author': doc.get('author'),
            'link': doc.get('link', ''),
            'content': doc.get('content', ''),
            'summary': doc.get('summary'),
            'word_count': doc.get('word_count', 0),
            'summary_word_count': doc.get('summary_word_count'),
            'source': doc.get('source', ''),
            'published_at': doc.get('published_at', ''),
            'crawled_at': _iso(doc.get('crawled_at')),
            'summarized_at': _iso(doc.get('summarized_at')),
            'created_at': _iso(doc.get('created_at'))
        }
        return jsonify(payload), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@news_bp.route('/api/stats', methods=['GET'])
def get_stats():
    """Get subscription statistics"""
    try:
        from database import subscribers_collection

        # Filter matching documents whose field exists and is non-empty.
        non_empty = {'$exists': True, '$ne': ''}
        stats = {
            # Only active subscribers are counted.
            'subscribers': subscribers_collection.count_documents({'status': 'active'}),
            'articles': articles_collection.count_documents({}),
            'crawled_articles': articles_collection.count_documents({'content': non_empty}),
            'summarized_articles': articles_collection.count_documents({'summary': non_empty}),
            'clustered_articles': articles_collection.count_documents({'cluster_id': {'$exists': True}}),
            'neutral_summaries': db['cluster_summaries'].count_documents({})
        }
        return jsonify(stats), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500