# Munich-news/backend/routes/search_routes.py

from flask import Blueprint, jsonify, request
from config import Config
from chroma_client import ChromaClient
import logging

# Blueprint under which the search endpoints of this module are registered.
search_bp = Blueprint('search', __name__)

# Connection settings for the shared ChromaDB client, all read from Config.
# NOTE(review): the host is expected to be the 'chromadb' service name from
# docker-compose when running the backend in containers - confirm in Config.
_chroma_settings = {
    'host': Config.CHROMA_HOST,
    'port': Config.CHROMA_PORT,
    'collection_name': Config.CHROMA_COLLECTION,
}

# Single client instance created at import time and reused by every request.
chroma_client = ChromaClient(**_chroma_settings)

# Bounds applied to the client-supplied ``limit`` query parameter.
_DEFAULT_LIMIT = 10
_MAX_LIMIT = 100

# Maximum number of document characters returned as a preview snippet.
_SNIPPET_LENGTH = 200


def _parse_limit(raw_limit):
    """Turn the raw ``limit`` query value into a bounded positive integer.

    Returns ``_DEFAULT_LIMIT`` when the parameter is absent, the parsed value
    capped at ``_MAX_LIMIT`` when it is a positive integer, and ``None`` when
    it is not a positive integer (the caller answers with HTTP 400).
    """
    if raw_limit is None:
        return _DEFAULT_LIMIT
    try:
        limit = int(raw_limit)
    except (TypeError, ValueError):
        return None
    if limit < 1:
        return None
    return min(limit, _MAX_LIMIT)


def _format_result(item):
    """Map one search hit (a dict) to the shape expected by the frontend.

    The hit is read through the keys ``metadata``, ``document`` and
    ``distance``; a key that is missing or explicitly ``None`` falls back to
    an empty/neutral value instead of raising.
    """
    metadata = item.get('metadata') or {}
    document = item.get('document') or ''

    distance = item.get('distance')
    if distance is None:
        # No distance reported: treat the hit as least relevant (score 0.0).
        distance = 1.0

    # Only mark the preview with an ellipsis when text was actually cut off.
    snippet = document[:_SNIPPET_LENGTH]
    if len(document) > _SNIPPET_LENGTH:
        snippet += '...'

    # TODO: per earlier notes, Chroma metadata only carries the original
    # title (alongside url, source, category, published_at) and not
    # ``title_en``. To show translated titles here, store ``title_en`` in the
    # Chroma metadata at indexing time (or use it as the main title).
    return {
        'title': metadata.get('title', 'Unknown Title'),
        'link': metadata.get('url', ''),
        'source': metadata.get('source', 'Unknown'),
        'category': metadata.get('category', 'general'),
        'published_at': metadata.get('published_at', ''),
        # Approximate conversion of a distance into a "higher is better" score.
        'relevance_score': 1.0 - distance,
        'snippet': snippet,
    }


@search_bp.route('/api/search', methods=['GET'])
def search_news():
    """
    Semantic search for news articles using ChromaDB.

    Query parameters:
    - q: Search query (required, must not be blank)
    - limit: Number of results (default: 10, must be a positive integer,
      capped at 100)
    - category: Filter by category (optional)

    Responses:
    - 200: JSON object with ``query``, ``count`` and ``results``
    - 400: missing/blank query or invalid ``limit``
    - 500: unexpected failure while searching; details are logged server-side
      and not returned to the client
    """
    # Validate client input up front so bad requests yield 400, not 500.
    query = (request.args.get('q') or '').strip()
    if not query:
        return jsonify({'error': 'Missing search query'}), 400

    limit = _parse_limit(request.args.get('limit'))
    if limit is None:
        return jsonify({'error': 'Invalid limit: must be a positive integer'}), 400

    # Build filter if category provided
    category = request.args.get('category')
    where_filter = {"category": category} if category else None

    try:
        results = chroma_client.search(
            query_text=query,
            n_results=limit,
            where=where_filter
        )
        # ``or []`` guards against a client that returns None for "no hits".
        formatted_response = [_format_result(item) for item in results or []]
    except Exception as e:
        # Top-level boundary: log with traceback, but do not leak internal
        # error details to the caller.
        logging.exception("Search error: %s", e)
        return jsonify({'error': 'Internal server error'}), 500

    return jsonify({
        'query': query,
        'count': len(formatted_response),
        'results': formatted_response
    }), 200