from flask import Blueprint, jsonify, request
from config import Config
from chroma_client import ChromaClient
import logging

search_bp = Blueprint('search', __name__)

# Initialize ChromaDB client
# Note: We use the hostname 'chromadb' as defined in docker-compose for the backend
chroma_client = ChromaClient(
    host=Config.CHROMA_HOST,
    port=Config.CHROMA_PORT,
    collection_name=Config.CHROMA_COLLECTION
)

# Number of results returned when the caller does not pass ``limit``.
_DEFAULT_LIMIT = 10
# Maximum number of document characters included in a result snippet.
_SNIPPET_LENGTH = 200


def _parse_limit(raw_limit):
    """Convert the raw ``limit`` query value into a positive int.

    Args:
        raw_limit: The string taken from the query string, or ``None`` when
            the parameter was not supplied.

    Returns:
        ``_DEFAULT_LIMIT`` when ``raw_limit`` is ``None``, the parsed integer
        when it is a positive whole number, or ``None`` when the value is not
        a valid positive integer.
    """
    if raw_limit is None:
        return _DEFAULT_LIMIT
    try:
        limit = int(raw_limit)
    except (TypeError, ValueError):
        return None
    return limit if limit > 0 else None


def _format_result(item):
    """Turn one search hit into the dict shape returned to the frontend.

    Args:
        item: A mapping read here via the keys ``metadata``, ``distance`` and
            ``document``. Any of them may be missing or ``None``.

    Returns:
        A dict with ``title``, ``link``, ``source``, ``category``,
        ``published_at``, ``relevance_score`` and ``snippet``.
    """
    # ``or`` also covers a key that is present but explicitly None.
    metadata = item.get('metadata') or {}

    distance = item.get('distance')
    if distance is None:
        distance = 1.0

    document = item.get('document') or ''
    snippet = document[:_SNIPPET_LENGTH]
    if len(document) > _SNIPPET_LENGTH:
        # Only mark the preview as truncated when text was actually cut off.
        snippet += '...'

    # NOTE: According to the earlier notes in this file, the Chroma metadata
    # holds title, url, source, category and published_at; a translated
    # ``title_en`` is NOT stored there. We therefore display the stored
    # ``title``.
    # TODO: if English titles are wanted in search results, update the Chroma
    # storing logic (e.g. store title_en as the main title) so they show up
    # here automatically.
    return {
        'title': metadata.get('title', 'Unknown Title'),
        'link': metadata.get('url', ''),
        'source': metadata.get('source', 'Unknown'),
        'category': metadata.get('category', 'general'),
        'published_at': metadata.get('published_at', ''),
        # Convert distance to score (approx)
        'relevance_score': 1.0 - distance,
        'snippet': snippet,
    }


@search_bp.route('/api/search', methods=['GET'])
def search_news():
    """
    Semantic search for news articles using ChromaDB.

    Query parameters:
    - q: Search query (required)
    - limit: Number of results, a positive integer (default: 10)
    - category: Filter by category (optional)

    Returns:
        A ``(json, status)`` pair:
        - 200 with ``{'query', 'count', 'results'}`` on success,
        - 400 with ``{'error'}`` when ``q`` is missing or ``limit`` is not a
          positive integer,
        - 500 with ``{'error'}`` when the search itself fails.
    """
    query = request.args.get('q')
    if not query:
        return jsonify({'error': 'Missing search query'}), 400

    # A malformed limit is a client error, not a server failure.
    # NOTE(review): no upper bound is enforced on limit; consider capping it.
    limit = _parse_limit(request.args.get('limit'))
    if limit is None:
        return jsonify({'error': 'limit must be a positive integer'}), 400

    # Build filter if category provided
    category = request.args.get('category')
    where_filter = {"category": category} if category else None

    try:
        # Perform search
        results = chroma_client.search(
            query_text=query,
            n_results=limit,
            where=where_filter
        )

        # Format for frontend
        formatted_response = [_format_result(item) for item in results]

        return jsonify({
            'query': query,
            'count': len(formatted_response),
            'results': formatted_response
        }), 200

    except Exception as e:
        # Top-level boundary for the route: log with traceback, report 500.
        # NOTE(review): the raw exception text is returned to the client, as
        # before; consider a generic message if internals must not leak.
        logging.exception("Search error: %s", e)
        return jsonify({'error': str(e)}), 500