89 lines
3.6 KiB
Python
89 lines
3.6 KiB
Python
from flask import Blueprint, jsonify, request
|
|
from config import Config
|
|
from chroma_client import ChromaClient
|
|
import logging
|
|
|
|
# Blueprint that groups the search endpoint(s) declared in this module.
search_bp = Blueprint('search', __name__)

# Module-level ChromaDB client, created once when this module is imported.
# Host, port and collection name are all read from Config; per the original
# note, the backend is expected to reach ChromaDB through the 'chromadb'
# hostname defined in docker-compose.
chroma_client = ChromaClient(
    host=Config.CHROMA_HOST,
    port=Config.CHROMA_PORT,
    collection_name=Config.CHROMA_COLLECTION,
)
|
|
|
|
# Number of results returned when the client does not send ``limit``.
_DEFAULT_LIMIT = 10
# Maximum number of document characters included in a result snippet.
_SNIPPET_LENGTH = 200


def _parse_limit(raw_limit):
    """Convert the raw ``limit`` query value into a positive integer.

    Returns ``_DEFAULT_LIMIT`` when the parameter is absent (``None``), the
    parsed integer when it is a valid positive number, and ``None`` when the
    value is not an integer or is less than 1.
    """
    if raw_limit is None:
        return _DEFAULT_LIMIT
    try:
        limit = int(raw_limit)
    except (TypeError, ValueError):
        return None
    return limit if limit > 0 else None


def _format_result(item):
    """Map one search hit onto the dict shape returned to the frontend.

    ``item`` is read as a mapping with optional ``metadata``, ``document``
    and ``distance`` keys. Missing *or* ``None`` values fall back to neutral
    defaults so a single sparse hit cannot fail the whole request.
    """
    # ``or`` (instead of a .get default) also covers keys that are present
    # but explicitly set to None.
    metadata = item.get('metadata') or {}
    document = item.get('document') or ''

    distance = item.get('distance')
    if distance is None:
        distance = 1.0  # unknown distance -> relevance score of 0.0

    # Only mark the preview as truncated when text was actually cut off.
    snippet = document[:_SNIPPET_LENGTH]
    if len(document) > _SNIPPET_LENGTH:
        snippet += '...'

    # NOTE: per earlier notes in this file, only title, url, source,
    # category and published_at are stored in the Chroma metadata; the
    # translated title (title_en) is not, so the stored title is used as is.
    # TODO: store title_en in the Chroma metadata if search results should
    # show English titles.
    return {
        'title': metadata.get('title', 'Unknown Title'),
        'link': metadata.get('url', ''),
        'source': metadata.get('source', 'Unknown'),
        'category': metadata.get('category', 'general'),
        'published_at': metadata.get('published_at', ''),
        # Convert distance to an approximate similarity score.
        'relevance_score': 1.0 - distance,
        'snippet': snippet,
    }


@search_bp.route('/api/search', methods=['GET'])
def search_news():
    """
    Semantic search for news articles using ChromaDB.

    Query parameters:
    - q: Search query (required)
    - limit: Number of results, a positive integer (default: 10)
    - category: Filter by category (optional)

    Responses:
    - 200: JSON object with ``query``, ``count`` and ``results``
    - 400: ``q`` is missing/empty, or ``limit`` is not a positive integer
    - 500: the search itself (or building the response) failed
    """
    query = request.args.get('q')
    if not query:
        return jsonify({'error': 'Missing search query'}), 400

    # Reject a malformed limit as a client error (400) instead of letting
    # int() raise into the generic handler below and surface as a 500.
    limit = _parse_limit(request.args.get('limit'))
    if limit is None:
        return jsonify(
            {'error': "Invalid 'limit': must be a positive integer"}
        ), 400

    # Build filter if category provided
    category = request.args.get('category')
    where_filter = {"category": category} if category else None

    try:
        results = chroma_client.search(
            query_text=query,
            n_results=limit,
            where=where_filter,
        )

        # Tolerate a None result by treating it as "no hits".
        formatted_response = [_format_result(item) for item in results or []]

        return jsonify({
            'query': query,
            'count': len(formatted_response),
            'results': formatted_response,
        }), 200

    except Exception as e:
        # Top-level boundary for this endpoint: log with the traceback so
        # the failure can be diagnosed, then answer with a JSON error.
        logging.exception("Search error: %s", e)
        # NOTE(review): str(e) exposes internal error text to the client;
        # consider returning a generic message instead.
        return jsonify({'error': str(e)}), 500
|