update
This commit is contained in:
@@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp
|
||||
from routes.transport_routes import transport_bp
|
||||
from routes.interests_routes import interests_bp
|
||||
from routes.personalization_routes import personalization_bp
|
||||
from routes.search_routes import search_bp
|
||||
|
||||
# Initialize Flask app
|
||||
app = Flask(__name__)
|
||||
@@ -33,6 +34,7 @@ app.register_blueprint(admin_bp)
|
||||
app.register_blueprint(transport_bp)
|
||||
app.register_blueprint(interests_bp)
|
||||
app.register_blueprint(personalization_bp)
|
||||
app.register_blueprint(search_bp)
|
||||
|
||||
# Health check endpoint
|
||||
@app.route('/health')
|
||||
|
||||
@@ -87,7 +87,8 @@ class ChromaClient:
|
||||
|
||||
# Prepare text for embedding (Title + Summary + Start of Content)
|
||||
# This gives semantic search a good overview
|
||||
title = article.get('title', '')
|
||||
# Use English title if available, otherwise original
|
||||
title = article.get('title_en') if article.get('title_en') else article.get('title', '')
|
||||
summary = article.get('summary') or ''
|
||||
content_snippet = article.get('content', '')[:1000]
|
||||
|
||||
|
||||
@@ -45,6 +45,11 @@ class Config:
|
||||
TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
|
||||
TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))
|
||||
|
||||
# ChromaDB
|
||||
CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
|
||||
CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
|
||||
CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles')
|
||||
|
||||
@classmethod
|
||||
def print_config(cls):
|
||||
"""Print configuration (without sensitive data)"""
|
||||
@@ -57,3 +62,5 @@ class Config:
|
||||
print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}")
|
||||
print(f" Tracking Enabled: {cls.TRACKING_ENABLED}")
|
||||
print(f" Tracking API URL: {cls.TRACKING_API_URL}")
|
||||
print(f" ChromaDB Host: {cls.CHROMA_HOST}")
|
||||
print(f" ChromaDB Port: {cls.CHROMA_PORT}")
|
||||
|
||||
@@ -8,3 +8,4 @@ Jinja2==3.1.2
|
||||
redis==5.0.1
|
||||
|
||||
chromadb>=0.4.0
|
||||
sentence-transformers>=2.2.2
|
||||
|
||||
@@ -24,8 +24,11 @@ def get_news():
|
||||
|
||||
db_articles = []
|
||||
for doc in cursor:
|
||||
# Use English title if available, otherwise fallback to original
|
||||
title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
|
||||
|
||||
article = {
|
||||
'title': doc.get('title', ''),
|
||||
'title': title,
|
||||
'author': doc.get('author'),
|
||||
'link': doc.get('link', ''),
|
||||
'source': doc.get('source', ''),
|
||||
@@ -114,8 +117,10 @@ def get_clustered_news_internal():
|
||||
# Use cluster_articles from aggregation (already fetched)
|
||||
cluster_articles = doc.get('cluster_articles', [])
|
||||
|
||||
title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
|
||||
|
||||
article = {
|
||||
'title': doc.get('title', ''),
|
||||
'title': title,
|
||||
'link': doc.get('link', ''),
|
||||
'source': doc.get('source', ''),
|
||||
'published': doc.get('published_at', ''),
|
||||
@@ -173,7 +178,7 @@ def get_article_by_url(article_url):
|
||||
return jsonify({'error': 'Article not found'}), 404
|
||||
|
||||
return jsonify({
|
||||
'title': article.get('title', ''),
|
||||
'title': article.get('title_en') if article.get('title_en') else article.get('title', ''),
|
||||
'author': article.get('author'),
|
||||
'link': article.get('link', ''),
|
||||
'content': article.get('content', ''),
|
||||
|
||||
88
backend/routes/search_routes.py
Normal file
88
backend/routes/search_routes.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from flask import Blueprint, jsonify, request
|
||||
from config import Config
|
||||
from chroma_client import ChromaClient
|
||||
import logging
|
||||
|
||||
search_bp = Blueprint('search', __name__)
|
||||
|
||||
# Initialize ChromaDB client
|
||||
# Note: connection settings come from Config; CHROMA_HOST defaults to 'chromadb', the service hostname defined in docker-compose.
|
||||
chroma_client = ChromaClient(
|
||||
host=Config.CHROMA_HOST,
|
||||
port=Config.CHROMA_PORT,
|
||||
collection_name=Config.CHROMA_COLLECTION
|
||||
)
|
||||
|
||||
def _format_search_result(item):
    """Convert one ChromaDB search hit into the JSON shape sent to the frontend.

    Args:
        item: A dict as yielded by ``chroma_client.search``; the keys read
            here are ``metadata``, ``document`` and ``distance``.

    Returns:
        A dict with ``title``, ``link``, ``source``, ``category``,
        ``published_at``, ``relevance_score`` and ``snippet``.
    """
    # Use `or` fallbacks so a key that is present but None does not crash
    # the slicing / arithmetic below.
    metadata = item.get('metadata') or {}
    document = item.get('document') or ''
    distance = item.get('distance')
    if distance is None:
        distance = 1.0

    # Only add the ellipsis when the document was actually truncated.
    snippet = document[:200]
    if len(document) > 200:
        snippet += '...'

    # NOTE(review): Chroma metadata is assumed to carry only title, url,
    # source, category and published_at (no separate title_en), so the title
    # shown is whatever was indexed as `title` -- confirm against the
    # indexing code in chroma_client.
    return {
        'title': metadata.get('title', 'Unknown Title'),
        'link': metadata.get('url', ''),
        'source': metadata.get('source', 'Unknown'),
        'category': metadata.get('category', 'general'),
        'published_at': metadata.get('published_at', ''),
        # Approximate conversion of distance to a score; the value can be
        # negative when the reported distance is greater than 1.
        'relevance_score': 1.0 - distance,
        'snippet': snippet,
    }


@search_bp.route('/api/search', methods=['GET'])
def search_news():
    """
    Semantic search for news articles using ChromaDB.

    Query parameters:
    - q: Search query (required)
    - limit: Number of results, a positive integer (default: 10)
    - category: Filter by category (optional)

    Returns:
        200 with ``{'query', 'count', 'results'}`` on success,
        400 when ``q`` is missing/blank or ``limit`` is not a positive integer,
        500 when the search backend fails.
    """
    query = (request.args.get('q') or '').strip()
    if not query:
        return jsonify({'error': 'Missing search query'}), 400

    # A malformed limit is a client error, not a server failure.
    try:
        limit = int(request.args.get('limit', 10))
    except (TypeError, ValueError):
        limit = 0
    if limit < 1:
        return jsonify({'error': 'Invalid limit: must be a positive integer'}), 400

    # Build filter if category provided
    category = request.args.get('category')
    where_filter = {"category": category} if category else None

    try:
        results = chroma_client.search(
            query_text=query,
            n_results=limit,
            where=where_filter,
        )
        formatted_response = [_format_search_result(item) for item in results]
    except Exception:
        # Top-level boundary for the route: keep the traceback in the log and
        # do not expose internal error details to the client.
        logging.exception("Search error")
        return jsonify({'error': 'Internal search error'}), 500

    return jsonify({
        'query': query,
        'count': len(formatted_response),
        'results': formatted_response,
    }), 200
|
||||
Reference in New Issue
Block a user