update

2025-12-10 15:50:11 +00:00
parent 50b9888004
commit 4e8b60f77c
12 changed files with 247 additions and 106 deletions
--- a/backend/app.py
+++ b/backend/app.py
@@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp
 from routes.transport_routes import transport_bp
 from routes.interests_routes import interests_bp
 from routes.personalization_routes import personalization_bp
+from routes.search_routes import search_bp

 # Initialize Flask app
 app = Flask(__name__)
@@ -33,6 +34,7 @@ app.register_blueprint(admin_bp)
 app.register_blueprint(transport_bp)
 app.register_blueprint(interests_bp)
 app.register_blueprint(personalization_bp)
+app.register_blueprint(search_bp)

 # Health check endpoint
@app.route('/health')
--- a/backend/chroma_client.py
+++ b/backend/chroma_client.py
@@ -87,7 +87,8 @@ class ChromaClient:
            
            # Prepare text for embedding (Title + Summary + Start of Content)
            # This gives semantic search a good overview
-            title = article.get('title', '')
+            # Use English title if available, otherwise original
+            title = article.get('title_en') if article.get('title_en') else article.get('title', '')
            summary = article.get('summary') or ''
            content_snippet = article.get('content', '')[:1000]
            
--- a/backend/config.py
+++ b/backend/config.py
@@ -45,6 +45,11 @@ class Config:
    TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
    TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))
    
+    # ChromaDB
+    CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
+    CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
+    CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles')
+    
    @classmethod
    def print_config(cls):
        """Print configuration (without sensitive data)"""
@@ -57,3 +62,5 @@ class Config:
        print(f"  Ollama Enabled: {cls.OLLAMA_ENABLED}")
        print(f"  Tracking Enabled: {cls.TRACKING_ENABLED}")
        print(f"  Tracking API URL: {cls.TRACKING_API_URL}")
+        print(f"  ChromaDB Host: {cls.CHROMA_HOST}")
+        print(f"  ChromaDB Port: {cls.CHROMA_PORT}")
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -8,3 +8,4 @@ Jinja2==3.1.2
 redis==5.0.1

 chromadb>=0.4.0
+sentence-transformers>=2.2.2
--- a/backend/routes/news_routes.py
+++ b/backend/routes/news_routes.py
@@ -24,8 +24,11 @@ def get_news():
        
        db_articles = []
        for doc in cursor:
+            # Use English title if available, otherwise fallback to original
+            title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+            
            article = {
-                'title': doc.get('title', ''),
+                'title': title,
                'author': doc.get('author'),
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
@@ -114,8 +117,10 @@ def get_clustered_news_internal():
            # Use cluster_articles from aggregation (already fetched)
            cluster_articles = doc.get('cluster_articles', [])
            
+            title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+            
            article = {
-                'title': doc.get('title', ''),
+                'title': title,
                'link': doc.get('link', ''),
                'source': doc.get('source', ''),
                'published': doc.get('published_at', ''),
@@ -173,7 +178,7 @@ def get_article_by_url(article_url):
            return jsonify({'error': 'Article not found'}), 404
        
        return jsonify({
-            'title': article.get('title', ''),
+            'title': article.get('title_en') if article.get('title_en') else article.get('title', ''),
            'author': article.get('author'),
            'link': article.get('link', ''),
            'content': article.get('content', ''),
--- a/backend/routes/search_routes.py
+++ b/backend/routes/search_routes.py
@@ -0,0 +1,88 @@
+from flask import Blueprint, jsonify, request
+from config import Config
+from chroma_client import ChromaClient
+import logging
+
+search_bp = Blueprint('search', __name__)
+
+# Initialize ChromaDB client
+# Note: Host, port and collection come from Config; the default host 'chromadb' is the hostname defined in docker-compose for the backend
+chroma_client = ChromaClient(
+    host=Config.CHROMA_HOST,
+    port=Config.CHROMA_PORT,
+    collection_name=Config.CHROMA_COLLECTION
+)
+
+@search_bp.route('/api/search', methods=['GET'])
+def search_news():
+    """
+    Semantic search for news articles using ChromaDB.
+
+    Query parameters:
+    - q: Search query (required)
+    - limit: Number of results (default: 10, must be >= 1, capped at 100)
+    - category: Filter by category (optional)
+
+    Returns:
+    - 200 with {'query', 'count', 'results'} on success
+    - 400 when 'q' is missing or 'limit' is not a positive integer
+    - 500 when the search backend fails (details are logged, not returned)
+    """
+    # Upper bound on results so a client cannot request an arbitrarily large page
+    max_limit = 100
+
+    query = request.args.get('q')
+    if not query:
+        return jsonify({'error': 'Missing search query'}), 400
+
+    # Validate 'limit' outside the broad try so bad input is a 400, not a 500
+    try:
+        limit = int(request.args.get('limit', 10))
+    except (TypeError, ValueError):
+        return jsonify({'error': 'Invalid limit: must be an integer'}), 400
+    if limit < 1:
+        return jsonify({'error': 'Invalid limit: must be at least 1'}), 400
+    limit = min(limit, max_limit)
+
+    # Build filter if category provided
+    category = request.args.get('category')
+    where_filter = {"category": category} if category else None
+
+    try:
+        # Perform search
+        results = chroma_client.search(
+            query_text=query,
+            n_results=limit,
+            where=where_filter
+        )
+
+        # Format for frontend
+        formatted_response = []
+        for item in results or []:
+            # 'or' guards against keys that are present but set to None
+            metadata = item.get('metadata') or {}
+            document = item.get('document') or ''
+            distance = item.get('distance')
+
+            # NOTE: The title shown is whatever was stored in Chroma metadata.
+            # Per the original author's notes, metadata holds title, url, source,
+            # category and published_at, but not title_en.
+            # TODO: store the English title in Chroma metadata (or look it up in
+            # the DB) so search results show translated titles.
+            title = metadata.get('title', 'Unknown Title')
+
+            # Only mark the preview as truncated when it actually was
+            snippet = document[:200]
+            if len(document) > 200:
+                snippet += '...'
+
+            formatted_response.append({
+                'title': title,
+                'link': metadata.get('url', ''),
+                'source': metadata.get('source', 'Unknown'),
+                'category': metadata.get('category', 'general'),
+                'published_at': metadata.get('published_at', ''),
+                # Convert distance to score (approx); missing distance scores 0.0
+                'relevance_score': 1.0 - distance if distance is not None else 0.0,
+                'snippet': snippet  # Preview
+            })
+
+        return jsonify({
+            'query': query,
+            'count': len(formatted_response),
+            'results': formatted_response
+        }), 200
+
+    except Exception:
+        # Top-level boundary: log the full traceback, but do not leak internal
+        # error details to the client
+        logging.exception("Search error")
+        return jsonify({'error': 'Search failed'}), 500