From 4e8b60f77cdb320e749df0da376b8d471ffef7cd Mon Sep 17 00:00:00 2001
From: dongho <dongho@ekstrah.com>
Date: Wed, 10 Dec 2025 15:50:11 +0000
Subject: [PATCH] Add semantic search API; prefer English titles

---
 backend/app.py                    |   2 +
 backend/chroma_client.py          |   3 +-
 backend/config.py                 |   7 ++
 backend/requirements.txt          |   1 +
 backend/routes/news_routes.py     |  11 +-
 backend/routes/search_routes.py   |  88 +++++++++++++++
 docker-compose.yml                |  25 -----
 frontend/public/app.js            | 176 ++++++++++++++++++------------
 frontend/server.js                |  25 +++++
 news_crawler/chroma_client.py     |   3 +-
 news_crawler/crawler_service.py   |   6 +-
 news_crawler/scheduled_crawler.py |   6 +-
 12 files changed, 247 insertions(+), 106 deletions(-)
 create mode 100644 backend/routes/search_routes.py

diff --git a/backend/app.py b/backend/app.py
index 06ad59f..9010c95 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp
 from routes.transport_routes import transport_bp
 from routes.interests_routes import interests_bp
 from routes.personalization_routes import personalization_bp
+from routes.search_routes import search_bp
 
 # Initialize Flask app
 app = Flask(__name__)
@@ -33,6 +34,7 @@ app.register_blueprint(admin_bp)
 app.register_blueprint(transport_bp)
 app.register_blueprint(interests_bp)
 app.register_blueprint(personalization_bp)
+app.register_blueprint(search_bp)
 
 # Health check endpoint
 @app.route('/health')
diff --git a/backend/chroma_client.py b/backend/chroma_client.py
index 7476c0b..f3c2a89 100644
--- a/backend/chroma_client.py
+++ b/backend/chroma_client.py
@@ -87,7 +87,8 @@ class ChromaClient:
             
             # Prepare text for embedding (Title + Summary + Start of Content)
             # This gives semantic search a good overview
-            title = article.get('title', '')
+            # Use English title if available, otherwise original
+            title = article.get('title_en') if article.get('title_en') else article.get('title', '')
             summary = article.get('summary') or ''
             content_snippet = article.get('content', '')[:1000]
             
diff --git a/backend/config.py b/backend/config.py
index 4d429bb..ebc6f17 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -45,6 +45,11 @@ class Config:
     TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
     TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))
     
+    # ChromaDB
+    CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
+    CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
+    CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles')
+    
     @classmethod
     def print_config(cls):
         """Print configuration (without sensitive data)"""
@@ -57,3 +62,5 @@ class Config:
         print(f"  Ollama Enabled: {cls.OLLAMA_ENABLED}")
         print(f"  Tracking Enabled: {cls.TRACKING_ENABLED}")
         print(f"  Tracking API URL: {cls.TRACKING_API_URL}")
+        print(f"  ChromaDB Host: {cls.CHROMA_HOST}")
+        print(f"  ChromaDB Port: {cls.CHROMA_PORT}")
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 3626e1e..abe4a4f 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -8,3 +8,4 @@ Jinja2==3.1.2
 redis==5.0.1
 
 chromadb>=0.4.0
+sentence-transformers>=2.2.2
diff --git a/backend/routes/news_routes.py b/backend/routes/news_routes.py
index b03a40d..06647d9 100644
--- a/backend/routes/news_routes.py
+++ b/backend/routes/news_routes.py
@@ -24,8 +24,11 @@ def get_news():
         
         db_articles = []
         for doc in cursor:
+            # Use English title if available, otherwise fallback to original
+            title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+            
             article = {
-                'title': doc.get('title', ''),
+                'title': title,
                 'author': doc.get('author'),
                 'link': doc.get('link', ''),
                 'source': doc.get('source', ''),
@@ -114,8 +117,10 @@ def get_clustered_news_internal():
             # Use cluster_articles from aggregation (already fetched)
             cluster_articles = doc.get('cluster_articles', [])
             
+            title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+            
             article = {
-                'title': doc.get('title', ''),
+                'title': title,
                 'link': doc.get('link', ''),
                 'source': doc.get('source', ''),
                 'published': doc.get('published_at', ''),
@@ -173,7 +178,7 @@ def get_article_by_url(article_url):
             return jsonify({'error': 'Article not found'}), 404
         
         return jsonify({
-            'title': article.get('title', ''),
+            'title': article.get('title_en') if article.get('title_en') else article.get('title', ''),
             'author': article.get('author'),
             'link': article.get('link', ''),
             'content': article.get('content', ''),
diff --git a/backend/routes/search_routes.py b/backend/routes/search_routes.py
new file mode 100644
index 0000000..3b8946e
--- /dev/null
+++ b/backend/routes/search_routes.py
@@ -0,0 +1,88 @@
+from flask import Blueprint, jsonify, request
+from config import Config
+from chroma_client import ChromaClient
+import logging
+
+search_bp = Blueprint('search', __name__)
+
+# Module-level ChromaDB client used by the search route below (created at import time).
+# Host, port and collection come from Config (CHROMA_HOST / CHROMA_PORT / CHROMA_COLLECTION).
+chroma_client = ChromaClient(
+    host=Config.CHROMA_HOST,
+    port=Config.CHROMA_PORT,
+    collection_name=Config.CHROMA_COLLECTION
+)
+
+DEFAULT_SEARCH_LIMIT = 10
+MAX_SEARCH_LIMIT = 50
+SNIPPET_LENGTH = 200
+
+
+@search_bp.route('/api/search', methods=['GET'])
+def search_news():
+    """
+    Semantic search for news articles using ChromaDB.
+    Query parameters:
+    - q: Search query (required, must not be blank)
+    - limit: Number of results (default: 10, clamped to 1..50)
+    - category: Filter by category (optional)
+
+    Returns JSON with `query`, `count` and `results`; 400 for a missing
+    query or a non-integer limit; 500 if the search itself fails.
+    """
+    query = (request.args.get('q') or '').strip()
+    if not query:
+        return jsonify({'error': 'Missing search query'}), 400
+
+    try:
+        limit = int(request.args.get('limit', DEFAULT_SEARCH_LIMIT))
+    except ValueError:
+        return jsonify({'error': 'limit must be an integer'}), 400
+    # Clamp so a single request cannot ask for zero, negative or huge result sets.
+    limit = max(1, min(limit, MAX_SEARCH_LIMIT))
+
+    category = request.args.get('category')
+    where_filter = {"category": category} if category else None
+
+    try:
+        results = chroma_client.search(
+            query_text=query,
+            n_results=limit,
+            where=where_filter
+        )
+
+        # Format for frontend
+        formatted_response = []
+        for item in results:
+            # Treat missing or None fields as empty rather than failing the request.
+            metadata = item.get('metadata') or {}
+            document = item.get('document') or ''
+            distance = item.get('distance')
+            if distance is None:
+                distance = 1.0
+
+            # Only mark the preview as truncated when it really is.
+            snippet = document[:SNIPPET_LENGTH]
+            if len(document) > SNIPPET_LENGTH:
+                snippet += '...'
+
+            formatted_response.append({
+                # Title is taken as stored in Chroma metadata; title_en is not looked up here.
+                'title': metadata.get('title', 'Unknown Title'),
+                'link': metadata.get('url', ''),
+                'source': metadata.get('source', 'Unknown'),
+                'category': metadata.get('category', 'general'),
+                'published_at': metadata.get('published_at', ''),
+                'relevance_score': 1.0 - distance,  # Convert distance to score (approx)
+                'snippet': snippet
+            })
+
+        return jsonify({
+            'query': query,
+            'count': len(formatted_response),
+            'results': formatted_response
+        }), 200
+    except Exception:
+        # Log the traceback server-side; do not echo internals to the client.
+        logging.exception("Search error")
+        return jsonify({'error': 'Search failed'}), 500
diff --git a/docker-compose.yml b/docker-compose.yml
index f1c8f97..73e2bff 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,20 +1,3 @@
-# Munich News Daily - Docker Compose Configuration
-#
-# GPU Support:
-#   To enable GPU acceleration for Ollama (5-10x faster):
-#   1. Check GPU availability: ./check-gpu.sh
-#   2. Start with GPU: ./start-with-gpu.sh
-#   Or manually: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
-#
-# Security:
-#   - Only Backend API (port 5001) is exposed to host
-#   - MongoDB is internal-only (not exposed to host)
-#   - Ollama is internal-only (not exposed to host)
-#   - Crawler and Sender are internal-only
-#   All services communicate via internal Docker network
-#
-# See docs/OLLAMA_SETUP.md for detailed setup instructions
-
 services:
   # Ollama AI Service (Internal only - not exposed to host)
   ollama:
@@ -29,14 +12,6 @@ services:
     dns:
       - 8.8.8.8
       - 1.1.1.1
-    # GPU support (uncomment if you have NVIDIA GPU)
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - driver: nvidia
-    #           count: all
-    #           capabilities: [gpu]
     healthcheck:
       test: [ "CMD-SHELL", "ollama list || exit 1" ]
       interval: 30s
diff --git a/frontend/public/app.js b/frontend/public/app.js
index 8e4f82b..82417e1 100644
--- a/frontend/public/app.js
+++ b/frontend/public/app.js
@@ -19,10 +19,10 @@ async function loadCategories() {
         const response = await fetch('/api/categories');
         const data = await response.json();
         const categories = data.categories || [];
-        
+
         const container = document.getElementById('categoryCheckboxes');
         container.innerHTML = '';
-        
+
         categories.forEach(category => {
             const label = document.createElement('label');
             label.className = 'flex items-center space-x-3 cursor-pointer';
@@ -40,11 +40,11 @@ async function loadCategories() {
 async function loadNews() {
     const newsGrid = document.getElementById('newsGrid');
     newsGrid.innerHTML = '<div class="text-center py-10 text-gray-500">Loading news...</div>';
-    
+
     try {
         const response = await fetch('/api/news');
         const data = await response.json();
-        
+
         if (data.articles && data.articles.length > 0) {
             allArticles = data.articles;
             filteredArticles = data.articles;
@@ -63,24 +63,24 @@ async function loadNews() {
 
 function loadMoreArticles() {
     if (isLoading || displayedCount >= filteredArticles.length) return;
-    
+
     isLoading = true;
     const newsGrid = document.getElementById('newsGrid');
-    
+
     // Remove loading indicator if exists
     const loadingIndicator = document.getElementById('loadingIndicator');
     if (loadingIndicator) loadingIndicator.remove();
-    
+
     // Get next batch of articles
     const nextBatch = filteredArticles.slice(displayedCount, displayedCount + ARTICLES_PER_PAGE);
-    
+
     nextBatch.forEach((article, index) => {
         const card = createNewsCard(article, displayedCount + index);
         newsGrid.appendChild(card);
     });
-    
+
     displayedCount += nextBatch.length;
-    
+
     // Add loading indicator if more articles available
     if (displayedCount < filteredArticles.length) {
         const loader = document.createElement('div');
@@ -95,17 +95,17 @@ function loadMoreArticles() {
         endMessage.textContent = `✓ All ${filteredArticles.length} articles loaded`;
         newsGrid.appendChild(endMessage);
     }
-    
+
     isLoading = false;
 }
 
 function setupInfiniteScroll() {
     window.addEventListener('scroll', () => {
         if (isLoading || displayedCount >= filteredArticles.length) return;
-        
+
         const scrollPosition = window.innerHeight + window.scrollY;
         const threshold = document.documentElement.scrollHeight - 500;
-        
+
         if (scrollPosition >= threshold) {
             loadMoreArticles();
         }
@@ -113,53 +113,85 @@ function setupInfiniteScroll() {
 }
 
 // Search functionality
-function handleSearch() {
+let searchTimeout;
+
+async function handleSearch() {
     const searchInput = document.getElementById('searchInput');
     const clearBtn = document.getElementById('clearSearch');
-    searchQuery = searchInput.value.trim().toLowerCase();
-    
+    const searchStats = document.getElementById('searchStats');
+    const newsGrid = document.getElementById('newsGrid');
+
+    searchQuery = searchInput.value.trim();
+
     // Show/hide clear button
     if (searchQuery) {
         clearBtn.classList.remove('hidden');
     } else {
         clearBtn.classList.add('hidden');
     }
-    
-    // Filter articles
+
+    // Clear previous timeout
+    if (searchTimeout) clearTimeout(searchTimeout);
+
+    // If empty query, reset to all articles
     if (searchQuery === '') {
         filteredArticles = allArticles;
-    } else {
-        filteredArticles = allArticles.filter(article => {
-            const title = article.title.toLowerCase();
-            const summary = (article.summary || '').toLowerCase().replace(/<[^>]*>/g, '');
-            const source = formatSourceName(article.source).toLowerCase();
-            
-            return title.includes(searchQuery) || 
-                   summary.includes(searchQuery) || 
-                   source.includes(searchQuery);
-        });
-    }
-    
-    // Reset display
-    displayedCount = 0;
-    const newsGrid = document.getElementById('newsGrid');
-    newsGrid.innerHTML = '';
-    
-    // Update stats
-    updateSearchStats();
-    
-    // Load filtered articles
-    if (filteredArticles.length > 0) {
+        displayedCount = 0;
+        newsGrid.innerHTML = '';
+        updateSearchStats();
         loadMoreArticles();
-    } else {
-        newsGrid.innerHTML = `
-            <div class="text-center py-16">
-                <div class="text-6xl mb-4">🔍</div>
-                <p class="text-xl text-gray-600 mb-2">No articles found</p>
-                <p class="text-gray-400">Try a different search term</p>
-            </div>
-        `;
+        return;
     }
+
+    // Debounce search API call
+    searchTimeout = setTimeout(async () => {
+        // Remember the query this request was made for, so a stale response can be ignored.
+        const requestedQuery = searchQuery;
+        newsGrid.innerHTML = '<div class="text-center py-10 text-gray-500">Searching...</div>';
+
+        try {
+            const response = await fetch(`/api/search?q=${encodeURIComponent(requestedQuery)}&limit=20`);
+            if (!response.ok) {
+                const errorText = await response.text();
+                throw new Error(`Server returned ${response.status}: ${errorText}`);
+            }
+
+            const data = await response.json();
+            if (requestedQuery !== searchQuery) return; // query changed (or was cleared) while waiting
+
+            if (data.results && data.results.length > 0) {
+                // Map results to match card format
+                filteredArticles = data.results.map(item => ({
+                    title: item.title,
+                    link: item.link,
+                    source: item.source,
+                    summary: item.snippet, // Map snippet to summary
+                    published_at: item.published_at,
+                    score: item.relevance_score
+                }));
+
+                displayedCount = 0;
+                newsGrid.innerHTML = '';
+                searchStats.textContent = `Found ${filteredArticles.length} relevant articles`;
+                loadMoreArticles();
+            } else {
+                newsGrid.innerHTML = `
+                    <div class="text-center py-16">
+                        <div class="text-6xl mb-4">🔍</div>
+                        <p class="text-xl text-gray-600 mb-2">No relevant articles found</p>
+                        <p class="text-gray-400">Try different keywords or concepts</p>
+                    </div>
+                `;
+                searchStats.textContent = 'No results found';
+            }
+        } catch (error) {
+            console.error('Search failed:', error);
+            if (requestedQuery !== searchQuery) return;
+            // Use textContent: the message embeds the server's response body, which must not be parsed as HTML.
+            newsGrid.innerHTML = '<div class="text-center py-10 text-red-400"></div>';
+            newsGrid.firstElementChild.textContent = `Search failed: ${error.message}`;
+        }
+    }, 500); // 500ms debounce
 }
 
 function clearSearch() {
@@ -182,11 +214,11 @@ function createNewsCard(article, index) {
     const card = document.createElement('div');
     card.className = 'group bg-white rounded-xl overflow-hidden shadow-md hover:shadow-xl transition-all duration-300 cursor-pointer border border-gray-100 hover:border-primary/30';
     card.onclick = () => window.open(article.link, '_blank');
-    
+
     // Extract image from summary if it's an img tag (from Süddeutsche)
     let imageUrl = null;
     let cleanSummary = article.summary || 'No summary available.';
-    
+
     if (cleanSummary.includes('<img')) {
         const imgMatch = cleanSummary.match(/src="([^"]+)"/);
         if (imgMatch) {
@@ -195,17 +227,17 @@ function createNewsCard(article, index) {
         // Remove img tag from summary
         cleanSummary = cleanSummary.replace(/<img[^>]*>/g, '').replace(/<\/?p>/g, '').trim();
     }
-    
+
     // Get source icon/emoji
     const sourceIcon = getSourceIcon(article.source);
-    
+
     // Format source name
     const sourceName = formatSourceName(article.source);
-    
+
     // Get word count badge
     const wordCount = article.word_count || article.summary_word_count;
     const readTime = wordCount ? Math.ceil(wordCount / 200) : null;
-    
+
     card.innerHTML = `
         <div class="flex flex-col sm:flex-row">
             <!-- Image -->
@@ -237,11 +269,11 @@ function createNewsCard(article, index) {
             </div>
         </div>
     `;
-    
+
     // Add staggered animation
     card.style.opacity = '0';
     card.style.animation = `fadeIn 0.5s ease-out ${(index % ARTICLES_PER_PAGE) * 0.1}s forwards`;
-    
+
     return card;
 }
 
@@ -293,7 +325,7 @@ async function loadStats() {
     try {
         const response = await fetch('/api/stats');
         const data = await response.json();
-        
+
         if (data.subscribers !== undefined) {
             document.getElementById('subscriberCount').textContent = data.subscribers.toLocaleString();
         }
@@ -306,44 +338,44 @@ async function subscribe() {
     const emailInput = document.getElementById('emailInput');
     const subscribeBtn = document.getElementById('subscribeBtn');
     const formMessage = document.getElementById('formMessage');
-    
+
     const email = emailInput.value.trim();
-    
+
     if (!email || !email.includes('@')) {
         formMessage.textContent = 'Please enter a valid email address';
         formMessage.className = 'text-red-200 font-medium';
         return;
     }
-    
+
     // Get selected categories
     const checkboxes = document.querySelectorAll('#categoryCheckboxes input[type="checkbox"]:checked');
     const categories = Array.from(checkboxes).map(cb => cb.value);
-    
+
     if (categories.length === 0) {
         formMessage.textContent = 'Please select at least one category';
         formMessage.className = 'text-red-200 font-medium';
         return;
     }
-    
+
     subscribeBtn.disabled = true;
     subscribeBtn.textContent = 'Subscribing...';
     subscribeBtn.classList.add('opacity-75', 'cursor-not-allowed');
     formMessage.textContent = '';
-    
+
     try {
         const response = await fetch('/api/subscribe', {
             method: 'POST',
             headers: {
                 'Content-Type': 'application/json'
             },
-            body: JSON.stringify({ 
+            body: JSON.stringify({
                 email: email,
                 categories: categories
             })
         });
-        
+
         const data = await response.json();
-        
+
         if (response.ok) {
             formMessage.textContent = data.message || 'Successfully subscribed! Check your email for confirmation.';
             formMessage.className = 'text-green-200 font-medium';
@@ -384,15 +416,15 @@ function closeUnsubscribe() {
 async function unsubscribe() {
     const emailInput = document.getElementById('unsubscribeEmail');
     const unsubscribeMessage = document.getElementById('unsubscribeMessage');
-    
+
     const email = emailInput.value.trim();
-    
+
     if (!email || !email.includes('@')) {
         unsubscribeMessage.textContent = 'Please enter a valid email address';
         unsubscribeMessage.className = 'text-red-600 font-medium';
         return;
     }
-    
+
     try {
         const response = await fetch('/api/unsubscribe', {
             method: 'POST',
@@ -401,9 +433,9 @@ async function unsubscribe() {
             },
             body: JSON.stringify({ email: email })
         });
-        
+
         const data = await response.json();
-        
+
         if (response.ok) {
             unsubscribeMessage.textContent = data.message || 'Successfully unsubscribed.';
             unsubscribeMessage.className = 'text-green-600 font-medium';
@@ -423,7 +455,7 @@ async function unsubscribe() {
 }
 
 // Close modal when clicking outside
-window.onclick = function(event) {
+window.onclick = function (event) {
     const modal = document.getElementById('unsubscribeModal');
     if (event.target === modal) {
         closeUnsubscribe();
diff --git a/frontend/server.js b/frontend/server.js
index 8261feb..e8c772d 100644
--- a/frontend/server.js
+++ b/frontend/server.js
@@ -204,6 +204,31 @@ app.get('/api/ollama/config', async (req, res) => {
   }
 });
 
+// Proxy semantic search to `${API_URL}/api/search`, forwarding only q, limit and category.
+app.get('/api/search', async (req, res) => {
+  try {
+    const { q, limit, category } = req.query;
+    const response = await axios.get(`${API_URL}/api/search`, {
+      params: { q, limit, category }
+    });
+    res.json(response.data);
+  } catch (error) {
+    if (error.response) {
+      // The upstream API answered with a non-2xx status: relay its status and body
+      console.error('Search API Error:', error.response.status, error.response.data);
+      res.status(error.response.status).json(error.response.data);
+    } else if (error.request) {
+      // No response received; log the code/message, not the whole error.request object
+      console.error('Search API No Response:', error.code || error.message);
+      res.status(502).json({ error: 'Search service unavailable (timeout/connection)' });
+    } else {
+      // Something happened in setting up the request that triggered an Error
+      console.error('Search API Request Error:', error.message);
+      res.status(500).json({ error: 'Internal proxy error' });
+    }
+  }
+});
+
 app.listen(PORT, () => {
   console.log(`Frontend server running on http://localhost:${PORT}`);
   console.log(`Admin dashboard: http://localhost:${PORT}/admin.html`);
diff --git a/news_crawler/chroma_client.py b/news_crawler/chroma_client.py
index 7476c0b..f3c2a89 100644
--- a/news_crawler/chroma_client.py
+++ b/news_crawler/chroma_client.py
@@ -87,7 +87,8 @@ class ChromaClient:
             
             # Prepare text for embedding (Title + Summary + Start of Content)
             # This gives semantic search a good overview
-            title = article.get('title', '')
+            # Use English title if available, otherwise original
+            title = article.get('title_en') if article.get('title_en') else article.get('title', '')
             summary = article.get('summary') or ''
             content_snippet = article.get('content', '')[:1000]
             
diff --git a/news_crawler/crawler_service.py b/news_crawler/crawler_service.py
index 2f08c33..cb666dd 100644
--- a/news_crawler/crawler_service.py
+++ b/news_crawler/crawler_service.py
@@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
         
         if not feed.entries:
             print(f"   ⚠ No entries found in feed")
-            return 0
+            return {
+                'crawled': 0,
+                'summarized': 0,
+                'failed_summaries': 0
+            }
         
         crawled_count = 0
         summarized_count = 0
diff --git a/news_crawler/scheduled_crawler.py b/news_crawler/scheduled_crawler.py
index 73b04db..2f7dd83 100755
--- a/news_crawler/scheduled_crawler.py
+++ b/news_crawler/scheduled_crawler.py
@@ -37,12 +37,12 @@ def main():
     """Main scheduler loop"""
     print("🤖 Munich News Crawler Scheduler")
     print("="*60)
-    print("Schedule: Daily at 6:00 AM Berlin time")
+    print("Schedule: Every 3 hours")
     print("Timezone: Europe/Berlin (CET/CEST)")
     print("="*60)
     
-    # Schedule the crawler to run at 6 AM Berlin time
-    schedule.every().day.at("06:00").do(run_crawler)
+    # Schedule the crawler to run every 3 hours
+    schedule.every(3).hours.do(run_crawler)
     
     # Show next run time
     berlin_time = datetime.now(BERLIN_TZ)