update: add ChromaDB-backed semantic search, prefer English titles, crawl every 3 hours
@@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp
 from routes.transport_routes import transport_bp
 from routes.interests_routes import interests_bp
 from routes.personalization_routes import personalization_bp
+from routes.search_routes import search_bp

 # Initialize Flask app
 app = Flask(__name__)
@@ -33,6 +34,7 @@ app.register_blueprint(admin_bp)
 app.register_blueprint(transport_bp)
 app.register_blueprint(interests_bp)
 app.register_blueprint(personalization_bp)
+app.register_blueprint(search_bp)

 # Health check endpoint
 @app.route('/health')
@@ -87,7 +87,8 @@ class ChromaClient:

         # Prepare text for embedding (Title + Summary + Start of Content)
         # This gives semantic search a good overview
-        title = article.get('title', '')
+        # Use English title if available, otherwise original
+        title = article.get('title_en') if article.get('title_en') else article.get('title', '')
         summary = article.get('summary') or ''
         content_snippet = article.get('content', '')[:1000]
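The hunk ends before the combination step, but the three prepared fields are presumably joined into a single document string for embedding; a minimal sketch under that assumption (the separator is not taken from this commit):

def build_embedding_text(article: dict) -> str:
    # Prefer the English title, mirroring the change above; the "\n\n"
    # separator is an assumption, as the hunk stops before the join.
    title = article.get('title_en') or article.get('title', '')
    summary = article.get('summary') or ''
    content_snippet = (article.get('content') or '')[:1000]
    return "\n\n".join(part for part in (title, summary, content_snippet) if part)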
@@ -45,6 +45,11 @@ class Config:
     TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
     TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))

+    # ChromaDB
+    CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
+    CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
+    CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles')
+
     @classmethod
     def print_config(cls):
         """Print configuration (without sensitive data)"""
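These settings are plain environment lookups with in-cluster defaults ('chromadb' is the docker-compose service name), so running outside Compose only needs overrides in the environment; a minimal sketch with illustrative values:

import os

# Example overrides for running outside docker-compose; values are illustrative.
# They must be set before 'config' is first imported, since Config reads the
# environment at class-definition time.
os.environ["CHROMA_HOST"] = "localhost"
os.environ["CHROMA_PORT"] = "8001"

from config import Config  # same import path used by search_routes.py

Config.print_config()  # now reports ChromaDB Host: localhost, Port: 8001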
@@ -57,3 +62,5 @@ class Config:
         print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}")
         print(f" Tracking Enabled: {cls.TRACKING_ENABLED}")
         print(f" Tracking API URL: {cls.TRACKING_API_URL}")
+        print(f" ChromaDB Host: {cls.CHROMA_HOST}")
+        print(f" ChromaDB Port: {cls.CHROMA_PORT}")
@@ -8,3 +8,4 @@ Jinja2==3.1.2
 redis==5.0.1

 chromadb>=0.4.0
+sentence-transformers>=2.2.2
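Together these two dependencies are enough for a self-hosted semantic index: chromadb provides the HTTP client and collection API, sentence-transformers the embeddings. A minimal sketch of a ChromaClient compatible with the constructor and search() call used by search_routes.py below; the embedding model name is an assumption, not taken from this commit:

from typing import Optional

import chromadb
from chromadb.utils import embedding_functions

class ChromaClient:
    # Minimal sketch matching the interface search_routes.py relies on.
    def __init__(self, host: str, port: int, collection_name: str):
        self.client = chromadb.HttpClient(host=host, port=port)
        embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"  # assumed model, not from the diff
        )
        self.collection = self.client.get_or_create_collection(
            name=collection_name, embedding_function=embed_fn
        )

    def search(self, query_text: str, n_results: int = 10,
               where: Optional[dict] = None):
        # Chroma returns parallel lists per query; flatten them into per-hit
        # dicts shaped the way search_routes.py expects.
        res = self.collection.query(
            query_texts=[query_text], n_results=n_results, where=where
        )
        return [
            {"metadata": meta, "document": doc, "distance": dist}
            for meta, doc, dist in zip(
                res["metadatas"][0], res["documents"][0], res["distances"][0]
            )
        ]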
@@ -24,8 +24,11 @@ def get_news():

     db_articles = []
     for doc in cursor:
+        # Use English title if available, otherwise fallback to original
+        title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+
         article = {
-            'title': doc.get('title', ''),
+            'title': title,
             'author': doc.get('author'),
             'link': doc.get('link', ''),
             'source': doc.get('source', ''),
@@ -114,8 +117,10 @@ def get_clustered_news_internal():
         # Use cluster_articles from aggregation (already fetched)
         cluster_articles = doc.get('cluster_articles', [])

+        title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+
         article = {
-            'title': doc.get('title', ''),
+            'title': title,
             'link': doc.get('link', ''),
             'source': doc.get('source', ''),
             'published': doc.get('published_at', ''),
@@ -173,7 +178,7 @@ def get_article_by_url(article_url):
         return jsonify({'error': 'Article not found'}), 404

     return jsonify({
-        'title': article.get('title', ''),
+        'title': article.get('title_en') if article.get('title_en') else article.get('title', ''),
         'author': article.get('author'),
         'link': article.get('link', ''),
         'content': article.get('content', ''),
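This is the third copy of the title_en-or-title fallback in this file (plus one in chroma_client.py); a tiny helper, hypothetical rather than part of this commit, would keep the call sites in sync:

def display_title(doc: dict) -> str:
    # Hypothetical helper: prefer a non-empty English title, else the original.
    # Equivalent to the conditional expression repeated in the hunks above.
    return doc.get('title_en') or doc.get('title', '')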
backend/routes/search_routes.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+from flask import Blueprint, jsonify, request
+from config import Config
+from chroma_client import ChromaClient
+import logging
+
+search_bp = Blueprint('search', __name__)
+
+# Initialize ChromaDB client
+# Note: We use the hostname 'chromadb' as defined in docker-compose for the backend
+chroma_client = ChromaClient(
+    host=Config.CHROMA_HOST,
+    port=Config.CHROMA_PORT,
+    collection_name=Config.CHROMA_COLLECTION
+)
+
+@search_bp.route('/api/search', methods=['GET'])
+def search_news():
+    """
+    Semantic search for news articles using ChromaDB.
+    Query parameters:
+    - q: Search query (required)
+    - limit: Number of results (default: 10)
+    - category: Filter by category (optional)
+    """
+    try:
+        query = request.args.get('q')
+        if not query:
+            return jsonify({'error': 'Missing search query'}), 400
+
+        limit = int(request.args.get('limit', 10))
+        category = request.args.get('category')
+
+        # Build filter if category provided
+        where_filter = None
+        if category:
+            where_filter = {"category": category}
+
+        # Perform search
+        results = chroma_client.search(
+            query_text=query,
+            n_results=limit,
+            where=where_filter
+        )
+
+        # Format for frontend
+        formatted_response = []
+        for item in results:
+            metadata = item.get('metadata', {})
+            # Use the translated title if available. Chroma metadata is flat and
+            # currently stores only: title, url, source, category, published_at —
+            # there is no separate title_en field. Instead, the ChromaClient change
+            # in this commit writes the English title into 'title' at indexing
+            # time, so search results surface English titles automatically.
+            title = metadata.get('title', 'Unknown Title')
+
+            formatted_response.append({
+                'title': title,
+                'link': metadata.get('url', ''),
+                'source': metadata.get('source', 'Unknown'),
+                'category': metadata.get('category', 'general'),
+                'published_at': metadata.get('published_at', ''),
+                'relevance_score': 1.0 - item.get('distance', 1.0),  # Convert distance to score (approx)
+                'snippet': item.get('document', '')[:200] + '...'  # Preview
+            })
+
+        return jsonify({
+            'query': query,
+            'count': len(formatted_response),
+            'results': formatted_response
+        }), 200
+
+    except Exception as e:
+        logging.error(f"Search error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
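Once the stack is up, the endpoint can be smoke-tested from Python; host and port here are assumptions (5001 matches the backend port mentioned in the docker-compose comments removed below):

import requests

# Host/port are assumptions for a local deployment; adjust to your setup.
resp = requests.get(
    "http://localhost:5001/api/search",
    params={"q": "U-Bahn construction", "limit": 5, "category": "transport"},
    timeout=10,
)
resp.raise_for_status()
data = resp.json()
print(f"{data['count']} hits for {data['query']!r}")
for hit in data["results"]:
    print(f"{hit['relevance_score']:.2f}  {hit['title']}  ({hit['source']})")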
@@ -1,20 +1,3 @@
-# Munich News Daily - Docker Compose Configuration
-#
-# GPU Support:
-# To enable GPU acceleration for Ollama (5-10x faster):
-# 1. Check GPU availability: ./check-gpu.sh
-# 2. Start with GPU: ./start-with-gpu.sh
-# Or manually: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
-#
-# Security:
-# - Only Backend API (port 5001) is exposed to host
-# - MongoDB is internal-only (not exposed to host)
-# - Ollama is internal-only (not exposed to host)
-# - Crawler and Sender are internal-only
-# All services communicate via internal Docker network
-#
-# See docs/OLLAMA_SETUP.md for detailed setup instructions
-
 services:
   # Ollama AI Service (Internal only - not exposed to host)
   ollama:
@@ -29,14 +12,6 @@ services:
     dns:
       - 8.8.8.8
       - 1.1.1.1
-    # GPU support (uncomment if you have NVIDIA GPU)
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - driver: nvidia
-    #           count: all
-    #           capabilities: [gpu]
     healthcheck:
       test: [ "CMD-SHELL", "ollama list || exit 1" ]
       interval: 30s
@@ -113,10 +113,15 @@ function setupInfiniteScroll() {
 }

 // Search functionality
-function handleSearch() {
+let searchTimeout;
+
+async function handleSearch() {
     const searchInput = document.getElementById('searchInput');
     const clearBtn = document.getElementById('clearSearch');
-    searchQuery = searchInput.value.trim().toLowerCase();
+    const searchStats = document.getElementById('searchStats');
+    const newsGrid = document.getElementById('newsGrid');
+
+    searchQuery = searchInput.value.trim();
+
     // Show/hide clear button
     if (searchQuery) {
@@ -125,41 +130,68 @@ function handleSearch() {
         clearBtn.classList.add('hidden');
     }

-    // Filter articles
+    // Clear previous timeout
+    if (searchTimeout) clearTimeout(searchTimeout);
+
+    // If empty query, reset to all articles
     if (searchQuery === '') {
         filteredArticles = allArticles;
-    } else {
-        filteredArticles = allArticles.filter(article => {
-            const title = article.title.toLowerCase();
-            const summary = (article.summary || '').toLowerCase().replace(/<[^>]*>/g, '');
-            const source = formatSourceName(article.source).toLowerCase();
-
-            return title.includes(searchQuery) ||
-                summary.includes(searchQuery) ||
-                source.includes(searchQuery);
-        });
-    }
-
-    // Reset display
-    displayedCount = 0;
-    const newsGrid = document.getElementById('newsGrid');
-    newsGrid.innerHTML = '';
-
-    // Update stats
-    updateSearchStats();
-
-    // Load filtered articles
-    if (filteredArticles.length > 0) {
+        displayedCount = 0;
+        newsGrid.innerHTML = '';
+        updateSearchStats();
         loadMoreArticles();
-    } else {
-        newsGrid.innerHTML = `
-            <div class="text-center py-16">
-                <div class="text-6xl mb-4">🔍</div>
-                <p class="text-xl text-gray-600 mb-2">No articles found</p>
-                <p class="text-gray-400">Try a different search term</p>
-            </div>
-        `;
+        return;
     }

+    // Debounce search API call
+    searchTimeout = setTimeout(async () => {
+        // Show searching state
+        newsGrid.innerHTML = '<div class="text-center py-10 text-gray-500">Searching...</div>';
+
+        try {
+            const response = await fetch(`/api/search?q=${encodeURIComponent(searchQuery)}&limit=20`);
+
+            // Check if response is ok
+            if (!response.ok) {
+                const errorText = await response.text();
+                throw new Error(`Server returned ${response.status}: ${errorText}`);
+            }
+
+            const data = await response.json();
+
+            if (data.results && data.results.length > 0) {
+                // Map results to match card format
+                filteredArticles = data.results.map(item => ({
+                    title: item.title,
+                    link: item.link,
+                    source: item.source,
+                    summary: item.snippet, // Map snippet to summary
+                    published_at: item.published_at,
+                    score: item.relevance_score
+                }));
+
+                displayedCount = 0;
+                newsGrid.innerHTML = '';
+
+                // Update stats
+                searchStats.textContent = `Found ${filteredArticles.length} relevant articles`;
+
+                loadMoreArticles();
+            } else {
+                newsGrid.innerHTML = `
+                    <div class="text-center py-16">
+                        <div class="text-6xl mb-4">🔍</div>
+                        <p class="text-xl text-gray-600 mb-2">No relevant articles found</p>
+                        <p class="text-gray-400">Try different keywords or concepts</p>
+                    </div>
+                `;
+                searchStats.textContent = 'No results found';
+            }
+        } catch (error) {
+            console.error('Search failed:', error);
+            newsGrid.innerHTML = `<div class="text-center py-10 text-red-400">Search failed: ${error.message}</div>`;
+        }
+    }, 500); // 500ms debounce
 }

 function clearSearch() {
@@ -423,7 +455,7 @@ async function unsubscribe() {
 }

 // Close modal when clicking outside
-window.onclick = function(event) {
+window.onclick = function (event) {
     const modal = document.getElementById('unsubscribeModal');
     if (event.target === modal) {
         closeUnsubscribe();
@@ -204,6 +204,31 @@ app.get('/api/ollama/config', async (req, res) => {
     }
 });

+app.get('/api/search', async (req, res) => {
+    try {
+        const { q, limit, category } = req.query;
+        const response = await axios.get(`${API_URL}/api/search`, {
+            params: { q, limit, category }
+        });
+        res.json(response.data);
+    } catch (error) {
+        if (error.response) {
+            // The request was made and the server responded with a status code
+            // that falls out of the range of 2xx
+            console.error('Search API Error:', error.response.status, error.response.data);
+            res.status(error.response.status).json(error.response.data);
+        } else if (error.request) {
+            // The request was made but no response was received
+            console.error('Search API No Response:', error.request);
+            res.status(502).json({ error: 'Search service unavailable (timeout/connection)' });
+        } else {
+            // Something happened in setting up the request that triggered an Error
+            console.error('Search API Request Error:', error.message);
+            res.status(500).json({ error: 'Internal proxy error' });
+        }
+    }
+});
+
 app.listen(PORT, () => {
     console.log(`Frontend server running on http://localhost:${PORT}`);
     console.log(`Admin dashboard: http://localhost:${PORT}/admin.html`);
@@ -87,7 +87,8 @@ class ChromaClient:

         # Prepare text for embedding (Title + Summary + Start of Content)
         # This gives semantic search a good overview
-        title = article.get('title', '')
+        # Use English title if available, otherwise original
+        title = article.get('title_en') if article.get('title_en') else article.get('title', '')
         summary = article.get('summary') or ''
         content_snippet = article.get('content', '')[:1000]
@@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10

     if not feed.entries:
         print(f" ⚠ No entries found in feed")
-        return 0
+        return {
+            'crawled': 0,
+            'summarized': 0,
+            'failed_summaries': 0
+        }

     crawled_count = 0
     summarized_count = 0
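Changing the empty-feed return from an int to a dict changes crawl_rss_feed's contract, so any caller that summed the old integer must consume the dict instead. A hypothetical caller-side adjustment, with 'feeds' assumed to be the configured feed list:

# Hypothetical caller: aggregate per-feed stats now that crawl_rss_feed
# returns a dict instead of an int.
totals = {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}
for feed in feeds:
    stats = crawl_rss_feed(feed['url'], feed['name'], feed.get('category', 'general'))
    for key in totals:
        totals[key] += stats.get(key, 0)
print(f"Crawled {totals['crawled']}, summarized {totals['summarized']}, "
      f"{totals['failed_summaries']} failed summaries")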
@@ -37,12 +37,12 @@ def main():
     """Main scheduler loop"""
     print("🤖 Munich News Crawler Scheduler")
     print("="*60)
-    print("Schedule: Daily at 6:00 AM Berlin time")
+    print("Schedule: Every 3 hours")
     print("Timezone: Europe/Berlin (CET/CEST)")
     print("="*60)

-    # Schedule the crawler to run at 6 AM Berlin time
-    schedule.every().day.at("06:00").do(run_crawler)
+    # Schedule the crawler to run every 3 hours
+    schedule.every(3).hours.do(run_crawler)

     # Show next run time
     berlin_time = datetime.now(BERLIN_TZ)
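For context, schedule.every(3).hours.do() only registers the job; it fires from a polling loop like the one below (a sketch — the actual loop body sits outside this hunk, and the stub plus the 60-second sleep are assumptions):

import time
import schedule

def run_crawler():
    # Stand-in for the real crawler entry point.
    print("crawling...")

schedule.every(3).hours.do(run_crawler)

while True:
    schedule.run_pending()  # executes run_crawler when a run is due
    time.sleep(60)          # polling interval is an assumption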