From 4e8b60f77cdb320e749df0da376b8d471ffef7cd Mon Sep 17 00:00:00 2001 From: dongho Date: Wed, 10 Dec 2025 15:50:11 +0000 Subject: [PATCH] update --- backend/app.py | 2 + backend/chroma_client.py | 3 +- backend/config.py | 7 ++ backend/requirements.txt | 1 + backend/routes/news_routes.py | 11 +- backend/routes/search_routes.py | 88 +++++++++++++++ docker-compose.yml | 25 ----- frontend/public/app.js | 176 ++++++++++++++++++------------ frontend/server.js | 25 +++++ news_crawler/chroma_client.py | 3 +- news_crawler/crawler_service.py | 6 +- news_crawler/scheduled_crawler.py | 6 +- 12 files changed, 247 insertions(+), 106 deletions(-) create mode 100644 backend/routes/search_routes.py diff --git a/backend/app.py b/backend/app.py index 06ad59f..9010c95 100644 --- a/backend/app.py +++ b/backend/app.py @@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp from routes.transport_routes import transport_bp from routes.interests_routes import interests_bp from routes.personalization_routes import personalization_bp +from routes.search_routes import search_bp # Initialize Flask app app = Flask(__name__) @@ -33,6 +34,7 @@ app.register_blueprint(admin_bp) app.register_blueprint(transport_bp) app.register_blueprint(interests_bp) app.register_blueprint(personalization_bp) +app.register_blueprint(search_bp) # Health check endpoint @app.route('/health') diff --git a/backend/chroma_client.py b/backend/chroma_client.py index 7476c0b..f3c2a89 100644 --- a/backend/chroma_client.py +++ b/backend/chroma_client.py @@ -87,7 +87,8 @@ class ChromaClient: # Prepare text for embedding (Title + Summary + Start of Content) # This gives semantic search a good overview - title = article.get('title', '') + # Use English title if available, otherwise original + title = article.get('title_en') if article.get('title_en') else article.get('title', '') summary = article.get('summary') or '' content_snippet = article.get('content', '')[:1000] diff --git a/backend/config.py b/backend/config.py index 4d429bb..ebc6f17 100644 --- a/backend/config.py +++ b/backend/config.py @@ -45,6 +45,11 @@ class Config: TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}') TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90')) + # ChromaDB + CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb') + CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000')) + CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles') + @classmethod def print_config(cls): """Print configuration (without sensitive data)""" @@ -57,3 +62,5 @@ class Config: print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}") print(f" Tracking Enabled: {cls.TRACKING_ENABLED}") print(f" Tracking API URL: {cls.TRACKING_API_URL}") + print(f" ChromaDB Host: {cls.CHROMA_HOST}") + print(f" ChromaDB Port: {cls.CHROMA_PORT}") diff --git a/backend/requirements.txt b/backend/requirements.txt index 3626e1e..abe4a4f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,3 +8,4 @@ Jinja2==3.1.2 redis==5.0.1 chromadb>=0.4.0 +sentence-transformers>=2.2.2 diff --git a/backend/routes/news_routes.py b/backend/routes/news_routes.py index b03a40d..06647d9 100644 --- a/backend/routes/news_routes.py +++ b/backend/routes/news_routes.py @@ -24,8 +24,11 @@ def get_news(): db_articles = [] for doc in cursor: + # Use English title if available, otherwise fallback to original + title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '') + article = { - 'title': doc.get('title', ''), + 'title': title, 'author': doc.get('author'), 'link': doc.get('link', ''), 'source': doc.get('source', ''), @@ -114,8 +117,10 @@ def get_clustered_news_internal(): # Use cluster_articles from aggregation (already fetched) cluster_articles = doc.get('cluster_articles', []) + title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '') + article = { - 'title': doc.get('title', ''), + 'title': title, 'link': doc.get('link', ''), 'source': doc.get('source', ''), 'published': doc.get('published_at', ''), @@ -173,7 +178,7 @@ def get_article_by_url(article_url): return jsonify({'error': 'Article not found'}), 404 return jsonify({ - 'title': article.get('title', ''), + 'title': article.get('title_en') if article.get('title_en') else article.get('title', ''), 'author': article.get('author'), 'link': article.get('link', ''), 'content': article.get('content', ''), diff --git a/backend/routes/search_routes.py b/backend/routes/search_routes.py new file mode 100644 index 0000000..3b8946e --- /dev/null +++ b/backend/routes/search_routes.py @@ -0,0 +1,88 @@ +from flask import Blueprint, jsonify, request +from config import Config +from chroma_client import ChromaClient +import logging + +search_bp = Blueprint('search', __name__) + +# Initialize ChromaDB client +# Note: We use the hostname 'chromadb' as defined in docker-compose for the backend +chroma_client = ChromaClient( + host=Config.CHROMA_HOST, + port=Config.CHROMA_PORT, + collection_name=Config.CHROMA_COLLECTION +) + +@search_bp.route('/api/search', methods=['GET']) +def search_news(): + """ + Semantic search for news articles using ChromaDB. + Query parameters: + - q: Search query (required) + - limit: Number of results (default: 10) + - category: Filter by category (optional) + """ + try: + query = request.args.get('q') + if not query: + return jsonify({'error': 'Missing search query'}), 400 + + limit = int(request.args.get('limit', 10)) + category = request.args.get('category') + + # Build filter if category provided + where_filter = None + if category: + where_filter = {"category": category} + + # Perform search + results = chroma_client.search( + query_text=query, + n_results=limit, + where=where_filter + ) + + # Format for frontend + formatted_response = [] + for item in results: + metadata = item.get('metadata', {}) + # Use translated title if availble (stored in metadata as title_en or title) + # Note: Chroma metadata structure is flat. If we store title_en, we should use it. + # But currently we store: title, url, source, category, published_at. + # We need to make sure title_en is stored in Chroma OR fetch it from DB. + # Faster approach: just rely on what is in Chroma. + # BETTER: In crawl, we store title as title_en in metadata if available? + # Let's check how we store it in crawler_service.py/chroma_client.py + + # Correction: Looking at crawler_service.py line 456, we pass article_doc to add_articles. + # In chroma_client.py line 97, we only extract title, url, source, category, published_at. + # We are NOT storing title_en in Chroma metadata currently. + + # FOR NOW: We will stick to the title stored in Chroma, but we should update Chroma storing logic. + # However, since the user IS complaining about English, let's assume valid English titles + # are what we want to display. + + # Wait, if we change the metadata in ChromaClient to use title_en as the main title, + # then search results will automatically show English. + + title = metadata.get('title', 'Unknown Title') + + formatted_response.append({ + 'title': title, + 'link': metadata.get('url', ''), + 'source': metadata.get('source', 'Unknown'), + 'category': metadata.get('category', 'general'), + 'published_at': metadata.get('published_at', ''), + 'relevance_score': 1.0 - item.get('distance', 1.0), # Convert distance to score (approx) + 'snippet': item.get('document', '')[:200] + '...' # Preview + }) + + return jsonify({ + 'query': query, + 'count': len(formatted_response), + 'results': formatted_response + }), 200 + + except Exception as e: + logging.error(f"Search error: {str(e)}") + return jsonify({'error': str(e)}), 500 diff --git a/docker-compose.yml b/docker-compose.yml index f1c8f97..73e2bff 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,20 +1,3 @@ -# Munich News Daily - Docker Compose Configuration -# -# GPU Support: -# To enable GPU acceleration for Ollama (5-10x faster): -# 1. Check GPU availability: ./check-gpu.sh -# 2. Start with GPU: ./start-with-gpu.sh -# Or manually: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d -# -# Security: -# - Only Backend API (port 5001) is exposed to host -# - MongoDB is internal-only (not exposed to host) -# - Ollama is internal-only (not exposed to host) -# - Crawler and Sender are internal-only -# All services communicate via internal Docker network -# -# See docs/OLLAMA_SETUP.md for detailed setup instructions - services: # Ollama AI Service (Internal only - not exposed to host) ollama: @@ -29,14 +12,6 @@ services: dns: - 8.8.8.8 - 1.1.1.1 - # GPU support (uncomment if you have NVIDIA GPU) - # deploy: - # resources: - # reservations: - # devices: - # - driver: nvidia - # count: all - # capabilities: [gpu] healthcheck: test: [ "CMD-SHELL", "ollama list || exit 1" ] interval: 30s diff --git a/frontend/public/app.js b/frontend/public/app.js index 8e4f82b..82417e1 100644 --- a/frontend/public/app.js +++ b/frontend/public/app.js @@ -19,10 +19,10 @@ async function loadCategories() { const response = await fetch('/api/categories'); const data = await response.json(); const categories = data.categories || []; - + const container = document.getElementById('categoryCheckboxes'); container.innerHTML = ''; - + categories.forEach(category => { const label = document.createElement('label'); label.className = 'flex items-center space-x-3 cursor-pointer'; @@ -40,11 +40,11 @@ async function loadCategories() { async function loadNews() { const newsGrid = document.getElementById('newsGrid'); newsGrid.innerHTML = '
Loading news...
'; - + try { const response = await fetch('/api/news'); const data = await response.json(); - + if (data.articles && data.articles.length > 0) { allArticles = data.articles; filteredArticles = data.articles; @@ -63,24 +63,24 @@ async function loadNews() { function loadMoreArticles() { if (isLoading || displayedCount >= filteredArticles.length) return; - + isLoading = true; const newsGrid = document.getElementById('newsGrid'); - + // Remove loading indicator if exists const loadingIndicator = document.getElementById('loadingIndicator'); if (loadingIndicator) loadingIndicator.remove(); - + // Get next batch of articles const nextBatch = filteredArticles.slice(displayedCount, displayedCount + ARTICLES_PER_PAGE); - + nextBatch.forEach((article, index) => { const card = createNewsCard(article, displayedCount + index); newsGrid.appendChild(card); }); - + displayedCount += nextBatch.length; - + // Add loading indicator if more articles available if (displayedCount < filteredArticles.length) { const loader = document.createElement('div'); @@ -95,17 +95,17 @@ function loadMoreArticles() { endMessage.textContent = `✓ All ${filteredArticles.length} articles loaded`; newsGrid.appendChild(endMessage); } - + isLoading = false; } function setupInfiniteScroll() { window.addEventListener('scroll', () => { if (isLoading || displayedCount >= filteredArticles.length) return; - + const scrollPosition = window.innerHeight + window.scrollY; const threshold = document.documentElement.scrollHeight - 500; - + if (scrollPosition >= threshold) { loadMoreArticles(); } @@ -113,53 +113,85 @@ function setupInfiniteScroll() { } // Search functionality -function handleSearch() { +let searchTimeout; + +async function handleSearch() { const searchInput = document.getElementById('searchInput'); const clearBtn = document.getElementById('clearSearch'); - searchQuery = searchInput.value.trim().toLowerCase(); - + const searchStats = document.getElementById('searchStats'); + const newsGrid = document.getElementById('newsGrid'); + + searchQuery = searchInput.value.trim(); + // Show/hide clear button if (searchQuery) { clearBtn.classList.remove('hidden'); } else { clearBtn.classList.add('hidden'); } - - // Filter articles + + // Clear previous timeout + if (searchTimeout) clearTimeout(searchTimeout); + + // If empty query, reset to all articles if (searchQuery === '') { filteredArticles = allArticles; - } else { - filteredArticles = allArticles.filter(article => { - const title = article.title.toLowerCase(); - const summary = (article.summary || '').toLowerCase().replace(/<[^>]*>/g, ''); - const source = formatSourceName(article.source).toLowerCase(); - - return title.includes(searchQuery) || - summary.includes(searchQuery) || - source.includes(searchQuery); - }); - } - - // Reset display - displayedCount = 0; - const newsGrid = document.getElementById('newsGrid'); - newsGrid.innerHTML = ''; - - // Update stats - updateSearchStats(); - - // Load filtered articles - if (filteredArticles.length > 0) { + displayedCount = 0; + newsGrid.innerHTML = ''; + updateSearchStats(); loadMoreArticles(); - } else { - newsGrid.innerHTML = ` -
-
🔍
-

No articles found

-

Try a different search term

-
- `; + return; } + + // Debounce search API call + searchTimeout = setTimeout(async () => { + // Show searching state + newsGrid.innerHTML = '
Searching...
'; + + try { + const response = await fetch(`/api/search?q=${encodeURIComponent(searchQuery)}&limit=20`); + + // Check if response is ok + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Server returned ${response.status}: ${errorText}`); + } + + const data = await response.json(); + + if (data.results && data.results.length > 0) { + // Map results to match card format + filteredArticles = data.results.map(item => ({ + title: item.title, + link: item.link, + source: item.source, + summary: item.snippet, // Map snippet to summary + published_at: item.published_at, + score: item.relevance_score + })); + + displayedCount = 0; + newsGrid.innerHTML = ''; + + // Update stats + searchStats.textContent = `Found ${filteredArticles.length} relevant articles`; + + loadMoreArticles(); + } else { + newsGrid.innerHTML = ` +
+
🔍
+

No relevant articles found

+

Try different keywords or concepts

+
+ `; + searchStats.textContent = 'No results found'; + } + } catch (error) { + console.error('Search failed:', error); + newsGrid.innerHTML = `
Search failed: ${error.message}
`; + } + }, 500); // 500ms debounce } function clearSearch() { @@ -182,11 +214,11 @@ function createNewsCard(article, index) { const card = document.createElement('div'); card.className = 'group bg-white rounded-xl overflow-hidden shadow-md hover:shadow-xl transition-all duration-300 cursor-pointer border border-gray-100 hover:border-primary/30'; card.onclick = () => window.open(article.link, '_blank'); - + // Extract image from summary if it's an img tag (from Süddeutsche) let imageUrl = null; let cleanSummary = article.summary || 'No summary available.'; - + if (cleanSummary.includes(']*>/g, '').replace(/<\/?p>/g, '').trim(); } - + // Get source icon/emoji const sourceIcon = getSourceIcon(article.source); - + // Format source name const sourceName = formatSourceName(article.source); - + // Get word count badge const wordCount = article.word_count || article.summary_word_count; const readTime = wordCount ? Math.ceil(wordCount / 200) : null; - + card.innerHTML = `
@@ -237,11 +269,11 @@ function createNewsCard(article, index) {
`; - + // Add staggered animation card.style.opacity = '0'; card.style.animation = `fadeIn 0.5s ease-out ${(index % ARTICLES_PER_PAGE) * 0.1}s forwards`; - + return card; } @@ -293,7 +325,7 @@ async function loadStats() { try { const response = await fetch('/api/stats'); const data = await response.json(); - + if (data.subscribers !== undefined) { document.getElementById('subscriberCount').textContent = data.subscribers.toLocaleString(); } @@ -306,44 +338,44 @@ async function subscribe() { const emailInput = document.getElementById('emailInput'); const subscribeBtn = document.getElementById('subscribeBtn'); const formMessage = document.getElementById('formMessage'); - + const email = emailInput.value.trim(); - + if (!email || !email.includes('@')) { formMessage.textContent = 'Please enter a valid email address'; formMessage.className = 'text-red-200 font-medium'; return; } - + // Get selected categories const checkboxes = document.querySelectorAll('#categoryCheckboxes input[type="checkbox"]:checked'); const categories = Array.from(checkboxes).map(cb => cb.value); - + if (categories.length === 0) { formMessage.textContent = 'Please select at least one category'; formMessage.className = 'text-red-200 font-medium'; return; } - + subscribeBtn.disabled = true; subscribeBtn.textContent = 'Subscribing...'; subscribeBtn.classList.add('opacity-75', 'cursor-not-allowed'); formMessage.textContent = ''; - + try { const response = await fetch('/api/subscribe', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ + body: JSON.stringify({ email: email, categories: categories }) }); - + const data = await response.json(); - + if (response.ok) { formMessage.textContent = data.message || 'Successfully subscribed! Check your email for confirmation.'; formMessage.className = 'text-green-200 font-medium'; @@ -384,15 +416,15 @@ function closeUnsubscribe() { async function unsubscribe() { const emailInput = document.getElementById('unsubscribeEmail'); const unsubscribeMessage = document.getElementById('unsubscribeMessage'); - + const email = emailInput.value.trim(); - + if (!email || !email.includes('@')) { unsubscribeMessage.textContent = 'Please enter a valid email address'; unsubscribeMessage.className = 'text-red-600 font-medium'; return; } - + try { const response = await fetch('/api/unsubscribe', { method: 'POST', @@ -401,9 +433,9 @@ async function unsubscribe() { }, body: JSON.stringify({ email: email }) }); - + const data = await response.json(); - + if (response.ok) { unsubscribeMessage.textContent = data.message || 'Successfully unsubscribed.'; unsubscribeMessage.className = 'text-green-600 font-medium'; @@ -423,7 +455,7 @@ async function unsubscribe() { } // Close modal when clicking outside -window.onclick = function(event) { +window.onclick = function (event) { const modal = document.getElementById('unsubscribeModal'); if (event.target === modal) { closeUnsubscribe(); diff --git a/frontend/server.js b/frontend/server.js index 8261feb..e8c772d 100644 --- a/frontend/server.js +++ b/frontend/server.js @@ -204,6 +204,31 @@ app.get('/api/ollama/config', async (req, res) => { } }); +app.get('/api/search', async (req, res) => { + try { + const { q, limit, category } = req.query; + const response = await axios.get(`${API_URL}/api/search`, { + params: { q, limit, category } + }); + res.json(response.data); + } catch (error) { + if (error.response) { + // The request was made and the server responded with a status code + // that falls out of the range of 2xx + console.error('Search API Error:', error.response.status, error.response.data); + res.status(error.response.status).json(error.response.data); + } else if (error.request) { + // The request was made but no response was received + console.error('Search API No Response:', error.request); + res.status(502).json({ error: 'Search service unavailable (timeout/connection)' }); + } else { + // Something happened in setting up the request that triggered an Error + console.error('Search API Request Error:', error.message); + res.status(500).json({ error: 'Internal proxy error' }); + } + } +}); + app.listen(PORT, () => { console.log(`Frontend server running on http://localhost:${PORT}`); console.log(`Admin dashboard: http://localhost:${PORT}/admin.html`); diff --git a/news_crawler/chroma_client.py b/news_crawler/chroma_client.py index 7476c0b..f3c2a89 100644 --- a/news_crawler/chroma_client.py +++ b/news_crawler/chroma_client.py @@ -87,7 +87,8 @@ class ChromaClient: # Prepare text for embedding (Title + Summary + Start of Content) # This gives semantic search a good overview - title = article.get('title', '') + # Use English title if available, otherwise original + title = article.get('title_en') if article.get('title_en') else article.get('title', '') summary = article.get('summary') or '' content_snippet = article.get('content', '')[:1000] diff --git a/news_crawler/crawler_service.py b/news_crawler/crawler_service.py index 2f08c33..cb666dd 100644 --- a/news_crawler/crawler_service.py +++ b/news_crawler/crawler_service.py @@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10 if not feed.entries: print(f" ⚠ No entries found in feed") - return 0 + return { + 'crawled': 0, + 'summarized': 0, + 'failed_summaries': 0 + } crawled_count = 0 summarized_count = 0 diff --git a/news_crawler/scheduled_crawler.py b/news_crawler/scheduled_crawler.py index 73b04db..2f7dd83 100755 --- a/news_crawler/scheduled_crawler.py +++ b/news_crawler/scheduled_crawler.py @@ -37,12 +37,12 @@ def main(): """Main scheduler loop""" print("🤖 Munich News Crawler Scheduler") print("="*60) - print("Schedule: Daily at 6:00 AM Berlin time") + print("Schedule: Every 3 hours") print("Timezone: Europe/Berlin (CET/CEST)") print("="*60) - # Schedule the crawler to run at 6 AM Berlin time - schedule.every().day.at("06:00").do(run_crawler) + # Schedule the crawler to run every 3 hours + schedule.every(3).hours.do(run_crawler) # Show next run time berlin_time = datetime.now(BERLIN_TZ)