diff --git a/backend/app.py b/backend/app.py
index 06ad59f..9010c95 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp
from routes.transport_routes import transport_bp
from routes.interests_routes import interests_bp
from routes.personalization_routes import personalization_bp
+from routes.search_routes import search_bp
# Initialize Flask app
app = Flask(__name__)
@@ -33,6 +34,7 @@ app.register_blueprint(admin_bp)
app.register_blueprint(transport_bp)
app.register_blueprint(interests_bp)
app.register_blueprint(personalization_bp)
+app.register_blueprint(search_bp)
# Health check endpoint
@app.route('/health')
diff --git a/backend/chroma_client.py b/backend/chroma_client.py
index 7476c0b..f3c2a89 100644
--- a/backend/chroma_client.py
+++ b/backend/chroma_client.py
@@ -87,7 +87,8 @@ class ChromaClient:
# Prepare text for embedding (Title + Summary + Start of Content)
# This gives semantic search a good overview
- title = article.get('title', '')
+ # Use English title if available, otherwise original
+ title = article.get('title_en') if article.get('title_en') else article.get('title', '')
summary = article.get('summary') or ''
content_snippet = article.get('content', '')[:1000]
diff --git a/backend/config.py b/backend/config.py
index 4d429bb..ebc6f17 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -45,6 +45,11 @@ class Config:
TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))
+ # ChromaDB
+ CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
+ CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
+ CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles')
+
@classmethod
def print_config(cls):
"""Print configuration (without sensitive data)"""
@@ -57,3 +62,5 @@ class Config:
print(f" Ollama Enabled: {cls.OLLAMA_ENABLED}")
print(f" Tracking Enabled: {cls.TRACKING_ENABLED}")
print(f" Tracking API URL: {cls.TRACKING_API_URL}")
+ print(f" ChromaDB Host: {cls.CHROMA_HOST}")
+ print(f" ChromaDB Port: {cls.CHROMA_PORT}")
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 3626e1e..abe4a4f 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -8,3 +8,4 @@ Jinja2==3.1.2
redis==5.0.1
chromadb>=0.4.0
+sentence-transformers>=2.2.2
diff --git a/backend/routes/news_routes.py b/backend/routes/news_routes.py
index b03a40d..06647d9 100644
--- a/backend/routes/news_routes.py
+++ b/backend/routes/news_routes.py
@@ -24,8 +24,11 @@ def get_news():
db_articles = []
for doc in cursor:
+ # Use English title if available, otherwise fallback to original
+ title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+
article = {
- 'title': doc.get('title', ''),
+ 'title': title,
'author': doc.get('author'),
'link': doc.get('link', ''),
'source': doc.get('source', ''),
@@ -114,8 +117,10 @@ def get_clustered_news_internal():
# Use cluster_articles from aggregation (already fetched)
cluster_articles = doc.get('cluster_articles', [])
+ title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+
article = {
- 'title': doc.get('title', ''),
+ 'title': title,
'link': doc.get('link', ''),
'source': doc.get('source', ''),
'published': doc.get('published_at', ''),
@@ -173,7 +178,7 @@ def get_article_by_url(article_url):
return jsonify({'error': 'Article not found'}), 404
return jsonify({
- 'title': article.get('title', ''),
+ 'title': article.get('title_en') if article.get('title_en') else article.get('title', ''),
'author': article.get('author'),
'link': article.get('link', ''),
'content': article.get('content', ''),
diff --git a/backend/routes/search_routes.py b/backend/routes/search_routes.py
new file mode 100644
index 0000000..3b8946e
--- /dev/null
+++ b/backend/routes/search_routes.py
@@ -0,0 +1,77 @@
+from flask import Blueprint, jsonify, request
+from config import Config
+from chroma_client import ChromaClient
+import logging
+
+search_bp = Blueprint('search', __name__)
+
+# Initialize ChromaDB client
+# Note: We use the hostname 'chromadb' as defined in docker-compose for the backend
+chroma_client = ChromaClient(
+    host=Config.CHROMA_HOST,
+    port=Config.CHROMA_PORT,
+    collection_name=Config.CHROMA_COLLECTION
+)
+
+@search_bp.route('/api/search', methods=['GET'])
+def search_news():
+    """
+    Semantic search for news articles using ChromaDB.
+    Query parameters:
+    - q: Search query (required)
+    - limit: Number of results (default: 10)
+    - category: Filter by category (optional)
+    """
+    try:
+        query = request.args.get('q')
+        if not query:
+            return jsonify({'error': 'Missing search query'}), 400
+
+        # Validate limit explicitly so a bad value yields a 400, not a 500
+        try:
+            limit = int(request.args.get('limit', 10))
+        except ValueError:
+            return jsonify({'error': 'limit must be an integer'}), 400
+        category = request.args.get('category')
+
+        # Build filter if category provided
+        where_filter = None
+        if category:
+            where_filter = {"category": category}
+
+        # Perform search
+        results = chroma_client.search(
+            query_text=query,
+            n_results=limit,
+            where=where_filter
+        )
+
+        # Format for frontend
+        formatted_response = []
+        for item in results:
+            metadata = item.get('metadata', {})
+            # NOTE(review): Chroma metadata currently stores only title, url,
+            # source, category and published_at -- title_en is NOT stored, so
+            # this shows whatever title the crawler indexed. Update the
+            # ChromaClient metadata if English titles are required here.
+            title = metadata.get('title', 'Unknown Title')
+
+            formatted_response.append({
+                'title': title,
+                'link': metadata.get('url', ''),
+                'source': metadata.get('source', 'Unknown'),
+                'category': metadata.get('category', 'general'),
+                'published_at': metadata.get('published_at', ''),
+                'relevance_score': 1.0 - item.get('distance', 1.0),  # Convert distance to score (approx)
+                'snippet': item.get('document', '')[:200] + '...'  # Preview
+            })
+
+        return jsonify({
+            'query': query,
+            'count': len(formatted_response),
+            'results': formatted_response
+        }), 200
+
+    except Exception as e:
+        logging.error(f"Search error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
diff --git a/docker-compose.yml b/docker-compose.yml
index f1c8f97..73e2bff 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,20 +1,3 @@
-# Munich News Daily - Docker Compose Configuration
-#
-# GPU Support:
-# To enable GPU acceleration for Ollama (5-10x faster):
-# 1. Check GPU availability: ./check-gpu.sh
-# 2. Start with GPU: ./start-with-gpu.sh
-# Or manually: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
-#
-# Security:
-# - Only Backend API (port 5001) is exposed to host
-# - MongoDB is internal-only (not exposed to host)
-# - Ollama is internal-only (not exposed to host)
-# - Crawler and Sender are internal-only
-# All services communicate via internal Docker network
-#
-# See docs/OLLAMA_SETUP.md for detailed setup instructions
-
services:
# Ollama AI Service (Internal only - not exposed to host)
ollama:
@@ -29,14 +12,6 @@ services:
dns:
- 8.8.8.8
- 1.1.1.1
- # GPU support (uncomment if you have NVIDIA GPU)
- # deploy:
- # resources:
- # reservations:
- # devices:
- # - driver: nvidia
- # count: all
- # capabilities: [gpu]
healthcheck:
test: [ "CMD-SHELL", "ollama list || exit 1" ]
interval: 30s
diff --git a/frontend/public/app.js b/frontend/public/app.js
index 8e4f82b..82417e1 100644
--- a/frontend/public/app.js
+++ b/frontend/public/app.js
@@ -19,10 +19,10 @@ async function loadCategories() {
const response = await fetch('/api/categories');
const data = await response.json();
const categories = data.categories || [];
-
+
const container = document.getElementById('categoryCheckboxes');
container.innerHTML = '';
-
+
categories.forEach(category => {
const label = document.createElement('label');
label.className = 'flex items-center space-x-3 cursor-pointer';
@@ -40,11 +40,11 @@ async function loadCategories() {
async function loadNews() {
const newsGrid = document.getElementById('newsGrid');
newsGrid.innerHTML = '
Loading news...
';
-
+
try {
const response = await fetch('/api/news');
const data = await response.json();
-
+
if (data.articles && data.articles.length > 0) {
allArticles = data.articles;
filteredArticles = data.articles;
@@ -63,24 +63,24 @@ async function loadNews() {
function loadMoreArticles() {
if (isLoading || displayedCount >= filteredArticles.length) return;
-
+
isLoading = true;
const newsGrid = document.getElementById('newsGrid');
-
+
// Remove loading indicator if exists
const loadingIndicator = document.getElementById('loadingIndicator');
if (loadingIndicator) loadingIndicator.remove();
-
+
// Get next batch of articles
const nextBatch = filteredArticles.slice(displayedCount, displayedCount + ARTICLES_PER_PAGE);
-
+
nextBatch.forEach((article, index) => {
const card = createNewsCard(article, displayedCount + index);
newsGrid.appendChild(card);
});
-
+
displayedCount += nextBatch.length;
-
+
// Add loading indicator if more articles available
if (displayedCount < filteredArticles.length) {
const loader = document.createElement('div');
@@ -95,17 +95,17 @@ function loadMoreArticles() {
endMessage.textContent = `✓ All ${filteredArticles.length} articles loaded`;
newsGrid.appendChild(endMessage);
}
-
+
isLoading = false;
}
function setupInfiniteScroll() {
window.addEventListener('scroll', () => {
if (isLoading || displayedCount >= filteredArticles.length) return;
-
+
const scrollPosition = window.innerHeight + window.scrollY;
const threshold = document.documentElement.scrollHeight - 500;
-
+
if (scrollPosition >= threshold) {
loadMoreArticles();
}
@@ -113,53 +113,85 @@ function setupInfiniteScroll() {
}
// Search functionality
-function handleSearch() {
+let searchTimeout;
+
+async function handleSearch() {
const searchInput = document.getElementById('searchInput');
const clearBtn = document.getElementById('clearSearch');
- searchQuery = searchInput.value.trim().toLowerCase();
-
+ const searchStats = document.getElementById('searchStats');
+ const newsGrid = document.getElementById('newsGrid');
+
+ searchQuery = searchInput.value.trim();
+
// Show/hide clear button
if (searchQuery) {
clearBtn.classList.remove('hidden');
} else {
clearBtn.classList.add('hidden');
}
-
- // Filter articles
+
+ // Clear previous timeout
+ if (searchTimeout) clearTimeout(searchTimeout);
+
+ // If empty query, reset to all articles
if (searchQuery === '') {
filteredArticles = allArticles;
- } else {
- filteredArticles = allArticles.filter(article => {
- const title = article.title.toLowerCase();
- const summary = (article.summary || '').toLowerCase().replace(/<[^>]*>/g, '');
- const source = formatSourceName(article.source).toLowerCase();
-
- return title.includes(searchQuery) ||
- summary.includes(searchQuery) ||
- source.includes(searchQuery);
- });
- }
-
- // Reset display
- displayedCount = 0;
- const newsGrid = document.getElementById('newsGrid');
- newsGrid.innerHTML = '';
-
- // Update stats
- updateSearchStats();
-
- // Load filtered articles
- if (filteredArticles.length > 0) {
+ displayedCount = 0;
+ newsGrid.innerHTML = '';
+ updateSearchStats();
loadMoreArticles();
- } else {
- newsGrid.innerHTML = `
-
-
🔍
-
No articles found
-
Try a different search term
-
- `;
+ return;
}
+
+ // Debounce search API call
+ searchTimeout = setTimeout(async () => {
+ // Show searching state
+ newsGrid.innerHTML = 'Searching...
';
+
+ try {
+ const response = await fetch(`/api/search?q=${encodeURIComponent(searchQuery)}&limit=20`);
+
+ // Check if response is ok
+ if (!response.ok) {
+ const errorText = await response.text();
+ throw new Error(`Server returned ${response.status}: ${errorText}`);
+ }
+
+ const data = await response.json();
+
+ if (data.results && data.results.length > 0) {
+ // Map results to match card format
+ filteredArticles = data.results.map(item => ({
+ title: item.title,
+ link: item.link,
+ source: item.source,
+ summary: item.snippet, // Map snippet to summary
+ published_at: item.published_at,
+ score: item.relevance_score
+ }));
+
+ displayedCount = 0;
+ newsGrid.innerHTML = '';
+
+ // Update stats
+ searchStats.textContent = `Found ${filteredArticles.length} relevant articles`;
+
+ loadMoreArticles();
+ } else {
+ newsGrid.innerHTML = `
+
+
🔍
+
No relevant articles found
+
Try different keywords or concepts
+
+ `;
+ searchStats.textContent = 'No results found';
+ }
+ } catch (error) {
+ console.error('Search failed:', error);
+ newsGrid.innerHTML = `Search failed: ${error.message}
`;
+ }
+ }, 500); // 500ms debounce
}
function clearSearch() {
@@ -182,11 +214,11 @@ function createNewsCard(article, index) {
const card = document.createElement('div');
card.className = 'group bg-white rounded-xl overflow-hidden shadow-md hover:shadow-xl transition-all duration-300 cursor-pointer border border-gray-100 hover:border-primary/30';
card.onclick = () => window.open(article.link, '_blank');
-
+
// Extract image from summary if it's an img tag (from Süddeutsche)
let imageUrl = null;
let cleanSummary = article.summary || 'No summary available.';
-
+
if (cleanSummary.includes('
]*>/g, '').replace(/<\/?p>/g, '').trim();
}
-
+
// Get source icon/emoji
const sourceIcon = getSourceIcon(article.source);
-
+
// Format source name
const sourceName = formatSourceName(article.source);
-
+
// Get word count badge
const wordCount = article.word_count || article.summary_word_count;
const readTime = wordCount ? Math.ceil(wordCount / 200) : null;
-
+
card.innerHTML = `
@@ -237,11 +269,11 @@ function createNewsCard(article, index) {
`;
-
+
// Add staggered animation
card.style.opacity = '0';
card.style.animation = `fadeIn 0.5s ease-out ${(index % ARTICLES_PER_PAGE) * 0.1}s forwards`;
-
+
return card;
}
@@ -293,7 +325,7 @@ async function loadStats() {
try {
const response = await fetch('/api/stats');
const data = await response.json();
-
+
if (data.subscribers !== undefined) {
document.getElementById('subscriberCount').textContent = data.subscribers.toLocaleString();
}
@@ -306,44 +338,44 @@ async function subscribe() {
const emailInput = document.getElementById('emailInput');
const subscribeBtn = document.getElementById('subscribeBtn');
const formMessage = document.getElementById('formMessage');
-
+
const email = emailInput.value.trim();
-
+
if (!email || !email.includes('@')) {
formMessage.textContent = 'Please enter a valid email address';
formMessage.className = 'text-red-200 font-medium';
return;
}
-
+
// Get selected categories
const checkboxes = document.querySelectorAll('#categoryCheckboxes input[type="checkbox"]:checked');
const categories = Array.from(checkboxes).map(cb => cb.value);
-
+
if (categories.length === 0) {
formMessage.textContent = 'Please select at least one category';
formMessage.className = 'text-red-200 font-medium';
return;
}
-
+
subscribeBtn.disabled = true;
subscribeBtn.textContent = 'Subscribing...';
subscribeBtn.classList.add('opacity-75', 'cursor-not-allowed');
formMessage.textContent = '';
-
+
try {
const response = await fetch('/api/subscribe', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
- body: JSON.stringify({
+ body: JSON.stringify({
email: email,
categories: categories
})
});
-
+
const data = await response.json();
-
+
if (response.ok) {
formMessage.textContent = data.message || 'Successfully subscribed! Check your email for confirmation.';
formMessage.className = 'text-green-200 font-medium';
@@ -384,15 +416,15 @@ function closeUnsubscribe() {
async function unsubscribe() {
const emailInput = document.getElementById('unsubscribeEmail');
const unsubscribeMessage = document.getElementById('unsubscribeMessage');
-
+
const email = emailInput.value.trim();
-
+
if (!email || !email.includes('@')) {
unsubscribeMessage.textContent = 'Please enter a valid email address';
unsubscribeMessage.className = 'text-red-600 font-medium';
return;
}
-
+
try {
const response = await fetch('/api/unsubscribe', {
method: 'POST',
@@ -401,9 +433,9 @@ async function unsubscribe() {
},
body: JSON.stringify({ email: email })
});
-
+
const data = await response.json();
-
+
if (response.ok) {
unsubscribeMessage.textContent = data.message || 'Successfully unsubscribed.';
unsubscribeMessage.className = 'text-green-600 font-medium';
@@ -423,7 +455,7 @@ async function unsubscribe() {
}
// Close modal when clicking outside
-window.onclick = function(event) {
+window.onclick = function (event) {
const modal = document.getElementById('unsubscribeModal');
if (event.target === modal) {
closeUnsubscribe();
diff --git a/frontend/server.js b/frontend/server.js
index 8261feb..e8c772d 100644
--- a/frontend/server.js
+++ b/frontend/server.js
@@ -204,6 +204,31 @@ app.get('/api/ollama/config', async (req, res) => {
}
});
+app.get('/api/search', async (req, res) => {
+ try {
+ const { q, limit, category } = req.query;
+ const response = await axios.get(`${API_URL}/api/search`, {
+ params: { q, limit, category }
+ });
+ res.json(response.data);
+ } catch (error) {
+ if (error.response) {
+ // The request was made and the server responded with a status code
+ // that falls out of the range of 2xx
+ console.error('Search API Error:', error.response.status, error.response.data);
+ res.status(error.response.status).json(error.response.data);
+ } else if (error.request) {
+ // The request was made but no response was received
+ console.error('Search API No Response:', error.request);
+ res.status(502).json({ error: 'Search service unavailable (timeout/connection)' });
+ } else {
+ // Something happened in setting up the request that triggered an Error
+ console.error('Search API Request Error:', error.message);
+ res.status(500).json({ error: 'Internal proxy error' });
+ }
+ }
+});
+
app.listen(PORT, () => {
console.log(`Frontend server running on http://localhost:${PORT}`);
console.log(`Admin dashboard: http://localhost:${PORT}/admin.html`);
diff --git a/news_crawler/chroma_client.py b/news_crawler/chroma_client.py
index 7476c0b..f3c2a89 100644
--- a/news_crawler/chroma_client.py
+++ b/news_crawler/chroma_client.py
@@ -87,7 +87,8 @@ class ChromaClient:
# Prepare text for embedding (Title + Summary + Start of Content)
# This gives semantic search a good overview
- title = article.get('title', '')
+ # Use English title if available, otherwise original
+ title = article.get('title_en') if article.get('title_en') else article.get('title', '')
summary = article.get('summary') or ''
content_snippet = article.get('content', '')[:1000]
diff --git a/news_crawler/crawler_service.py b/news_crawler/crawler_service.py
index 2f08c33..cb666dd 100644
--- a/news_crawler/crawler_service.py
+++ b/news_crawler/crawler_service.py
@@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
if not feed.entries:
print(f" ⚠ No entries found in feed")
- return 0
+ return {
+ 'crawled': 0,
+ 'summarized': 0,
+ 'failed_summaries': 0
+ }
crawled_count = 0
summarized_count = 0
diff --git a/news_crawler/scheduled_crawler.py b/news_crawler/scheduled_crawler.py
index 73b04db..2f7dd83 100755
--- a/news_crawler/scheduled_crawler.py
+++ b/news_crawler/scheduled_crawler.py
@@ -37,12 +37,12 @@ def main():
"""Main scheduler loop"""
print("🤖 Munich News Crawler Scheduler")
print("="*60)
- print("Schedule: Daily at 6:00 AM Berlin time")
+ print("Schedule: Every 3 hours")
print("Timezone: Europe/Berlin (CET/CEST)")
print("="*60)
- # Schedule the crawler to run at 6 AM Berlin time
- schedule.every().day.at("06:00").do(run_crawler)
+ # Schedule the crawler to run every 3 hours
+ schedule.every(3).hours.do(run_crawler)
# Show next run time
berlin_time = datetime.now(BERLIN_TZ)