update
@@ -13,6 +13,7 @@ from routes.admin_routes import admin_bp
 from routes.transport_routes import transport_bp
 from routes.interests_routes import interests_bp
 from routes.personalization_routes import personalization_bp
+from routes.search_routes import search_bp
 
 # Initialize Flask app
 app = Flask(__name__)
@@ -33,6 +34,7 @@ app.register_blueprint(admin_bp)
 app.register_blueprint(transport_bp)
 app.register_blueprint(interests_bp)
 app.register_blueprint(personalization_bp)
+app.register_blueprint(search_bp)
 
 # Health check endpoint
 @app.route('/health')
@@ -87,7 +87,8 @@ class ChromaClient:
 
         # Prepare text for embedding (Title + Summary + Start of Content)
         # This gives semantic search a good overview
-        title = article.get('title', '')
+        # Use English title if available, otherwise original
+        title = article.get('title_en') if article.get('title_en') else article.get('title', '')
         summary = article.get('summary') or ''
         content_snippet = article.get('content', '')[:1000]
 
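
Side note on the hunk above: the embedding document is the concatenated title, summary, and first 1000 characters of content. Below is a minimal sketch of how that text could be assembled and written to Chroma over HTTP. The collection name and metadata fields come from this commit; the function name and the use of the article URL as the ID are assumptions:

    import chromadb

    client = chromadb.HttpClient(host='chromadb', port=8000)
    collection = client.get_or_create_collection('munich_news_articles')

    def index_article(article):
        # Prefer the English title, mirroring the change above
        # ('x if x else y' collapses to 'x or y')
        title = article.get('title_en') or article.get('title', '')
        summary = article.get('summary') or ''
        content_snippet = article.get('content', '')[:1000]
        collection.add(
            ids=[article.get('link', '')],  # assumed: article URL as stable ID
            documents=[f"{title}\n{summary}\n{content_snippet}"],
            metadatas=[{
                'title': title,
                'url': article.get('link', ''),
                'source': article.get('source', ''),
                'category': article.get('category', 'general'),
                'published_at': str(article.get('published_at', '')),
            }],
        )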
@@ -45,6 +45,11 @@ class Config:
     TRACKING_API_URL = os.getenv('TRACKING_API_URL', f'http://localhost:{os.getenv("FLASK_PORT", "5000")}')
     TRACKING_DATA_RETENTION_DAYS = int(os.getenv('TRACKING_DATA_RETENTION_DAYS', '90'))
 
+    # ChromaDB
+    CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
+    CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
+    CHROMA_COLLECTION = os.getenv('CHROMA_COLLECTION', 'munich_news_articles')
+
     @classmethod
     def print_config(cls):
         """Print configuration (without sensitive data)"""
@@ -57,3 +62,5 @@ class Config:
         print(f"  Ollama Enabled: {cls.OLLAMA_ENABLED}")
         print(f"  Tracking Enabled: {cls.TRACKING_ENABLED}")
         print(f"  Tracking API URL: {cls.TRACKING_API_URL}")
+        print(f"  ChromaDB Host: {cls.CHROMA_HOST}")
+        print(f"  ChromaDB Port: {cls.CHROMA_PORT}")
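
Because each setting falls back to a default through os.getenv, deployments can override the Chroma connection without touching code. A quick sketch with illustrative values; note the environment must be set before config is imported, since Config reads it at class-definition time:

    import os

    os.environ['CHROMA_HOST'] = 'localhost'  # e.g. running the backend outside Docker
    os.environ['CHROMA_PORT'] = '8001'

    from config import Config

    Config.print_config()  # now reports ChromaDB Host: localhost, ChromaDB Port: 8001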
@@ -8,3 +8,4 @@ Jinja2==3.1.2
 redis==5.0.1
 
 chromadb>=0.4.0
+sentence-transformers>=2.2.2
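
For reference, sentence-transformers pairs with chromadb because Chroma's built-in default embedding function is (as of chromadb 0.4) based on the all-MiniLM-L6-v2 sentence-transformer. The same model can be exercised directly to see what the index stores; this is a sketch, and the example sentence is illustrative:

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')  # the model behind Chroma's default embedder
    vectors = model.encode(['U-Bahn extension approved for Munich north'])
    print(vectors.shape)  # (1, 384): one 384-dimensional embedding per input text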
@@ -24,8 +24,11 @@ def get_news():
 
     db_articles = []
     for doc in cursor:
+        # Use English title if available, otherwise fall back to the original
+        title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+
         article = {
-            'title': doc.get('title', ''),
+            'title': title,
             'author': doc.get('author'),
             'link': doc.get('link', ''),
             'source': doc.get('source', ''),
@@ -114,8 +117,10 @@ def get_clustered_news_internal():
         # Use cluster_articles from aggregation (already fetched)
         cluster_articles = doc.get('cluster_articles', [])
 
+        title = doc.get('title_en') if doc.get('title_en') else doc.get('title', '')
+
         article = {
-            'title': doc.get('title', ''),
+            'title': title,
             'link': doc.get('link', ''),
             'source': doc.get('source', ''),
             'published': doc.get('published_at', ''),
@@ -173,7 +178,7 @@ def get_article_by_url(article_url):
         return jsonify({'error': 'Article not found'}), 404
 
     return jsonify({
-        'title': article.get('title', ''),
+        'title': article.get('title_en') if article.get('title_en') else article.get('title', ''),
         'author': article.get('author'),
         'link': article.get('link', ''),
         'content': article.get('content', ''),
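
The same fallback expression now appears in three routes. Since 'x if x else y' collapses to 'x or y' in Python, a shared helper (hypothetical, not part of this commit) would keep them in sync:

    def display_title(doc):
        """Prefer the crawler's English translation, else the original title."""
        return doc.get('title_en') or doc.get('title', '')

    # usage in each route: 'title': display_title(doc),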
backend/routes/search_routes.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+from flask import Blueprint, jsonify, request
+from config import Config
+from chroma_client import ChromaClient
+import logging
+
+search_bp = Blueprint('search', __name__)
+
+# Initialize ChromaDB client
+# Note: We use the hostname 'chromadb' as defined in docker-compose for the backend
+chroma_client = ChromaClient(
+    host=Config.CHROMA_HOST,
+    port=Config.CHROMA_PORT,
+    collection_name=Config.CHROMA_COLLECTION
+)
+
+@search_bp.route('/api/search', methods=['GET'])
+def search_news():
+    """
+    Semantic search for news articles using ChromaDB.
+    Query parameters:
+    - q: Search query (required)
+    - limit: Number of results (default: 10)
+    - category: Filter by category (optional)
+    """
+    try:
+        query = request.args.get('q')
+        if not query:
+            return jsonify({'error': 'Missing search query'}), 400
+
+        limit = int(request.args.get('limit', 10))
+        category = request.args.get('category')
+
+        # Build filter if category provided
+        where_filter = None
+        if category:
+            where_filter = {"category": category}
+
+        # Perform search
+        results = chroma_client.search(
+            query_text=query,
+            n_results=limit,
+            where=where_filter
+        )
+
+        # Format for frontend
+        formatted_response = []
+        for item in results:
+            metadata = item.get('metadata', {})
+            # Chroma metadata is flat and currently holds only title, url,
+            # source, category and published_at; title_en is not stored as a
+            # separate field. The title written by chroma_client.py now
+            # prefers title_en (see the change above), so newly indexed
+            # articles carry the English title here.
+            title = metadata.get('title', 'Unknown Title')
+
+            formatted_response.append({
+                'title': title,
+                'link': metadata.get('url', ''),
+                'source': metadata.get('source', 'Unknown'),
+                'category': metadata.get('category', 'general'),
+                'published_at': metadata.get('published_at', ''),
+                'relevance_score': 1.0 - item.get('distance', 1.0),  # Convert distance to score (approx)
+                'snippet': item.get('document', '')[:200] + '...'  # Preview
+            })
+
+        return jsonify({
+            'query': query,
+            'count': len(formatted_response),
+            'results': formatted_response
+        }), 200
+
+    except Exception as e:
+        logging.error(f"Search error: {str(e)}")
+        return jsonify({'error': str(e)}), 500
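
One caveat on relevance_score = 1.0 - distance: that inversion only yields a score in [0, 1] when the collection uses cosine distance (where d = 1 - similarity). Chroma collections default to L2 distance, which is unbounded, so the score can go negative. A hedged sketch of a space-aware conversion:

    def distance_to_score(distance: float, space: str = 'l2') -> float:
        """Map a Chroma distance to a relevance score in (0, 1]. Sketch only."""
        if space == 'cosine':
            return 1.0 - distance      # cosine distance d = 1 - cos_sim
        return 1.0 / (1.0 + distance)  # L2: squash monotonically into (0, 1]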
@@ -1,20 +1,3 @@
-# Munich News Daily - Docker Compose Configuration
-#
-# GPU Support:
-# To enable GPU acceleration for Ollama (5-10x faster):
-# 1. Check GPU availability: ./check-gpu.sh
-# 2. Start with GPU: ./start-with-gpu.sh
-# Or manually: docker-compose -f docker-compose.yml -f docker-compose.gpu.yml up -d
-#
-# Security:
-# - Only Backend API (port 5001) is exposed to host
-# - MongoDB is internal-only (not exposed to host)
-# - Ollama is internal-only (not exposed to host)
-# - Crawler and Sender are internal-only
-# All services communicate via internal Docker network
-#
-# See docs/OLLAMA_SETUP.md for detailed setup instructions
-
 services:
   # Ollama AI Service (Internal only - not exposed to host)
   ollama:
@@ -29,14 +12,6 @@ services:
     dns:
       - 8.8.8.8
       - 1.1.1.1
-    # GPU support (uncomment if you have NVIDIA GPU)
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - driver: nvidia
-    #           count: all
-    #           capabilities: [gpu]
     healthcheck:
       test: [ "CMD-SHELL", "ollama list || exit 1" ]
      interval: 30s
@@ -19,10 +19,10 @@ async function loadCategories() {
         const response = await fetch('/api/categories');
         const data = await response.json();
         const categories = data.categories || [];
-
+
         const container = document.getElementById('categoryCheckboxes');
         container.innerHTML = '';
-
+
         categories.forEach(category => {
             const label = document.createElement('label');
             label.className = 'flex items-center space-x-3 cursor-pointer';
@@ -40,11 +40,11 @@ async function loadCategories() {
 async function loadNews() {
     const newsGrid = document.getElementById('newsGrid');
     newsGrid.innerHTML = '<div class="text-center py-10 text-gray-500">Loading news...</div>';
-
+
     try {
         const response = await fetch('/api/news');
         const data = await response.json();
-
+
         if (data.articles && data.articles.length > 0) {
             allArticles = data.articles;
             filteredArticles = data.articles;
@@ -63,24 +63,24 @@ async function loadNews() {
 
 function loadMoreArticles() {
     if (isLoading || displayedCount >= filteredArticles.length) return;
-
+
     isLoading = true;
     const newsGrid = document.getElementById('newsGrid');
-
+
     // Remove loading indicator if exists
     const loadingIndicator = document.getElementById('loadingIndicator');
     if (loadingIndicator) loadingIndicator.remove();
-
+
     // Get next batch of articles
     const nextBatch = filteredArticles.slice(displayedCount, displayedCount + ARTICLES_PER_PAGE);
-
+
     nextBatch.forEach((article, index) => {
         const card = createNewsCard(article, displayedCount + index);
         newsGrid.appendChild(card);
     });
-
+
     displayedCount += nextBatch.length;
-
+
     // Add loading indicator if more articles available
     if (displayedCount < filteredArticles.length) {
         const loader = document.createElement('div');
@@ -95,17 +95,17 @@ function loadMoreArticles() {
         endMessage.textContent = `✓ All ${filteredArticles.length} articles loaded`;
         newsGrid.appendChild(endMessage);
     }
-
+
     isLoading = false;
 }
 
 function setupInfiniteScroll() {
     window.addEventListener('scroll', () => {
         if (isLoading || displayedCount >= filteredArticles.length) return;
-
+
         const scrollPosition = window.innerHeight + window.scrollY;
         const threshold = document.documentElement.scrollHeight - 500;
-
+
         if (scrollPosition >= threshold) {
             loadMoreArticles();
         }
@@ -113,53 +113,85 @@ function setupInfiniteScroll() {
 }
 
 // Search functionality
-function handleSearch() {
+let searchTimeout;
+
+async function handleSearch() {
     const searchInput = document.getElementById('searchInput');
     const clearBtn = document.getElementById('clearSearch');
-    searchQuery = searchInput.value.trim().toLowerCase();
+    const searchStats = document.getElementById('searchStats');
+    const newsGrid = document.getElementById('newsGrid');
+
+    searchQuery = searchInput.value.trim();
 
     // Show/hide clear button
     if (searchQuery) {
         clearBtn.classList.remove('hidden');
     } else {
         clearBtn.classList.add('hidden');
     }
 
-    // Filter articles
+    // Clear previous timeout
+    if (searchTimeout) clearTimeout(searchTimeout);
+
+    // If empty query, reset to all articles
     if (searchQuery === '') {
         filteredArticles = allArticles;
-    } else {
-        filteredArticles = allArticles.filter(article => {
-            const title = article.title.toLowerCase();
-            const summary = (article.summary || '').toLowerCase().replace(/<[^>]*>/g, '');
-            const source = formatSourceName(article.source).toLowerCase();
-
-            return title.includes(searchQuery) ||
-                summary.includes(searchQuery) ||
-                source.includes(searchQuery);
-        });
-    }
-
-    // Reset display
-    displayedCount = 0;
-    const newsGrid = document.getElementById('newsGrid');
-    newsGrid.innerHTML = '';
-
-    // Update stats
-    updateSearchStats();
-
-    // Load filtered articles
-    if (filteredArticles.length > 0) {
+        displayedCount = 0;
+        newsGrid.innerHTML = '';
+        updateSearchStats();
         loadMoreArticles();
-    } else {
-        newsGrid.innerHTML = `
-            <div class="text-center py-16">
-                <div class="text-6xl mb-4">🔍</div>
-                <p class="text-xl text-gray-600 mb-2">No articles found</p>
-                <p class="text-gray-400">Try a different search term</p>
-            </div>
-        `;
+        return;
     }
+
+    // Debounce search API call
+    searchTimeout = setTimeout(async () => {
+        // Show searching state
+        newsGrid.innerHTML = '<div class="text-center py-10 text-gray-500">Searching...</div>';
+
+        try {
+            const response = await fetch(`/api/search?q=${encodeURIComponent(searchQuery)}&limit=20`);
+
+            // Check if response is ok
+            if (!response.ok) {
+                const errorText = await response.text();
+                throw new Error(`Server returned ${response.status}: ${errorText}`);
+            }
+
+            const data = await response.json();
+
+            if (data.results && data.results.length > 0) {
+                // Map results to match card format
+                filteredArticles = data.results.map(item => ({
+                    title: item.title,
+                    link: item.link,
+                    source: item.source,
+                    summary: item.snippet, // Map snippet to summary
+                    published_at: item.published_at,
+                    score: item.relevance_score
+                }));
+
+                displayedCount = 0;
+                newsGrid.innerHTML = '';
+
+                // Update stats
+                searchStats.textContent = `Found ${filteredArticles.length} relevant articles`;
+
+                loadMoreArticles();
+            } else {
+                newsGrid.innerHTML = `
+                    <div class="text-center py-16">
+                        <div class="text-6xl mb-4">🔍</div>
+                        <p class="text-xl text-gray-600 mb-2">No relevant articles found</p>
+                        <p class="text-gray-400">Try different keywords or concepts</p>
+                    </div>
+                `;
+                searchStats.textContent = 'No results found';
+            }
+        } catch (error) {
+            console.error('Search failed:', error);
+            newsGrid.innerHTML = `<div class="text-center py-10 text-red-400">Search failed: ${error.message}</div>`;
+        }
+    }, 500); // 500ms debounce
 }
 
 function clearSearch() {
@@ -182,11 +214,11 @@ function createNewsCard(article, index) {
     const card = document.createElement('div');
     card.className = 'group bg-white rounded-xl overflow-hidden shadow-md hover:shadow-xl transition-all duration-300 cursor-pointer border border-gray-100 hover:border-primary/30';
     card.onclick = () => window.open(article.link, '_blank');
-
+
     // Extract image from summary if it's an img tag (from Süddeutsche)
     let imageUrl = null;
     let cleanSummary = article.summary || 'No summary available.';
-
+
     if (cleanSummary.includes('<img')) {
         const imgMatch = cleanSummary.match(/src="([^"]+)"/);
         if (imgMatch) {
@@ -195,17 +227,17 @@ function createNewsCard(article, index) {
         // Remove img tag from summary
         cleanSummary = cleanSummary.replace(/<img[^>]*>/g, '').replace(/<\/?p>/g, '').trim();
     }
-
+
     // Get source icon/emoji
     const sourceIcon = getSourceIcon(article.source);
-
+
     // Format source name
     const sourceName = formatSourceName(article.source);
-
+
     // Get word count badge
     const wordCount = article.word_count || article.summary_word_count;
     const readTime = wordCount ? Math.ceil(wordCount / 200) : null;
-
+
     card.innerHTML = `
         <div class="flex flex-col sm:flex-row">
             <!-- Image -->
@@ -237,11 +269,11 @@ function createNewsCard(article, index) {
             </div>
         </div>
     `;
-
+
     // Add staggered animation
     card.style.opacity = '0';
     card.style.animation = `fadeIn 0.5s ease-out ${(index % ARTICLES_PER_PAGE) * 0.1}s forwards`;
-
+
     return card;
 }
 
@@ -293,7 +325,7 @@ async function loadStats() {
     try {
         const response = await fetch('/api/stats');
         const data = await response.json();
-
+
         if (data.subscribers !== undefined) {
             document.getElementById('subscriberCount').textContent = data.subscribers.toLocaleString();
         }
@@ -306,44 +338,44 @@ async function subscribe() {
     const emailInput = document.getElementById('emailInput');
     const subscribeBtn = document.getElementById('subscribeBtn');
     const formMessage = document.getElementById('formMessage');
-
+
     const email = emailInput.value.trim();
-
+
     if (!email || !email.includes('@')) {
         formMessage.textContent = 'Please enter a valid email address';
         formMessage.className = 'text-red-200 font-medium';
         return;
     }
-
+
     // Get selected categories
     const checkboxes = document.querySelectorAll('#categoryCheckboxes input[type="checkbox"]:checked');
     const categories = Array.from(checkboxes).map(cb => cb.value);
-
+
     if (categories.length === 0) {
         formMessage.textContent = 'Please select at least one category';
         formMessage.className = 'text-red-200 font-medium';
         return;
     }
-
+
     subscribeBtn.disabled = true;
     subscribeBtn.textContent = 'Subscribing...';
     subscribeBtn.classList.add('opacity-75', 'cursor-not-allowed');
     formMessage.textContent = '';
-
+
     try {
         const response = await fetch('/api/subscribe', {
             method: 'POST',
             headers: {
                 'Content-Type': 'application/json'
             },
-            body: JSON.stringify({
+            body: JSON.stringify({
                 email: email,
                 categories: categories
             })
         });
-
+
         const data = await response.json();
-
+
         if (response.ok) {
             formMessage.textContent = data.message || 'Successfully subscribed! Check your email for confirmation.';
             formMessage.className = 'text-green-200 font-medium';
@@ -384,15 +416,15 @@ function closeUnsubscribe() {
 async function unsubscribe() {
     const emailInput = document.getElementById('unsubscribeEmail');
     const unsubscribeMessage = document.getElementById('unsubscribeMessage');
-
+
     const email = emailInput.value.trim();
-
+
     if (!email || !email.includes('@')) {
         unsubscribeMessage.textContent = 'Please enter a valid email address';
         unsubscribeMessage.className = 'text-red-600 font-medium';
         return;
     }
-
+
     try {
         const response = await fetch('/api/unsubscribe', {
             method: 'POST',
@@ -401,9 +433,9 @@ async function unsubscribe() {
         },
         body: JSON.stringify({ email: email })
     });
-
+
     const data = await response.json();
-
+
     if (response.ok) {
         unsubscribeMessage.textContent = data.message || 'Successfully unsubscribed.';
         unsubscribeMessage.className = 'text-green-600 font-medium';
@@ -423,7 +455,7 @@ async function unsubscribe() {
 }
 
 // Close modal when clicking outside
-window.onclick = function(event) {
+window.onclick = function (event) {
     const modal = document.getElementById('unsubscribeModal');
     if (event.target === modal) {
         closeUnsubscribe();
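
The setTimeout/clearTimeout pair above is a classic trailing-edge debounce: every keystroke cancels the pending search and re-arms a 500 ms timer, so only the final query reaches /api/search. The same pattern in Python, as a sketch (the class and names are illustrative):

    import threading

    class Debouncer:
        """Collapse rapid repeated calls into one trailing call."""

        def __init__(self, delay_s=0.5):
            self.delay_s = delay_s
            self._timer = None

        def call(self, fn, *args, **kwargs):
            if self._timer is not None:
                self._timer.cancel()  # like clearTimeout: drop the pending call
            self._timer = threading.Timer(self.delay_s, fn, args, kwargs)
            self._timer.start()       # like setTimeout: re-arm the delay

    # debouncer.call(run_search, query) on every keystroke fires run_search
    # once, 0.5 s after typing pauses.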
@@ -204,6 +204,31 @@ app.get('/api/ollama/config', async (req, res) => {
     }
 });
 
+app.get('/api/search', async (req, res) => {
+    try {
+        const { q, limit, category } = req.query;
+        const response = await axios.get(`${API_URL}/api/search`, {
+            params: { q, limit, category }
+        });
+        res.json(response.data);
+    } catch (error) {
+        if (error.response) {
+            // The request was made and the server responded with a status code
+            // that falls out of the range of 2xx
+            console.error('Search API Error:', error.response.status, error.response.data);
+            res.status(error.response.status).json(error.response.data);
+        } else if (error.request) {
+            // The request was made but no response was received
+            console.error('Search API No Response:', error.request);
+            res.status(502).json({ error: 'Search service unavailable (timeout/connection)' });
+        } else {
+            // Something happened in setting up the request that triggered an Error
+            console.error('Search API Request Error:', error.message);
+            res.status(500).json({ error: 'Internal proxy error' });
+        }
+    }
+});
+
 app.listen(PORT, () => {
     console.log(`Frontend server running on http://localhost:${PORT}`);
     console.log(`Admin dashboard: http://localhost:${PORT}/admin.html`);
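
The proxy distinguishes the three axios failure modes: a non-2xx reply, no reply at all, and a request that never left. The same triage sketched in Python with requests, for comparison (function and variable names are illustrative, not part of the commit):

    import requests

    def proxy_search(api_url, q, limit=10, category=None):
        """Forward a search query and distinguish the three failure modes."""
        try:
            r = requests.get(f"{api_url}/api/search",
                             params={'q': q, 'limit': limit, 'category': category},
                             timeout=10)
            r.raise_for_status()
            return r.json(), 200
        except requests.HTTPError as err:
            # server answered with a non-2xx status
            return err.response.json(), err.response.status_code
        except (requests.ConnectionError, requests.Timeout):
            # no usable response was received
            return {'error': 'Search service unavailable (timeout/connection)'}, 502
        except requests.RequestException:
            # the request could not even be built or sent
            return {'error': 'Internal proxy error'}, 500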
@@ -87,7 +87,8 @@ class ChromaClient:
 
         # Prepare text for embedding (Title + Summary + Start of Content)
         # This gives semantic search a good overview
-        title = article.get('title', '')
+        # Use English title if available, otherwise original
+        title = article.get('title_en') if article.get('title_en') else article.get('title', '')
         summary = article.get('summary') or ''
         content_snippet = article.get('content', '')[:1000]
 
@@ -340,7 +340,11 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
 
     if not feed.entries:
         print(f"  ⚠ No entries found in feed")
-        return 0
+        return {
+            'crawled': 0,
+            'summarized': 0,
+            'failed_summaries': 0
+        }
 
     crawled_count = 0
     summarized_count = 0
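
Changing the early return from 0 to a stats dict means every caller of crawl_rss_feed must now treat the result as a mapping rather than a count. A sketch of how a caller might aggregate per-feed stats (the loop and the feeds structure are assumptions; the keys come from the hunk above):

    totals = {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}

    for feed in feeds:
        result = crawl_rss_feed(feed['url'], feed['name'], feed.get('category', 'general'))
        for key in totals:
            totals[key] += result.get(key, 0)

    print(f"Crawled {totals['crawled']}, summarized {totals['summarized']}, "
          f"{totals['failed_summaries']} summaries failed")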
@@ -37,12 +37,12 @@ def main():
     """Main scheduler loop"""
     print("🤖 Munich News Crawler Scheduler")
     print("="*60)
-    print("Schedule: Daily at 6:00 AM Berlin time")
+    print("Schedule: Every 3 hours")
     print("Timezone: Europe/Berlin (CET/CEST)")
     print("="*60)
 
-    # Schedule the crawler to run at 6 AM Berlin time
-    schedule.every().day.at("06:00").do(run_crawler)
+    # Schedule the crawler to run every 3 hours
+    schedule.every(3).hours.do(run_crawler)
 
     # Show next run time
     berlin_time = datetime.now(BERLIN_TZ)
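
With the schedule library, schedule.every(3).hours.do(run_crawler) only registers the job; a polling loop still has to drive it. A minimal sketch of the loop such a scheduler typically ends with (the sleep interval is an assumption):

    import time
    import schedule

    schedule.every(3).hours.do(run_crawler)  # register the job, as in the hunk above

    while True:
        schedule.run_pending()  # execute any job whose interval has elapsed
        time.sleep(60)          # poll once a minute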