commit 869ca3a894
parent 433a16ee0e
Date: 2025-11-14 12:51:18 +01:00
20 changed files with 1606 additions and 38 deletions


@@ -12,12 +12,12 @@ COPY backend/config.py /app/config.py
 # Copy crawler files (includes ollama_client.py)
 COPY news_crawler/ /app/
-# Make the scheduler executable
-RUN chmod +x scheduled_crawler.py
+# Make scripts executable
+RUN chmod +x scheduled_crawler.py start.sh
 # Set timezone to Berlin
 ENV TZ=Europe/Berlin
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-# Run the scheduled crawler
-CMD ["python", "-u", "scheduled_crawler.py"]
+# Run both scheduler and worker
+CMD ["/app/start.sh"]


@@ -6,3 +6,4 @@ pymongo==4.6.1
 python-dotenv==1.0.0
 schedule==1.2.0
 pytz==2023.3
+redis==5.0.1
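
The pinned redis client backs the new worker added below. A minimal connectivity check, assuming the same REDIS_URL default that worker.py uses (not part of this commit):

# Hedged sketch: verify the container can reach Redis before relying on the queue.
# Assumes the redis://redis:6379 default from worker.py.
import os
import redis

r = redis.from_url(os.getenv('REDIS_URL', 'redis://redis:6379'), decode_responses=True)
r.ping()  # raises redis.exceptions.ConnectionError if the broker is unreachable
print("Redis reachable")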

news_crawler/start.sh (new file, 8 lines)

@@ -0,0 +1,8 @@
+#!/bin/bash
+# Start both the scheduler and the worker
+
+# Start the worker in the background
+python -u worker.py &
+
+# Start the scheduler in the foreground
+python -u scheduled_crawler.py
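
For comparison, a rough Python rendering of the same supervision pattern, assuming both scripts sit in the working directory; the terminate-on-exit cleanup is an addition, not something the shell script does:

# Hypothetical Python equivalent of start.sh, for illustration only.
import subprocess
import sys

worker = subprocess.Popen([sys.executable, "-u", "worker.py"])  # background, like `&`
try:
    # The scheduler holds the foreground; when it exits, the container exits.
    subprocess.run([sys.executable, "-u", "scheduled_crawler.py"], check=True)
finally:
    worker.terminate()  # cleanup the bash script leaves to container shutdown
    worker.wait()

Either way, once the foreground scheduler exits, PID 1 exits and Docker tears down the backgrounded worker with the container.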

news_crawler/worker.py (new file, 72 lines)

@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""
+News Crawler Worker - Listens to Redis queue and processes crawl tasks
+"""
+import redis
+import json
+import os
+import time
+
+from crawler_service import crawl_all_feeds
+
+REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
+QUEUE_NAME = 'news_crawl_queue'
+
+
+def get_redis_client():
+    """Get Redis client"""
+    return redis.from_url(REDIS_URL, decode_responses=True)
+
+
+def process_crawl_task(message):
+    """Process a crawl task"""
+    try:
+        max_articles = message.get('max_articles', 10)
+        print(f"\n📨 Received task: {message.get('task')}")
+        print(f"   Max articles per feed: {max_articles}")
+        print(f"   Timestamp: {message.get('timestamp')}")
+
+        # Run the crawler
+        result = crawl_all_feeds(max_articles_per_feed=max_articles)
+
+        print(f"✅ Task completed: {result.get('total_articles_crawled')} articles crawled")
+        return True
+    except Exception as e:
+        print(f"❌ Task failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main worker loop"""
+    print("="*70)
+    print("📰 News Crawler Worker Starting")
+    print("="*70)
+    print(f"Redis URL: {REDIS_URL}")
+    print(f"Queue: {QUEUE_NAME}")
+    print("Waiting for tasks...")
+    print("="*70)
+
+    r = get_redis_client()
+
+    while True:
+        try:
+            # Block and wait for messages (timeout 1 second)
+            result = r.brpop(QUEUE_NAME, timeout=1)
+            if result:
+                queue_name, message_json = result
+                message = json.loads(message_json)
+                process_crawl_task(message)
+        except KeyboardInterrupt:
+            print("\n\n👋 Worker stopped by user")
+            break
+        except Exception as e:
+            print(f"\n❌ Worker error: {e}")
+            time.sleep(5)  # Wait before retrying
+
+
+if __name__ == '__main__':
+    main()
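
The worker consumes with BRPOP, so the producer side (presumably scheduled_crawler.py or the backend) pushes JSON onto the other end of the list. A hedged sketch of what enqueueing might look like; the enqueue_crawl_task helper and the 'crawl_all_feeds' task name are hypothetical, and only the queue name and message keys come from worker.py above:

# Hypothetical producer sketch; not part of this commit.
import json
import os
from datetime import datetime, timezone

import redis

r = redis.from_url(os.getenv('REDIS_URL', 'redis://redis:6379'), decode_responses=True)

def enqueue_crawl_task(max_articles=10):
    """Push a crawl task onto the list that worker.py BRPOPs."""
    message = {
        'task': 'crawl_all_feeds',        # task name assumed for illustration
        'max_articles': max_articles,
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }
    # LPUSH pairs with the worker's BRPOP to form a FIFO queue.
    r.lpush('news_crawl_queue', json.dumps(message))

enqueue_crawl_task(max_articles=10)

Since the queue is a plain Redis list, pending tasks survive worker restarts, but a task already popped by BRPOP is lost if the worker crashes mid-crawl.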