update
@@ -12,12 +12,12 @@ COPY backend/config.py /app/config.py
 # Copy crawler files (includes ollama_client.py)
 COPY news_crawler/ /app/
 
-# Make the scheduler executable
-RUN chmod +x scheduled_crawler.py
+# Make scripts executable
+RUN chmod +x scheduled_crawler.py start.sh
 
 # Set timezone to Berlin
 ENV TZ=Europe/Berlin
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
-# Run the scheduled crawler
-CMD ["python", "-u", "scheduled_crawler.py"]
+# Run both scheduler and worker
+CMD ["/app/start.sh"]
@@ -6,3 +6,4 @@ pymongo==4.6.1
 python-dotenv==1.0.0
 schedule==1.2.0
 pytz==2023.3
+redis==5.0.1
news_crawler/start.sh (new file, 8 lines)
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Start both the scheduler and the worker
+
+# Start the worker in the background
+python -u worker.py &
+
+# Start the scheduler in the foreground
+python -u scheduled_crawler.py
news_crawler/worker.py (new file, 72 lines)
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""
+News Crawler Worker - Listens to Redis queue and processes crawl tasks
+"""
+import redis
+import json
+import os
+import time
+from crawler_service import crawl_all_feeds
+
+REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
+QUEUE_NAME = 'news_crawl_queue'
+
+
+def get_redis_client():
+    """Get Redis client"""
+    return redis.from_url(REDIS_URL, decode_responses=True)
+
+
+def process_crawl_task(message):
+    """Process a crawl task"""
+    try:
+        max_articles = message.get('max_articles', 10)
+        print(f"\n📨 Received task: {message.get('task')}")
+        print(f"   Max articles per feed: {max_articles}")
+        print(f"   Timestamp: {message.get('timestamp')}")
+
+        # Run the crawler
+        result = crawl_all_feeds(max_articles_per_feed=max_articles)
+
+        print(f"✅ Task completed: {result.get('total_articles_crawled')} articles crawled")
+        return True
+
+    except Exception as e:
+        print(f"❌ Task failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def main():
+    """Main worker loop"""
+    print("="*70)
+    print("📰 News Crawler Worker Starting")
+    print("="*70)
+    print(f"Redis URL: {REDIS_URL}")
+    print(f"Queue: {QUEUE_NAME}")
+    print("Waiting for tasks...")
+    print("="*70)
+
+    r = get_redis_client()
+
+    while True:
+        try:
+            # Block and wait for messages (timeout 1 second)
+            result = r.brpop(QUEUE_NAME, timeout=1)
+
+            if result:
+                queue_name, message_json = result
+                message = json.loads(message_json)
+                process_crawl_task(message)
+
+        except KeyboardInterrupt:
+            print("\n\n👋 Worker stopped by user")
+            break
+        except Exception as e:
+            print(f"\n❌ Worker error: {e}")
+            time.sleep(5)  # Wait before retrying
+
+
+if __name__ == '__main__':
+    main()
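The producer side of this queue is not part of the commit, so for context here is a minimal sketch of how a task could be enqueued, assuming the scheduler (or the backend) pushes JSON messages onto the same news_crawl_queue with LPUSH so the worker's BRPOP receives them. The field names task, max_articles, and timestamp match what process_crawl_task reads; the file name, the 'crawl_all_feeds' task label, and the helper itself are illustrative, not taken from this repository.

# enqueue_crawl_task.py - illustrative producer sketch, not part of this commit
import json
import os
from datetime import datetime, timezone

import redis

REDIS_URL = os.getenv('REDIS_URL', 'redis://redis:6379')
QUEUE_NAME = 'news_crawl_queue'  # must match QUEUE_NAME in worker.py


def enqueue_crawl_task(max_articles=10):
    """Push a crawl task onto the queue consumed by worker.py."""
    r = redis.from_url(REDIS_URL, decode_responses=True)
    task = {
        'task': 'crawl_all_feeds',     # label only; the worker just logs it (assumed value)
        'max_articles': max_articles,  # read by the worker via message.get('max_articles', 10)
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }
    # LPUSH pairs with the worker's BRPOP, giving FIFO processing
    r.lpush(QUEUE_NAME, json.dumps(task))


if __name__ == '__main__':
    enqueue_crawl_task()

In this setup, scheduled_crawler.py would presumably enqueue tasks like this on its schedule, while worker.py drains the queue in the same container, both started by start.sh.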