"""
|
||
Web crawler service to extract full article content from RSS feed links
|
||
"""
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
from datetime import datetime
|
||
from pymongo import MongoClient
|
||
from pymongo.errors import DuplicateKeyError
|
||
import feedparser
|
||
import time
|
||
import os
|
||
from dotenv import load_dotenv
|
||
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||
from config import Config
|
||
from ollama_client import OllamaClient
|
||
|
||
# Load environment variables
|
||
load_dotenv(dotenv_path='../.env')
|
||
|
||
# MongoDB setup
|
||
client = MongoClient(Config.MONGODB_URI)
|
||
db = client[Config.DB_NAME]
|
||
|
||
articles_collection = db['articles']
|
||
rss_feeds_collection = db['rss_feeds']
|
||
|
||
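
# NOTE: crawl_rss_feed() below upserts articles keyed on 'link' and catches
# DuplicateKeyError, which assumes a unique index on that field. Hypothetical
# one-time setup (not performed by this module):
# articles_collection.create_index('link', unique=True)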

# Initialize Ollama client
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
)

# Print configuration on startup
if __name__ != '__main__':
    Config.print_config()
    if Config.OLLAMA_ENABLED:
        print("🤖 Ollama AI summarization: ENABLED")
        if ollama_client.is_available():
            print("✓ Ollama server is reachable")
        else:
            print("⚠ Warning: Ollama server is not reachable")
    else:
        print("ℹ Ollama AI summarization: DISABLED")


def get_active_rss_feeds():
    """Get all active RSS feeds from database"""
    feeds = []
    cursor = rss_feeds_collection.find({'active': True})
    for feed in cursor:
        feeds.append({
            'id': str(feed['_id']),
            'name': feed.get('name', ''),
            'url': feed.get('url', ''),
            'category': feed.get('category', 'general')
        })
    return feeds
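

# Illustrative shape of an rss_feeds document read by get_active_rss_feeds()
# (field names are the ones looked up above; values are examples only):
# {'name': 'Example Feed', 'url': 'https://example.com/feed.xml',
#  'category': 'tech', 'active': True}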


def extract_article_content(url, timeout=10):
    """
    Extract main article content from a URL with smart detection
    Returns: dict with title, content, author, date, and metadata
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script and style elements
        for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
            script.decompose()

        # === EXTRACT TITLE ===
        title = extract_title(soup)

        # === EXTRACT AUTHOR ===
        author = extract_author(soup)

        # === EXTRACT PUBLISHED DATE ===
        published_date = extract_date(soup)

        # === EXTRACT MAIN CONTENT ===
        content_text = extract_main_content(soup)

        # === EXTRACT META DESCRIPTION ===
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if not meta_desc:
            meta_desc = soup.find('meta', attrs={'property': 'og:description'})
        description = meta_desc.get('content', '') if meta_desc else ''

        return {
            'title': title,
            'author': author,
            'content': content_text,  # Full content, no limit
            'description': description,
            'published_date': published_date,
            'word_count': len(content_text.split()) if content_text else 0,
            'crawled_at': datetime.utcnow()
        }

    except requests.exceptions.Timeout:
        print(f"Timeout crawling {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error crawling {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error crawling {url}: {e}")
        return None
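

# Illustrative shape of the dict returned by extract_article_content() on success
# (values are examples only; None is returned on timeout or any request/parse error):
# {
#     'title': 'Example headline',
#     'author': 'Jane Doe',
#     'content': 'First paragraph...\n\nSecond paragraph...',
#     'description': 'Meta description text',
#     'published_date': '2024-01-01T08:00:00Z',
#     'word_count': 512,
#     'crawled_at': datetime.utcnow(),
# }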


def extract_title(soup):
    """
    Extract article title using multiple strategies
    """
    # Strategy 1: Look for h1 tag
    h1 = soup.find('h1')
    if h1:
        title = h1.get_text().strip()
        if title and len(title) > 10:  # Reasonable title length
            return title

    # Strategy 2: Look for meta og:title
    og_title = soup.find('meta', attrs={'property': 'og:title'})
    if og_title and og_title.get('content'):
        return og_title.get('content').strip()

    # Strategy 3: Look for meta twitter:title
    twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
    if twitter_title and twitter_title.get('content'):
        return twitter_title.get('content').strip()

    # Strategy 4: Look for title tag (fallback)
    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.get_text().strip()
        # Clean up common patterns like "Site Name | Article Title"
        if ' | ' in title:
            title = title.split(' | ')[0]
        elif ' - ' in title:
            title = title.split(' - ')[0]
        return title

    return None


def extract_author(soup):
    """
    Extract article author using multiple strategies
    """
    # Strategy 1: Look for meta author
    meta_author = soup.find('meta', attrs={'name': 'author'})
    if meta_author and meta_author.get('content'):
        return meta_author.get('content').strip()

    # Strategy 2: Look for rel="author"
    rel_author = soup.find('a', attrs={'rel': 'author'})
    if rel_author:
        return rel_author.get_text().strip()

    # Strategy 3: Look for common author class names
    author_selectors = [
        '[class*="author-name"]',
        '[class*="author"]',
        '[class*="byline"]',
        '[class*="writer"]',
        '[rel="author"]',
        '[itemprop="author"]'
    ]

    for selector in author_selectors:
        author_elem = soup.select_one(selector)
        if author_elem:
            author = author_elem.get_text().strip()
            # Clean up common patterns
            author = author.replace('By ', '').replace('by ', '').strip()
            if author and len(author) < 100:  # Reasonable author name length
                return author

    # Strategy 4: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict) and data.get('author'):
                author_data = data.get('author')
                if isinstance(author_data, dict):
                    return author_data.get('name', '')
                elif isinstance(author_data, str):
                    return author_data
        except (ValueError, TypeError):
            # Malformed JSON-LD or missing script content
            pass

    return None


def extract_date(soup):
    """
    Extract published date using multiple strategies
    """
    # Strategy 1: Look for time tag with datetime attribute
    time_tag = soup.find('time')
    if time_tag and time_tag.get('datetime'):
        return time_tag.get('datetime')

    # Strategy 2: Look for meta article:published_time
    meta_published = soup.find('meta', attrs={'property': 'article:published_time'})
    if meta_published and meta_published.get('content'):
        return meta_published.get('content')

    # Strategy 3: Look for meta og:published_time
    og_published = soup.find('meta', attrs={'property': 'og:published_time'})
    if og_published and og_published.get('content'):
        return og_published.get('content')

    # Strategy 4: Look for common date class names
    date_selectors = [
        '[class*="publish-date"]',
        '[class*="published"]',
        '[class*="date"]',
        '[class*="timestamp"]',
        '[itemprop="datePublished"]'
    ]

    for selector in date_selectors:
        date_elem = soup.select_one(selector)
        if date_elem:
            # Try datetime attribute first
            if date_elem.get('datetime'):
                return date_elem.get('datetime')
            # Otherwise get text
            date_text = date_elem.get_text().strip()
            if date_text and len(date_text) < 50:
                return date_text

    # Strategy 5: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict):
                return data.get('datePublished') or data.get('dateCreated')
        except (ValueError, TypeError):
            # Malformed JSON-LD or missing script content
            pass

    return None
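

# Note: some sites emit JSON-LD as a list of objects or wrap it in an '@graph'
# array; the lookups in extract_author() and extract_date() only handle a single
# top-level object, so such pages simply fall back to the other strategies or None.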


def extract_main_content(soup):
    """
    Extract main article content using multiple strategies
    """
    # Strategy 1: Try common article content selectors
    content_selectors = [
        'article',
        '[class*="article-content"]',
        '[class*="article-body"]',
        '[class*="post-content"]',
        '[class*="entry-content"]',
        '[class*="content-body"]',
        '[class*="story-body"]',
        '[itemprop="articleBody"]',
        'main'
    ]

    article_content = None
    for selector in content_selectors:
        element = soup.select_one(selector)
        if element:
            article_content = element
            break

    # Fallback: get body
    if not article_content:
        article_content = soup.find('body')

    if not article_content:
        return ''

    # Extract text from paragraphs
    paragraphs = article_content.find_all('p')

    # Filter out short paragraphs (likely navigation/ads)
    content_paragraphs = []
    for p in paragraphs:
        text = p.get_text().strip()
        # Keep paragraphs with at least 50 characters
        if len(text) >= 50:
            content_paragraphs.append(text)

    content_text = '\n\n'.join(content_paragraphs)

    return content_text


def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10):
    """
    Crawl articles from an RSS feed
    Returns: dict with statistics
    """
    print(f"\n📰 Crawling feed: {feed_name}")
    print(f" URL: {feed_url}")

    try:
        # Parse RSS feed
        feed = feedparser.parse(feed_url)

        if not feed.entries:
            print(" ⚠ No entries found in feed")
            # Return a zero-count stats dict (not a bare 0) so callers that index
            # into the result, e.g. crawl_all_feeds(), don't raise a TypeError
            return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}

        crawled_count = 0
        summarized_count = 0
        failed_summaries = 0

        for entry in feed.entries[:max_articles]:
            # Extract article URL using utility function
            article_url = extract_article_url(entry)

            if not article_url:
                print(f" ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}")
                continue

            # Check if article already exists and has content
            existing = articles_collection.find_one({'link': article_url})
            if existing and existing.get('content'):
                print(f" ⏭ Skipping (already crawled): {entry.get('title', 'No title')[:50]}")
                continue

            print(f" 🔍 Crawling: {entry.get('title', 'No title')[:50]}...")

            # Extract full content
            article_data = extract_article_content(article_url)

            if article_data and article_data.get('content'):
                # Store original title
                original_title = article_data.get('title') or entry.get('title', '')

                # Translate title with Ollama if enabled
                translation_result = None
                if Config.OLLAMA_ENABLED and original_title:
                    print(" 🌐 Translating title...")
                    translation_result = ollama_client.translate_title(original_title)

                    if translation_result and translation_result['success']:
                        print(f" ✓ Title translated ({translation_result['duration']:.1f}s)")
                    else:
                        error_msg = translation_result['error'] if translation_result else 'Unknown error'
                        print(f" ⚠ Translation failed: {error_msg}")

                # Summarize with Ollama if enabled
                summary_result = None
                if Config.OLLAMA_ENABLED and article_data.get('content'):
                    print(" 🤖 Summarizing with AI...")
                    summary_result = ollama_client.summarize_article(
                        article_data['content'],
                        max_words=Config.SUMMARY_MAX_WORDS
                    )

                    if summary_result['success']:
                        print(f" ✓ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)")
                        summarized_count += 1
                    else:
                        print(f" ⚠ Summarization failed: {summary_result['error']}")
                        failed_summaries += 1

                # Prepare document
                article_doc = {
                    'title': original_title,
                    'title_en': translation_result['translated_title'] if translation_result and translation_result['success'] else None,
                    'author': article_data.get('author'),
                    'link': article_url,
                    'content': article_data.get('content', ''),  # Full article content
                    'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
                    'word_count': article_data.get('word_count', 0),
                    'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
                    'source': feed_name,
                    'category': feed_category,
                    'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
                    'crawled_at': article_data.get('crawled_at'),
                    'translated_at': datetime.utcnow() if translation_result and translation_result['success'] else None,
                    'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
                    'created_at': datetime.utcnow()
                }

                try:
                    # Upsert: update if exists, insert if not
                    articles_collection.update_one(
                        {'link': article_url},
                        {'$set': article_doc},
                        upsert=True
                    )
                    crawled_count += 1
                    print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")

                except DuplicateKeyError:
                    print(" ⚠ Duplicate key error")
                except Exception as e:
                    print(f" ✗ Error saving: {e}")
            else:
                print(" ✗ Failed to extract content")

            # Be nice to servers - add delay
            time.sleep(1)

        print(f" ✓ Crawled {crawled_count} articles from {feed_name}")
        if Config.OLLAMA_ENABLED:
            print(f" 🤖 Summarized: {summarized_count}, Failed: {failed_summaries}")

        return {
            'crawled': crawled_count,
            'summarized': summarized_count,
            'failed_summaries': failed_summaries
        }

    except Exception as e:
        print(f" ✗ Error processing feed {feed_name}: {e}")
        # Keep the return type consistent with the success path (a stats dict),
        # so crawl_all_feeds() can aggregate results without special-casing errors
        return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}
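

# Illustrative stats dict returned by crawl_rss_feed() (values are examples only);
# the no-entries and error paths return the same keys with zero counts so that
# crawl_all_feeds() can aggregate without type checks:
# {'crawled': 3, 'summarized': 2, 'failed_summaries': 1}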


def crawl_all_feeds(max_articles_per_feed=10):
    """
    Crawl all active RSS feeds
    Returns: dict with statistics
    """
    print("\n" + "="*60)
    print("🚀 Starting RSS Feed Crawler")
    print("="*60)

    start_time = time.time()
    feeds = get_active_rss_feeds()

    if not feeds:
        print("⚠ No active RSS feeds found")
        return {
            'total_feeds': 0,
            'total_articles_crawled': 0,
            'total_summarized': 0,
            'failed_summaries': 0,
            'duration_seconds': 0
        }

print(f"Found {len(feeds)} active feed(s)")
|
||
if Config.OLLAMA_ENABLED:
|
||
print(f"🤖 AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)")
|
||
|
||
total_crawled = 0
|
||
total_summarized = 0
|
||
total_failed = 0
|
||
|
||
for feed in feeds:
|
||
result = crawl_rss_feed(
|
||
feed['url'],
|
||
feed['name'],
|
||
feed['category'],
|
||
max_articles=max_articles_per_feed
|
||
)
|
||
total_crawled += result['crawled']
|
||
total_summarized += result['summarized']
|
||
total_failed += result['failed_summaries']
|
||
|
||
duration = time.time() - start_time
|
||
|
||
print("\n" + "="*60)
|
||
print(f"✓ Crawling Complete!")
|
||
print(f" Total feeds processed: {len(feeds)}")
|
||
print(f" Total articles crawled: {total_crawled}")
|
||
if Config.OLLAMA_ENABLED:
|
||
print(f" Total articles summarized: {total_summarized}")
|
||
print(f" Failed summarizations: {total_failed}")
|
||
if total_summarized > 0:
|
||
success_rate = (total_summarized / (total_summarized + total_failed)) * 100
|
||
print(f" Success rate: {success_rate:.1f}%")
|
||
print(f" Duration: {duration:.2f} seconds")
|
||
if total_crawled > 0:
|
||
print(f" Average time per article: {duration/total_crawled:.1f}s")
|
||
print("="*60 + "\n")
|
||
|
||
return {
|
||
'total_feeds': len(feeds),
|
||
'total_articles_crawled': total_crawled,
|
||
'total_summarized': total_summarized,
|
||
'failed_summaries': total_failed,
|
||
'duration_seconds': round(duration, 2)
|
||
}
|
||
|
||
|
||
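

# Standalone usage: python crawler_service.py [max_articles_per_feed]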


if __name__ == '__main__':
    # Can be run standalone for testing
    import sys
    max_articles = 10

    if len(sys.argv) > 1:
        try:
            max_articles = int(sys.argv[1])
        except ValueError:
            print("Usage: python crawler_service.py [max_articles_per_feed]")
            sys.exit(1)

    crawl_all_feeds(max_articles_per_feed=max_articles)