""" Web crawler service to extract full article content from RSS feed links """ import requests from bs4 import BeautifulSoup from datetime import datetime from pymongo import MongoClient from pymongo.errors import DuplicateKeyError import feedparser import time import os from dotenv import load_dotenv from rss_utils import extract_article_url, extract_article_summary, extract_published_date from config import Config from ollama_client import OllamaClient # Load environment variables load_dotenv(dotenv_path='../.env') # MongoDB setup client = MongoClient(Config.MONGODB_URI) db = client[Config.DB_NAME] articles_collection = db['articles'] rss_feeds_collection = db['rss_feeds'] # Initialize Ollama client ollama_client = OllamaClient( base_url=Config.OLLAMA_BASE_URL, model=Config.OLLAMA_MODEL, api_key=Config.OLLAMA_API_KEY, enabled=Config.OLLAMA_ENABLED, timeout=Config.OLLAMA_TIMEOUT ) # Print configuration on startup if __name__ != '__main__': Config.print_config() if Config.OLLAMA_ENABLED: print("šŸ¤– Ollama AI summarization: ENABLED") if ollama_client.is_available(): print("āœ“ Ollama server is reachable") else: print("⚠ Warning: Ollama server is not reachable") else: print("ℹ Ollama AI summarization: DISABLED") def get_active_rss_feeds(): """Get all active RSS feeds from database""" feeds = [] cursor = rss_feeds_collection.find({'active': True}) for feed in cursor: feeds.append({ 'id': str(feed['_id']), 'name': feed.get('name', ''), 'url': feed.get('url', ''), 'category': feed.get('category', 'general') }) return feeds def extract_article_content(url, timeout=10): """ Extract main article content from a URL with smart detection Returns: dict with title, content, author, date, and metadata """ try: headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' } response = requests.get(url, headers=headers, timeout=timeout) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') # Remove script and style elements for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']): script.decompose() # === EXTRACT TITLE === title = extract_title(soup) # === EXTRACT AUTHOR === author = extract_author(soup) # === EXTRACT PUBLISHED DATE === published_date = extract_date(soup) # === EXTRACT MAIN CONTENT === content_text = extract_main_content(soup) # === EXTRACT META DESCRIPTION === meta_desc = soup.find('meta', attrs={'name': 'description'}) if not meta_desc: meta_desc = soup.find('meta', attrs={'property': 'og:description'}) description = meta_desc.get('content', '') if meta_desc else '' return { 'title': title, 'author': author, 'content': content_text, # Full content, no limit 'description': description, 'published_date': published_date, 'word_count': len(content_text.split()) if content_text else 0, 'crawled_at': datetime.utcnow() } except requests.exceptions.Timeout: print(f"Timeout crawling {url}") return None except requests.exceptions.RequestException as e: print(f"Error crawling {url}: {e}") return None except Exception as e: print(f"Unexpected error crawling {url}: {e}") return None def extract_title(soup): """ Extract article title using multiple strategies """ # Strategy 1: Look for h1 tag h1 = soup.find('h1') if h1: title = h1.get_text().strip() if title and len(title) > 10: # Reasonable title length return title # Strategy 2: Look for meta og:title og_title = soup.find('meta', attrs={'property': 'og:title'}) if og_title and og_title.get('content'): return og_title.get('content').strip() # Strategy 3: Look for meta twitter:title twitter_title = soup.find('meta', attrs={'name': 'twitter:title'}) if twitter_title and twitter_title.get('content'): return twitter_title.get('content').strip() # Strategy 4: Look for title tag (fallback) title_tag = soup.find('title') if title_tag: title = title_tag.get_text().strip() # Clean up common patterns like "Site Name | Article Title" if ' | ' in title: title = title.split(' | ')[0] elif ' - ' in title: title = title.split(' - ')[0] return title return None def extract_author(soup): """ Extract article author using multiple strategies """ # Strategy 1: Look for meta author meta_author = soup.find('meta', attrs={'name': 'author'}) if meta_author and meta_author.get('content'): return meta_author.get('content').strip() # Strategy 2: Look for rel="author" rel_author = soup.find('a', attrs={'rel': 'author'}) if rel_author: return rel_author.get_text().strip() # Strategy 3: Look for common author class names author_selectors = [ '[class*="author-name"]', '[class*="author"]', '[class*="byline"]', '[class*="writer"]', '[rel="author"]', '[itemprop="author"]' ] for selector in author_selectors: author_elem = soup.select_one(selector) if author_elem: author = author_elem.get_text().strip() # Clean up common patterns author = author.replace('By ', '').replace('by ', '').strip() if author and len(author) < 100: # Reasonable author name length return author # Strategy 4: Look for JSON-LD structured data json_ld = soup.find('script', attrs={'type': 'application/ld+json'}) if json_ld: try: import json data = json.loads(json_ld.string) if isinstance(data, dict) and data.get('author'): author_data = data.get('author') if isinstance(author_data, dict): return author_data.get('name', '') elif isinstance(author_data, str): return author_data except: pass return None def extract_date(soup): """ Extract published date using multiple strategies """ # Strategy 1: Look for time tag with datetime attribute time_tag = soup.find('time') if time_tag and time_tag.get('datetime'): return time_tag.get('datetime') # Strategy 2: Look for meta article:published_time meta_published = soup.find('meta', attrs={'property': 'article:published_time'}) if meta_published and meta_published.get('content'): return meta_published.get('content') # Strategy 3: Look for meta og:published_time og_published = soup.find('meta', attrs={'property': 'og:published_time'}) if og_published and og_published.get('content'): return og_published.get('content') # Strategy 4: Look for common date class names date_selectors = [ '[class*="publish-date"]', '[class*="published"]', '[class*="date"]', '[class*="timestamp"]', '[itemprop="datePublished"]' ] for selector in date_selectors: date_elem = soup.select_one(selector) if date_elem: # Try datetime attribute first if date_elem.get('datetime'): return date_elem.get('datetime') # Otherwise get text date_text = date_elem.get_text().strip() if date_text and len(date_text) < 50: return date_text # Strategy 5: Look for JSON-LD structured data json_ld = soup.find('script', attrs={'type': 'application/ld+json'}) if json_ld: try: import json data = json.loads(json_ld.string) if isinstance(data, dict): return data.get('datePublished') or data.get('dateCreated') except: pass return None def extract_main_content(soup): """ Extract main article content using multiple strategies """ # Strategy 1: Try common article content selectors content_selectors = [ 'article', '[class*="article-content"]', '[class*="article-body"]', '[class*="post-content"]', '[class*="entry-content"]', '[class*="content-body"]', '[class*="story-body"]', '[itemprop="articleBody"]', 'main' ] article_content = None for selector in content_selectors: element = soup.select_one(selector) if element: article_content = element break # Fallback: get body if not article_content: article_content = soup.find('body') if not article_content: return '' # Extract text from paragraphs paragraphs = article_content.find_all('p') # Filter out short paragraphs (likely navigation/ads) content_paragraphs = [] for p in paragraphs: text = p.get_text().strip() # Keep paragraphs with at least 50 characters if len(text) >= 50: content_paragraphs.append(text) content_text = '\n\n'.join(content_paragraphs) return content_text def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10): """ Crawl articles from an RSS feed Returns: dict with statistics """ print(f"\nšŸ“° Crawling feed: {feed_name}") print(f" URL: {feed_url}") try: # Parse RSS feed feed = feedparser.parse(feed_url) if not feed.entries: print(f" ⚠ No entries found in feed") return 0 crawled_count = 0 summarized_count = 0 failed_summaries = 0 for entry in feed.entries[:max_articles]: # Extract article URL using utility function article_url = extract_article_url(entry) if not article_url: print(f" ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}") continue # Check if article already exists and has content existing = articles_collection.find_one({'link': article_url}) if existing and existing.get('content'): print(f" ā­ Skipping (already crawled): {entry.get('title', 'No title')[:50]}") continue print(f" šŸ” Crawling: {entry.get('title', 'No title')[:50]}...") # Extract full content article_data = extract_article_content(article_url) if article_data and article_data.get('content'): # Summarize with Ollama if enabled summary_result = None if Config.OLLAMA_ENABLED and article_data.get('content'): print(f" šŸ¤– Summarizing with AI...") summary_result = ollama_client.summarize_article( article_data['content'], max_words=Config.SUMMARY_MAX_WORDS ) if summary_result['success']: print(f" āœ“ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)") summarized_count += 1 else: print(f" ⚠ Summarization failed: {summary_result['error']}") failed_summaries += 1 # Prepare document article_doc = { 'title': article_data.get('title') or entry.get('title', ''), 'author': article_data.get('author'), 'link': article_url, 'content': article_data.get('content', ''), # Full article content 'summary': summary_result['summary'] if summary_result and summary_result['success'] else None, 'word_count': article_data.get('word_count', 0), 'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None, 'source': feed_name, 'category': feed_category, 'published_at': extract_published_date(entry) or article_data.get('published_date', ''), 'crawled_at': article_data.get('crawled_at'), 'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None, 'created_at': datetime.utcnow() } try: # Upsert: update if exists, insert if not articles_collection.update_one( {'link': article_url}, {'$set': article_doc}, upsert=True ) crawled_count += 1 print(f" āœ“ Saved ({article_data.get('word_count', 0)} words)") except DuplicateKeyError: print(f" ⚠ Duplicate key error") except Exception as e: print(f" āœ— Error saving: {e}") else: print(f" āœ— Failed to extract content") # Be nice to servers - add delay time.sleep(1) print(f" āœ“ Crawled {crawled_count} articles from {feed_name}") if Config.OLLAMA_ENABLED: print(f" šŸ¤– Summarized: {summarized_count}, Failed: {failed_summaries}") return { 'crawled': crawled_count, 'summarized': summarized_count, 'failed_summaries': failed_summaries } except Exception as e: print(f" āœ— Error processing feed {feed_name}: {e}") return 0 def crawl_all_feeds(max_articles_per_feed=10): """ Crawl all active RSS feeds Returns: dict with statistics """ print("\n" + "="*60) print("šŸš€ Starting RSS Feed Crawler") print("="*60) start_time = time.time() feeds = get_active_rss_feeds() if not feeds: print("⚠ No active RSS feeds found") return { 'total_feeds': 0, 'total_articles_crawled': 0, 'duration_seconds': 0 } print(f"Found {len(feeds)} active feed(s)") if Config.OLLAMA_ENABLED: print(f"šŸ¤– AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)") total_crawled = 0 total_summarized = 0 total_failed = 0 for feed in feeds: result = crawl_rss_feed( feed['url'], feed['name'], feed['category'], max_articles=max_articles_per_feed ) total_crawled += result['crawled'] total_summarized += result['summarized'] total_failed += result['failed_summaries'] duration = time.time() - start_time print("\n" + "="*60) print(f"āœ“ Crawling Complete!") print(f" Total feeds processed: {len(feeds)}") print(f" Total articles crawled: {total_crawled}") if Config.OLLAMA_ENABLED: print(f" Total articles summarized: {total_summarized}") print(f" Failed summarizations: {total_failed}") if total_summarized > 0: success_rate = (total_summarized / (total_summarized + total_failed)) * 100 print(f" Success rate: {success_rate:.1f}%") print(f" Duration: {duration:.2f} seconds") if total_crawled > 0: print(f" Average time per article: {duration/total_crawled:.1f}s") print("="*60 + "\n") return { 'total_feeds': len(feeds), 'total_articles_crawled': total_crawled, 'total_summarized': total_summarized, 'failed_summaries': total_failed, 'duration_seconds': round(duration, 2) } if __name__ == '__main__': # Can be run standalone for testing import sys max_articles = 10 if len(sys.argv) > 1: try: max_articles = int(sys.argv[1]) except ValueError: print("Usage: python crawler_service.py [max_articles_per_feed]") sys.exit(1) crawl_all_feeds(max_articles_per_feed=max_articles)