# Munich-news/news_crawler/crawler_service.py
"""
Web crawler service to extract full article content from RSS feed links
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import feedparser
import time
import os
from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
from config import Config
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import create_cluster_summaries
# Load environment variables
load_dotenv(dotenv_path='../.env')
# MongoDB setup
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
rss_feeds_collection = db['rss_feeds']
# Initialize Ollama client
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
)
# Initialize Article Clusterer (will be initialized after ollama_client)
article_clusterer = None
# Print configuration on startup
if __name__ != '__main__':
    Config.print_config()
    if Config.OLLAMA_ENABLED:
        print("🤖 Ollama AI summarization: ENABLED")
        if ollama_client.is_available():
            print("✓ Ollama server is reachable")
        else:
            print("⚠ Warning: Ollama server is not reachable")
    else:
        print(" Ollama AI summarization: DISABLED")
    # Initialize Article Clusterer with ollama_client
    article_clusterer = ArticleClusterer(
        ollama_client=ollama_client,
        similarity_threshold=0.60,  # Not used when AI is enabled
        time_window_hours=24        # Look back 24 hours
    )
    print("🔗 Article clustering: ENABLED (AI-powered)")

def get_active_rss_feeds():
    """Get all active RSS feeds from database"""
    feeds = []
    cursor = rss_feeds_collection.find({'active': True})
    for feed in cursor:
        feeds.append({
            'id': str(feed['_id']),
            'name': feed.get('name', ''),
            'url': feed.get('url', ''),
            'category': feed.get('category', 'general')
        })
    return feeds
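
# Illustrative shape of an rss_feeds document as read above (values are made-up examples;
# only the fields referenced in get_active_rss_feeds() are assumed to exist):
#   {'_id': ObjectId('...'), 'name': 'Süddeutsche Zeitung München',
#    'url': 'https://example.com/feed.xml', 'category': 'local', 'active': True}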

def extract_article_content(url, timeout=10):
    """
    Extract main article content from a URL with smart detection
    Returns: dict with title, content, author, date, and metadata
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove script and style elements
        for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
            script.decompose()
        # === EXTRACT TITLE ===
        title = extract_title(soup)
        # === EXTRACT AUTHOR ===
        author = extract_author(soup)
        # === EXTRACT PUBLISHED DATE ===
        published_date = extract_date(soup)
        # === EXTRACT MAIN CONTENT ===
        content_text = extract_main_content(soup)
        # === EXTRACT META DESCRIPTION ===
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if not meta_desc:
            meta_desc = soup.find('meta', attrs={'property': 'og:description'})
        description = meta_desc.get('content', '') if meta_desc else ''
        return {
            'title': title,
            'author': author,
            'content': content_text,  # Full content, no limit
            'description': description,
            'published_date': published_date,
            'word_count': len(content_text.split()) if content_text else 0,
            'crawled_at': datetime.utcnow()
        }
    except requests.exceptions.Timeout:
        print(f"Timeout crawling {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error crawling {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error crawling {url}: {e}")
        return None

def extract_title(soup):
    """
    Extract article title using multiple strategies
    """
    # Strategy 1: Look for h1 tag
    h1 = soup.find('h1')
    if h1:
        title = h1.get_text().strip()
        if title and len(title) > 10:  # Reasonable title length
            return title
    # Strategy 2: Look for meta og:title
    og_title = soup.find('meta', attrs={'property': 'og:title'})
    if og_title and og_title.get('content'):
        return og_title.get('content').strip()
    # Strategy 3: Look for meta twitter:title
    twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
    if twitter_title and twitter_title.get('content'):
        return twitter_title.get('content').strip()
    # Strategy 4: Look for title tag (fallback)
    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.get_text().strip()
        # Clean up common patterns like "Article Title | Site Name" (keep the first part)
        if ' | ' in title:
            title = title.split(' | ')[0]
        elif ' - ' in title:
            title = title.split(' - ')[0]
        return title
    return None
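
# Illustrative behaviour (assumed markup): for a page containing
#   <h1>Neue U-Bahn-Linie für München vorgestellt</h1>
# Strategy 1 returns the headline directly, since it is longer than 10 characters.
# A page with only <title>Artikel | Beispielzeitung</title> falls through to Strategy 4
# and returns "Artikel" after splitting on " | ".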

def extract_author(soup):
    """
    Extract article author using multiple strategies
    """
    # Strategy 1: Look for meta author
    meta_author = soup.find('meta', attrs={'name': 'author'})
    if meta_author and meta_author.get('content'):
        return meta_author.get('content').strip()
    # Strategy 2: Look for rel="author"
    rel_author = soup.find('a', attrs={'rel': 'author'})
    if rel_author:
        return rel_author.get_text().strip()
    # Strategy 3: Look for common author class names
    author_selectors = [
        '[class*="author-name"]',
        '[class*="author"]',
        '[class*="byline"]',
        '[class*="writer"]',
        '[rel="author"]',
        '[itemprop="author"]'
    ]
    for selector in author_selectors:
        author_elem = soup.select_one(selector)
        if author_elem:
            author = author_elem.get_text().strip()
            # Clean up common patterns
            author = author.replace('By ', '').replace('by ', '').strip()
            if author and len(author) < 100:  # Reasonable author name length
                return author
    # Strategy 4: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict) and data.get('author'):
                author_data = data.get('author')
                if isinstance(author_data, dict):
                    return author_data.get('name', '')
                elif isinstance(author_data, str):
                    return author_data
        except (ValueError, TypeError):
            pass
    return None
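
# Illustrative JSON-LD payload that Strategy 4 above can read (made-up values):
#   <script type="application/ld+json">
#     {"@type": "NewsArticle",
#      "author": {"@type": "Person", "name": "Max Mustermann"},
#      "datePublished": "2025-11-28T09:30:00+01:00"}
#   </script>
# extract_author() would return "Max Mustermann"; extract_date() below would return the
# datePublished string. Pages that wrap their JSON-LD in a list are skipped by this parser.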

def extract_date(soup):
    """
    Extract published date using multiple strategies
    """
    # Strategy 1: Look for time tag with datetime attribute
    time_tag = soup.find('time')
    if time_tag and time_tag.get('datetime'):
        return time_tag.get('datetime')
    # Strategy 2: Look for meta article:published_time
    meta_published = soup.find('meta', attrs={'property': 'article:published_time'})
    if meta_published and meta_published.get('content'):
        return meta_published.get('content')
    # Strategy 3: Look for meta og:published_time
    og_published = soup.find('meta', attrs={'property': 'og:published_time'})
    if og_published and og_published.get('content'):
        return og_published.get('content')
    # Strategy 4: Look for common date class names
    date_selectors = [
        '[class*="publish-date"]',
        '[class*="published"]',
        '[class*="date"]',
        '[class*="timestamp"]',
        '[itemprop="datePublished"]'
    ]
    for selector in date_selectors:
        date_elem = soup.select_one(selector)
        if date_elem:
            # Try datetime attribute first
            if date_elem.get('datetime'):
                return date_elem.get('datetime')
            # Otherwise get text
            date_text = date_elem.get_text().strip()
            if date_text and len(date_text) < 50:
                return date_text
    # Strategy 5: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict):
                return data.get('datePublished') or data.get('dateCreated')
        except (ValueError, TypeError):
            pass
    return None
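
# Note: extract_date() returns whatever representation the page exposes, e.g. an ISO 8601
# string from <time datetime="2025-11-28T09:30:00+01:00"> or free-form text such as
# "28. November 2025" from a [class*="date"] element; the caller stores the value as-is.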

def extract_main_content(soup):
    """
    Extract main article content using multiple strategies
    """
    # Strategy 1: Try common article content selectors
    content_selectors = [
        'article',
        '[class*="article-content"]',
        '[class*="article-body"]',
        '[class*="post-content"]',
        '[class*="entry-content"]',
        '[class*="content-body"]',
        '[class*="story-body"]',
        '[itemprop="articleBody"]',
        'main'
    ]
    article_content = None
    for selector in content_selectors:
        element = soup.select_one(selector)
        if element:
            article_content = element
            break
    # Fallback: get body
    if not article_content:
        article_content = soup.find('body')
    if not article_content:
        return ''
    # Extract text from paragraphs
    paragraphs = article_content.find_all('p')
    # Filter out short paragraphs (likely navigation/ads)
    content_paragraphs = []
    for p in paragraphs:
        text = p.get_text().strip()
        # Keep paragraphs with at least 50 characters
        if len(text) >= 50:
            content_paragraphs.append(text)
    content_text = '\n\n'.join(content_paragraphs)
    return content_text
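
# Illustrative filtering (assumed markup): given
#   <article><p>Zum Inhalt springen</p><p>Die Stadt München hat am Donnerstag ... (long paragraph)</p></article>
# only the second <p> survives the 50-character minimum, so navigation snippets and
# cookie-banner fragments are dropped while real body paragraphs are joined with blank lines.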

def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10):
    """
    Crawl articles from an RSS feed
    Returns: dict with statistics
    """
    print(f"\n📰 Crawling feed: {feed_name}")
    print(f" URL: {feed_url}")
    try:
        # Parse RSS feed
        feed = feedparser.parse(feed_url)
        if not feed.entries:
            print(f" ⚠ No entries found in feed")
            # Return the same statistics shape as the success path so callers can aggregate
            return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}
        crawled_count = 0
        summarized_count = 0
        failed_summaries = 0
        for entry in feed.entries[:max_articles]:
            # Extract article URL using utility function
            article_url = extract_article_url(entry)
            if not article_url:
                print(f" ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}")
                continue
            # Check if article already exists and has content
            existing = articles_collection.find_one({'link': article_url})
            if existing and existing.get('content'):
                print(f" ⏭ Skipping (already crawled): {entry.get('title', 'No title')[:50]}")
                continue
            print(f" 🔍 Crawling: {entry.get('title', 'No title')[:50]}...")
            # Extract full content
            article_data = extract_article_content(article_url)
            if article_data and article_data.get('content'):
                # Store original title
                original_title = article_data.get('title') or entry.get('title', '')
                # Translate title with Ollama if enabled
                translation_result = None
                if Config.OLLAMA_ENABLED and original_title:
                    print(f" 🌐 Translating title...")
                    translation_result = ollama_client.translate_title(original_title)
                    if translation_result and translation_result['success']:
                        print(f" ✓ Title translated ({translation_result['duration']:.1f}s)")
                    else:
                        error_msg = translation_result['error'] if translation_result else 'Unknown error'
                        print(f" ⚠ Translation failed: {error_msg}")
                # Summarize with Ollama if enabled
                summary_result = None
                if Config.OLLAMA_ENABLED and article_data.get('content'):
                    print(f" 🤖 Summarizing with AI...")
                    summary_result = ollama_client.summarize_article(
                        article_data['content'],
                        max_words=Config.SUMMARY_MAX_WORDS
                    )
                    if summary_result['success']:
                        print(f" ✓ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)")
                        summarized_count += 1
                    else:
                        print(f" ⚠ Summarization failed: {summary_result['error']}")
                        failed_summaries += 1
                # Extract keywords for personalization
                keywords_result = None
                if Config.OLLAMA_ENABLED and summary_result and summary_result['success']:
                    print(f" 🔑 Extracting keywords...")
                    keywords_result = ollama_client.extract_keywords(
                        original_title,
                        summary_result['summary'],
                        max_keywords=5
                    )
                    if keywords_result['success']:
                        print(f" ✓ Keywords: {', '.join(keywords_result['keywords'])} ({keywords_result['duration']:.1f}s)")
                    else:
                        print(f" ⚠ Keyword extraction failed: {keywords_result['error']}")
                # Prepare document
                article_doc = {
                    'title': original_title,
                    'title_en': translation_result['translated_title'] if translation_result and translation_result['success'] else None,
                    'author': article_data.get('author'),
                    'link': article_url,
                    'content': article_data.get('content', ''),  # Full article content
                    'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
                    'keywords': keywords_result['keywords'] if keywords_result and keywords_result['success'] else [],
                    'word_count': article_data.get('word_count', 0),
                    'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
                    'source': feed_name,
                    'category': feed_category,
                    'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
                    'crawled_at': article_data.get('crawled_at'),
                    'translated_at': datetime.utcnow() if translation_result and translation_result['success'] else None,
                    'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
                    'created_at': datetime.utcnow()
                }
                # Cluster article with existing articles (detect duplicates from other sources)
                from datetime import timedelta
                recent_articles = list(articles_collection.find({
                    'published_at': {'$gte': datetime.utcnow() - timedelta(hours=24)}
                }))
                article_doc = article_clusterer.cluster_article(article_doc, recent_articles)
                try:
                    # Upsert: update if exists, insert if not
                    articles_collection.update_one(
                        {'link': article_url},
                        {'$set': article_doc},
                        upsert=True
                    )
                    crawled_count += 1
                    print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
                except DuplicateKeyError:
                    print(f" ⚠ Duplicate key error")
                except Exception as e:
                    print(f" ✗ Error saving: {e}")
            else:
                print(f" ✗ Failed to extract content")
            # Be nice to servers - add delay
            time.sleep(1)
        print(f" ✓ Crawled {crawled_count} articles from {feed_name}")
        if Config.OLLAMA_ENABLED:
            print(f" 🤖 Summarized: {summarized_count}, Failed: {failed_summaries}")
        return {
            'crawled': crawled_count,
            'summarized': summarized_count,
            'failed_summaries': failed_summaries
        }
    except Exception as e:
        print(f" ✗ Error processing feed {feed_name}: {e}")
        return {
            'crawled': 0,
            'summarized': 0,
            'failed_summaries': 0
        }
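
# Illustrative per-feed statistics returned above (numbers are made up):
#   {'crawled': 7, 'summarized': 6, 'failed_summaries': 1}
# crawl_all_feeds() below sums these counters across every active feed.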

def crawl_all_feeds(max_articles_per_feed=10):
    """
    Crawl all active RSS feeds
    Returns: dict with statistics
    """
    global article_clusterer
    # Initialize clusterer if not already done
    if article_clusterer is None:
        article_clusterer = ArticleClusterer(
            ollama_client=ollama_client,
            similarity_threshold=0.60,
            time_window_hours=24
        )
    print("\n" + "="*60)
    print("🚀 Starting RSS Feed Crawler")
    print("="*60)
    start_time = time.time()
    feeds = get_active_rss_feeds()
    if not feeds:
        print("⚠ No active RSS feeds found")
        return {
            'total_feeds': 0,
            'total_articles_crawled': 0,
            'duration_seconds': 0
        }
    print(f"Found {len(feeds)} active feed(s)")
    if Config.OLLAMA_ENABLED:
        print(f"🤖 AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)")
    total_crawled = 0
    total_summarized = 0
    total_failed = 0
    for feed in feeds:
        result = crawl_rss_feed(
            feed['url'],
            feed['name'],
            feed['category'],
            max_articles=max_articles_per_feed
        )
        total_crawled += result['crawled']
        total_summarized += result['summarized']
        total_failed += result['failed_summaries']
    duration = time.time() - start_time
    print("\n" + "="*60)
    print(f"✓ Crawling Complete!")
    print(f" Total feeds processed: {len(feeds)}")
    print(f" Total articles crawled: {total_crawled}")
    if Config.OLLAMA_ENABLED:
        print(f" Total articles summarized: {total_summarized}")
        print(f" Failed summarizations: {total_failed}")
        if total_summarized > 0:
            success_rate = (total_summarized / (total_summarized + total_failed)) * 100
            print(f" Success rate: {success_rate:.1f}%")
    print(f" Duration: {duration:.2f} seconds")
    if total_crawled > 0:
        print(f" Average time per article: {duration/total_crawled:.1f}s")
    print("="*60 + "\n")
    # Generate neutral summaries for clustered articles
    cluster_summary_stats = {'processed': 0, 'succeeded': 0, 'failed': 0}
    if Config.OLLAMA_ENABLED and total_crawled > 0:
        print("\n" + "="*60)
        print("🔄 Generating Neutral Summaries for Clustered Articles")
        print("="*60)
        cluster_summary_stats = create_cluster_summaries(db, ollama_client)
        print("\n" + "="*60)
        print(f"✓ Cluster Summarization Complete!")
        print(f" Clusters processed: {cluster_summary_stats['processed']}")
        print(f" Succeeded: {cluster_summary_stats['succeeded']}")
        print(f" Failed: {cluster_summary_stats['failed']}")
        print("="*60 + "\n")
    return {
        'total_feeds': len(feeds),
        'total_articles_crawled': total_crawled,
        'total_summarized': total_summarized,
        'failed_summaries': total_failed,
        'duration_seconds': round(duration, 2),
        'cluster_summaries': cluster_summary_stats
    }
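
# Illustrative value returned by crawl_all_feeds() (numbers are made up):
#   {'total_feeds': 3, 'total_articles_crawled': 18, 'total_summarized': 16,
#    'failed_summaries': 2, 'duration_seconds': 412.57,
#    'cluster_summaries': {'processed': 4, 'succeeded': 4, 'failed': 0}}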

if __name__ == '__main__':
    # Can be run standalone for testing
    import sys
    max_articles = 10
    if len(sys.argv) > 1:
        try:
            max_articles = int(sys.argv[1])
        except ValueError:
            print("Usage: python crawler_service.py [max_articles_per_feed]")
            sys.exit(1)
    crawl_all_feeds(max_articles_per_feed=max_articles)
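
# Example invocations (assuming MongoDB and, if enabled, the Ollama server are reachable):
#   python crawler_service.py        # crawl up to 10 articles per feed (default)
#   python crawler_service.py 3      # crawl at most 3 articles per feed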