# Munich-news/news_crawler/crawler_service.py
"""
Web crawler service to extract full article content from RSS feed links
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
import feedparser
import time
import os
from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
from config import Config
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from cluster_summarizer import create_cluster_summaries
# Load environment variables
load_dotenv(dotenv_path='../.env')
# MongoDB setup
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]
articles_collection = db['articles']
rss_feeds_collection = db['rss_feeds']
# Initialize Ollama client
ollama_client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
)
# Initialize Article Clusterer (will be initialized after ollama_client)
article_clusterer = None
# Print configuration on startup
if __name__ != '__main__':
    Config.print_config()
    if Config.OLLAMA_ENABLED:
        print("🤖 Ollama AI summarization: ENABLED")
        if ollama_client.is_available():
            print("✓ Ollama server is reachable")
        else:
            print("⚠ Warning: Ollama server is not reachable")
    else:
        print(" Ollama AI summarization: DISABLED")
    # Initialize Article Clusterer with ollama_client
    article_clusterer = ArticleClusterer(
        ollama_client=ollama_client,
        similarity_threshold=0.60,  # Not used when AI is enabled
        time_window_hours=24        # Look back 24 hours
    )
    print("🔗 Article clustering: ENABLED (AI-powered)")

def get_active_rss_feeds():
    """Get all active RSS feeds from database"""
    feeds = []
    cursor = rss_feeds_collection.find({'active': True})
    for feed in cursor:
        feeds.append({
            'id': str(feed['_id']),
            'name': feed.get('name', ''),
            'url': feed.get('url', ''),
            'category': feed.get('category', 'general')
        })
    return feeds
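
# Illustrative shape of an rss_feeds document as read above (values are made-up examples;
# only the fields referenced in get_active_rss_feeds() are assumed to exist):
#   {'_id': ObjectId('...'), 'name': 'Süddeutsche Zeitung München',
#    'url': 'https://example.com/feed.xml', 'category': 'local', 'active': True}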

def extract_article_content(url, timeout=10):
    """
    Extract main article content from a URL with smart detection
    Returns: dict with title, content, author, date, and metadata
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove script and style elements
        for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
            script.decompose()
        # === EXTRACT TITLE ===
        title = extract_title(soup)
        # === EXTRACT AUTHOR ===
        author = extract_author(soup)
        # === EXTRACT PUBLISHED DATE ===
        published_date = extract_date(soup)
        # === EXTRACT MAIN CONTENT ===
        content_text = extract_main_content(soup)
        # === EXTRACT META DESCRIPTION ===
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if not meta_desc:
            meta_desc = soup.find('meta', attrs={'property': 'og:description'})
        description = meta_desc.get('content', '') if meta_desc else ''
        return {
            'title': title,
            'author': author,
            'content': content_text,  # Full content, no limit
            'description': description,
            'published_date': published_date,
            'word_count': len(content_text.split()) if content_text else 0,
            'crawled_at': datetime.utcnow()
        }
    except requests.exceptions.Timeout:
        print(f"Timeout crawling {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error crawling {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error crawling {url}: {e}")
        return None

def extract_title(soup):
    """
    Extract article title using multiple strategies
    """
    # Strategy 1: Look for h1 tag
    h1 = soup.find('h1')
    if h1:
        title = h1.get_text().strip()
        if title and len(title) > 10:  # Reasonable title length
            return title
    # Strategy 2: Look for meta og:title
    og_title = soup.find('meta', attrs={'property': 'og:title'})
    if og_title and og_title.get('content'):
        return og_title.get('content').strip()
    # Strategy 3: Look for meta twitter:title
    twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
    if twitter_title and twitter_title.get('content'):
        return twitter_title.get('content').strip()
    # Strategy 4: Look for title tag (fallback)
    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.get_text().strip()
        # Clean up common patterns like "Article Title | Site Name" (keep the first part)
        if ' | ' in title:
            title = title.split(' | ')[0]
        elif ' - ' in title:
            title = title.split(' - ')[0]
        return title
    return None
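
# Illustrative behaviour (assumed markup): for a page containing
#   <h1>Neue U-Bahn-Linie für München vorgestellt</h1>
# Strategy 1 returns the headline directly, since it is longer than 10 characters.
# A page with only <title>Artikel | Beispielzeitung</title> falls through to Strategy 4
# and returns "Artikel" after splitting on " | ".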

def extract_author(soup):
    """
    Extract article author using multiple strategies
    """
    # Strategy 1: Look for meta author
    meta_author = soup.find('meta', attrs={'name': 'author'})
    if meta_author and meta_author.get('content'):
        return meta_author.get('content').strip()
    # Strategy 2: Look for rel="author"
    rel_author = soup.find('a', attrs={'rel': 'author'})
    if rel_author:
        return rel_author.get_text().strip()
    # Strategy 3: Look for common author class names
    author_selectors = [
        '[class*="author-name"]',
        '[class*="author"]',
        '[class*="byline"]',
        '[class*="writer"]',
        '[rel="author"]',
        '[itemprop="author"]'
    ]
    for selector in author_selectors:
        author_elem = soup.select_one(selector)
        if author_elem:
            author = author_elem.get_text().strip()
            # Clean up common patterns
            author = author.replace('By ', '').replace('by ', '').strip()
            if author and len(author) < 100:  # Reasonable author name length
                return author
    # Strategy 4: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict) and data.get('author'):
                author_data = data.get('author')
                if isinstance(author_data, dict):
                    return author_data.get('name', '')
                elif isinstance(author_data, str):
                    return author_data
        except (ValueError, TypeError):
            pass
    return None
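
# Illustrative JSON-LD payload that Strategy 4 above can read (made-up values):
#   <script type="application/ld+json">
#     {"@type": "NewsArticle",
#      "author": {"@type": "Person", "name": "Max Mustermann"},
#      "datePublished": "2025-11-28T09:30:00+01:00"}
#   </script>
# extract_author() would return "Max Mustermann"; extract_date() below would return the
# datePublished string. Pages that wrap their JSON-LD in a list are skipped by this parser.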

def extract_date(soup):
    """
    Extract published date using multiple strategies
    """
    # Strategy 1: Look for time tag with datetime attribute
    time_tag = soup.find('time')
    if time_tag and time_tag.get('datetime'):
        return time_tag.get('datetime')
    # Strategy 2: Look for meta article:published_time
    meta_published = soup.find('meta', attrs={'property': 'article:published_time'})
    if meta_published and meta_published.get('content'):
        return meta_published.get('content')
    # Strategy 3: Look for meta og:published_time
    og_published = soup.find('meta', attrs={'property': 'og:published_time'})
    if og_published and og_published.get('content'):
        return og_published.get('content')
    # Strategy 4: Look for common date class names
    date_selectors = [
        '[class*="publish-date"]',
        '[class*="published"]',
        '[class*="date"]',
        '[class*="timestamp"]',
        '[itemprop="datePublished"]'
    ]
    for selector in date_selectors:
        date_elem = soup.select_one(selector)
        if date_elem:
            # Try datetime attribute first
            if date_elem.get('datetime'):
                return date_elem.get('datetime')
            # Otherwise get text
            date_text = date_elem.get_text().strip()
            if date_text and len(date_text) < 50:
                return date_text
    # Strategy 5: Look for JSON-LD structured data
    json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
    if json_ld:
        try:
            import json
            data = json.loads(json_ld.string)
            if isinstance(data, dict):
                return data.get('datePublished') or data.get('dateCreated')
        except (ValueError, TypeError):
            pass
    return None
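
# Note: extract_date() returns whatever representation the page exposes, e.g. an ISO 8601
# string from <time datetime="2025-11-28T09:30:00+01:00"> or free-form text such as
# "28. November 2025" from a [class*="date"] element; the caller stores the value as-is.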

def extract_main_content(soup):
    """
    Extract main article content using multiple strategies
    """
    # Strategy 1: Try common article content selectors
    content_selectors = [
        'article',
        '[class*="article-content"]',
        '[class*="article-body"]',
        '[class*="post-content"]',
        '[class*="entry-content"]',
        '[class*="content-body"]',
        '[class*="story-body"]',
        '[itemprop="articleBody"]',
        'main'
    ]
    article_content = None
    for selector in content_selectors:
        element = soup.select_one(selector)
        if element:
            article_content = element
            break
    # Fallback: get body
    if not article_content:
        article_content = soup.find('body')
    if not article_content:
        return ''
    # Extract text from paragraphs
    paragraphs = article_content.find_all('p')
    # Filter out short paragraphs (likely navigation/ads)
    content_paragraphs = []
    for p in paragraphs:
        text = p.get_text().strip()
        # Keep paragraphs with at least 50 characters
        if len(text) >= 50:
            content_paragraphs.append(text)
    content_text = '\n\n'.join(content_paragraphs)
    return content_text
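
# Illustrative filtering (assumed markup): given
#   <article><p>Zum Inhalt springen</p><p>Die Stadt München hat am Donnerstag ... (long paragraph)</p></article>
# only the second <p> survives the 50-character minimum, so navigation snippets and
# cookie-banner fragments are dropped while real body paragraphs are joined with blank lines.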

def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10):
    """
    Crawl articles from an RSS feed
    Returns: dict with statistics
    """
    print(f"\n📰 Crawling feed: {feed_name}")
    print(f" URL: {feed_url}")
    try:
        # Parse RSS feed
        feed = feedparser.parse(feed_url)
        if not feed.entries:
            print(f" ⚠ No entries found in feed")
            # Return the same statistics shape as the success path so callers can aggregate
            return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}
        crawled_count = 0
        summarized_count = 0
        failed_summaries = 0
        for entry in feed.entries[:max_articles]:
            # Extract article URL using utility function
            article_url = extract_article_url(entry)
            if not article_url:
                print(f" ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}")
                continue
            # Check if article already exists and has content
            existing = articles_collection.find_one({'link': article_url})
            if existing and existing.get('content'):
                print(f" ⏭ Skipping (already crawled): {entry.get('title', 'No title')[:50]}")
                continue
            print(f" 🔍 Crawling: {entry.get('title', 'No title')[:50]}...")
            # Extract full content
            article_data = extract_article_content(article_url)
            if article_data and article_data.get('content'):
                # Store original title
                original_title = article_data.get('title') or entry.get('title', '')
                # Translate title with Ollama if enabled
                translation_result = None
                if Config.OLLAMA_ENABLED and original_title:
                    print(f" 🌐 Translating title...")
                    translation_result = ollama_client.translate_title(original_title)
                    if translation_result and translation_result['success']:
                        print(f" ✓ Title translated ({translation_result['duration']:.1f}s)")
                    else:
                        error_msg = translation_result['error'] if translation_result else 'Unknown error'
                        print(f" ⚠ Translation failed: {error_msg}")
                # Summarize with Ollama if enabled
                summary_result = None
                if Config.OLLAMA_ENABLED and article_data.get('content'):
                    print(f" 🤖 Summarizing with AI...")
                    summary_result = ollama_client.summarize_article(
                        article_data['content'],
                        max_words=Config.SUMMARY_MAX_WORDS
                    )
                    if summary_result['success']:
                        print(f" ✓ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)")
                        summarized_count += 1
                    else:
                        print(f" ⚠ Summarization failed: {summary_result['error']}")
                        failed_summaries += 1
                # Extract keywords for personalization
                keywords_result = None
                if Config.OLLAMA_ENABLED and summary_result and summary_result['success']:
                    print(f" 🔑 Extracting keywords...")
                    keywords_result = ollama_client.extract_keywords(
                        original_title,
                        summary_result['summary'],
                        max_keywords=5
                    )
                    if keywords_result['success']:
                        print(f" ✓ Keywords: {', '.join(keywords_result['keywords'])} ({keywords_result['duration']:.1f}s)")
                    else:
                        print(f" ⚠ Keyword extraction failed: {keywords_result['error']}")
                # Prepare document
                article_doc = {
                    'title': original_title,
                    'title_en': translation_result['translated_title'] if translation_result and translation_result['success'] else None,
                    'author': article_data.get('author'),
                    'link': article_url,
                    'content': article_data.get('content', ''),  # Full article content
                    'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
                    'keywords': keywords_result['keywords'] if keywords_result and keywords_result['success'] else [],
                    'word_count': article_data.get('word_count', 0),
                    'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
                    'source': feed_name,
                    'category': feed_category,
                    'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
                    'crawled_at': article_data.get('crawled_at'),
                    'translated_at': datetime.utcnow() if translation_result and translation_result['success'] else None,
                    'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
                    'created_at': datetime.utcnow()
                }
                # Cluster article with existing articles (detect duplicates from other sources)
                from datetime import timedelta
                recent_articles = list(articles_collection.find({
                    'published_at': {'$gte': datetime.utcnow() - timedelta(hours=24)}
                }))
                article_doc = article_clusterer.cluster_article(article_doc, recent_articles)
                try:
                    # Upsert: update if exists, insert if not
                    articles_collection.update_one(
                        {'link': article_url},
                        {'$set': article_doc},
                        upsert=True
                    )
                    crawled_count += 1
                    print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
                except DuplicateKeyError:
                    print(f" ⚠ Duplicate key error")
                except Exception as e:
                    print(f" ✗ Error saving: {e}")
            else:
                print(f" ✗ Failed to extract content")
            # Be nice to servers - add delay
            time.sleep(1)
        print(f" ✓ Crawled {crawled_count} articles from {feed_name}")
        if Config.OLLAMA_ENABLED:
            print(f" 🤖 Summarized: {summarized_count}, Failed: {failed_summaries}")
        return {
            'crawled': crawled_count,
            'summarized': summarized_count,
            'failed_summaries': failed_summaries
        }
    except Exception as e:
        print(f" ✗ Error processing feed {feed_name}: {e}")
        return {
            'crawled': 0,
            'summarized': 0,
            'failed_summaries': 0
        }
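
# Illustrative per-feed statistics returned above (numbers are made up):
#   {'crawled': 7, 'summarized': 6, 'failed_summaries': 1}
# crawl_all_feeds() below sums these counters across every active feed.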

def crawl_all_feeds(max_articles_per_feed=10):
    """
    Crawl all active RSS feeds
    Returns: dict with statistics
    """
    global article_clusterer
    # Initialize clusterer if not already done
    if article_clusterer is None:
        article_clusterer = ArticleClusterer(
            ollama_client=ollama_client,
            similarity_threshold=0.60,
            time_window_hours=24
        )
    print("\n" + "="*60)
    print("🚀 Starting RSS Feed Crawler")
    print("="*60)
    start_time = time.time()
    feeds = get_active_rss_feeds()
    if not feeds:
        print("⚠ No active RSS feeds found")
        return {
            'total_feeds': 0,
            'total_articles_crawled': 0,
            'duration_seconds': 0
        }
    print(f"Found {len(feeds)} active feed(s)")
    if Config.OLLAMA_ENABLED:
        print(f"🤖 AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)")
    total_crawled = 0
    total_summarized = 0
    total_failed = 0
    for feed in feeds:
        result = crawl_rss_feed(
            feed['url'],
            feed['name'],
            feed['category'],
            max_articles=max_articles_per_feed
        )
        total_crawled += result['crawled']
        total_summarized += result['summarized']
        total_failed += result['failed_summaries']
    duration = time.time() - start_time
    print("\n" + "="*60)
    print(f"✓ Crawling Complete!")
    print(f" Total feeds processed: {len(feeds)}")
    print(f" Total articles crawled: {total_crawled}")
    if Config.OLLAMA_ENABLED:
        print(f" Total articles summarized: {total_summarized}")
        print(f" Failed summarizations: {total_failed}")
        if total_summarized > 0:
            success_rate = (total_summarized / (total_summarized + total_failed)) * 100
            print(f" Success rate: {success_rate:.1f}%")
    print(f" Duration: {duration:.2f} seconds")
    if total_crawled > 0:
        print(f" Average time per article: {duration/total_crawled:.1f}s")
    print("="*60 + "\n")
    # Generate neutral summaries for clustered articles
    cluster_summary_stats = {'processed': 0, 'succeeded': 0, 'failed': 0}
    if Config.OLLAMA_ENABLED and total_crawled > 0:
        print("\n" + "="*60)
        print("🔄 Generating Neutral Summaries for Clustered Articles")
        print("="*60)
        cluster_summary_stats = create_cluster_summaries(db, ollama_client)
        print("\n" + "="*60)
        print(f"✓ Cluster Summarization Complete!")
        print(f" Clusters processed: {cluster_summary_stats['processed']}")
        print(f" Succeeded: {cluster_summary_stats['succeeded']}")
        print(f" Failed: {cluster_summary_stats['failed']}")
        print("="*60 + "\n")
    return {
        'total_feeds': len(feeds),
        'total_articles_crawled': total_crawled,
        'total_summarized': total_summarized,
        'failed_summaries': total_failed,
        'duration_seconds': round(duration, 2),
        'cluster_summaries': cluster_summary_stats
    }
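
# Illustrative value returned by crawl_all_feeds() (numbers are made up):
#   {'total_feeds': 3, 'total_articles_crawled': 18, 'total_summarized': 16,
#    'failed_summaries': 2, 'duration_seconds': 412.57,
#    'cluster_summaries': {'processed': 4, 'succeeded': 4, 'failed': 0}}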

if __name__ == '__main__':
    # Can be run standalone for testing
    import sys
    max_articles = 10
    if len(sys.argv) > 1:
        try:
            max_articles = int(sys.argv[1])
        except ValueError:
            print("Usage: python crawler_service.py [max_articles_per_feed]")
            sys.exit(1)
    crawl_all_feeds(max_articles_per_feed=max_articles)
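
# Example invocations (assuming MongoDB and, if enabled, the Ollama server are reachable):
#   python crawler_service.py        # crawl up to 10 articles per feed (default)
#   python crawler_service.py 3      # crawl at most 3 articles per feed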