# Munich-news/news_crawler/rss_utils.py

"""
Utility functions for RSS feed processing
"""


def _as_http_url(value):
    """Return ``value`` if it is a string starting with 'http', else None."""
    # The isinstance guard keeps non-string field values (None, numbers,
    # dicts, ...) from raising AttributeError on ``.startswith``.
    if isinstance(value, str) and value.startswith('http'):
        return value
    return None


def extract_article_url(entry):
    """
    Extract article URL from RSS entry.
    Different RSS feeds use different fields for the article URL.

    Candidates are tried in this order, and the first value that is a
    string starting with 'http' wins:
        1. 'link'
        2. 'guid' (either a string, or a dict carrying 'href')
        3. 'id'
        4. 'links' (list of dicts carrying 'href'); an item whose 'type' is
           'text/html' or whose 'rel' is 'alternate' is preferred, otherwise
           the first http link in the list is used

    Values of an unexpected type are skipped rather than raising, so one
    malformed field does not hide a usable URL in a later field.

    Args:
        entry: feedparser entry object (anything with a dict-like ``get``)

    Returns:
        str: Article URL or None if not found
    """
    # Try 'link' first (most common)
    url = _as_http_url(entry.get('link'))
    if url is not None:
        return url

    # Try 'guid': either a plain string or a dict with 'href'
    guid = entry.get('guid')
    if isinstance(guid, dict):
        guid = guid.get('href')
    url = _as_http_url(guid)
    if url is not None:
        return url

    # Try 'id' if it's a valid URL
    url = _as_http_url(entry.get('id'))
    if url is not None:
        return url

    # Try 'links' array (some feeds have multiple links)
    links = entry.get('links')
    if not isinstance(links, (list, tuple)):
        return None

    # Single pass: return the first preferred link immediately, and remember
    # the first plain http link in case no preferred one exists.
    fallback = None
    for link in links:
        if not isinstance(link, dict):
            continue
        href = _as_http_url(link.get('href'))
        if href is None:
            continue
        if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
            return href
        if fallback is None:
            fallback = href

    return fallback


def extract_article_summary(entry):
    """
    Extract article summary/description from RSS entry.

    Fields are tried in order: 'summary', then 'description', then
    'content'. 'content' is expected to be a list of dicts; the first item
    with a non-empty string under 'value' is used. Content items that are
    not dicts, or whose 'value' is missing, empty or not a string, are
    skipped instead of raising or leaking a non-string result.

    Args:
        entry: feedparser entry object (anything with a dict-like ``get``)

    Returns:
        str: Article summary or empty string
    """
    # Plain text fields first
    for field in ('summary', 'description'):
        text = entry.get(field)
        if text:
            return text

    # content is usually a list of dicts
    content = entry.get('content')
    if isinstance(content, (list, tuple)):
        for item in content:
            if not isinstance(item, dict):
                continue
            value = item.get('value')
            # Guard against {'value': None} so the result is always a string
            if value and isinstance(value, str):
                return value

    return ''


def extract_published_date(entry):
    """
    Extract the publication date from an RSS entry.

    The fields 'published', 'updated' and 'created' are checked in that
    order, and the first one holding a truthy value is returned unchanged.

    Args:
        entry: feedparser entry object

    Returns:
        str: Published date or empty string
    """
    # Ordered by preference; the first non-empty field wins
    for field in ('published', 'updated', 'created'):
        stamp = entry.get(field)
        if stamp:
            return stamp

    return ''