""" Utility functions for RSS feed processing """ def extract_article_url(entry): """ Extract article URL from RSS entry. Different RSS feeds use different fields for the article URL. Args: entry: feedparser entry object Returns: str: Article URL or None if not found Examples: - Most feeds use 'link' - Some use 'guid' as the URL - Some use 'id' as the URL - Some have guid as a dict with 'href' """ # Try 'link' first (most common) if entry.get('link') and entry.get('link', '').startswith('http'): return entry.get('link') # Try 'guid' if it's a valid URL if entry.get('guid'): guid = entry.get('guid') # guid can be a string if isinstance(guid, str) and guid.startswith('http'): return guid # or a dict with 'href' elif isinstance(guid, dict) and guid.get('href', '').startswith('http'): return guid.get('href') # Try 'id' if it's a valid URL if entry.get('id') and entry.get('id', '').startswith('http'): return entry.get('id') # Try 'links' array (some feeds have multiple links) if entry.get('links'): for link in entry.get('links', []): if isinstance(link, dict) and link.get('href', '').startswith('http'): # Prefer 'alternate' type, but accept any http link if link.get('type') == 'text/html' or link.get('rel') == 'alternate': return link.get('href') # If no alternate found, return first http link for link in entry.get('links', []): if isinstance(link, dict) and link.get('href', '').startswith('http'): return link.get('href') return None def extract_article_summary(entry): """ Extract article summary/description from RSS entry. Args: entry: feedparser entry object Returns: str: Article summary or empty string """ # Try different fields if entry.get('summary'): return entry.get('summary', '') elif entry.get('description'): return entry.get('description', '') elif entry.get('content'): # content is usually a list of dicts content = entry.get('content', []) if content and isinstance(content, list) and len(content) > 0: return content[0].get('value', '') return '' def extract_published_date(entry): """ Extract published date from RSS entry. Args: entry: feedparser entry object Returns: str: Published date or empty string """ # Try different fields if entry.get('published'): return entry.get('published', '') elif entry.get('updated'): return entry.get('updated', '') elif entry.get('created'): return entry.get('created', '') return ''