99 lines
2.9 KiB
Python
99 lines
2.9 KiB
Python
"""
|
|
Utility functions for RSS feed processing
|
|
"""
|
|
|
|
|
|
def _is_http_url(value):
    """Return True when *value* is a string that starts with 'http'."""
    return isinstance(value, str) and value.startswith('http')


def extract_article_url(entry):
    """
    Extract article URL from RSS entry.

    Different RSS feeds use different fields for the article URL, so the
    candidates are tried in a fixed priority order and the first value that
    is a string starting with 'http' is returned:

        1. 'link'
        2. 'guid' (either a string, or a dict carrying the URL under 'href')
        3. 'id'
        4. 'links' (list of dicts with 'href'); an item whose 'type' is
           'text/html' or whose 'rel' is 'alternate' is preferred, otherwise
           the first item with an http 'href' is used

    Values of an unexpected type (None, numbers, a dict without a string
    'href', ...) are skipped rather than raising, so a malformed field never
    prevents a later field from being used.

    Args:
        entry: feedparser entry object (anything exposing a dict-like
            ``get`` method)

    Returns:
        str: Article URL or None if not found
    """
    # 'link' is the most common location.
    link = entry.get('link')
    if _is_http_url(link):
        return link

    # 'guid' may be the URL itself or a dict holding it under 'href'.
    guid = entry.get('guid')
    if _is_http_url(guid):
        return guid
    if isinstance(guid, dict) and _is_http_url(guid.get('href')):
        return guid['href']

    entry_id = entry.get('id')
    if _is_http_url(entry_id):
        return entry_id

    # 'links' is expected to be a list of dicts; ignore any other shape.
    links = entry.get('links')
    if not isinstance(links, (list, tuple)):
        return None

    http_links = [
        item for item in links
        if isinstance(item, dict) and _is_http_url(item.get('href'))
    ]

    # Prefer the HTML / alternate representation of the article ...
    for item in http_links:
        if item.get('type') == 'text/html' or item.get('rel') == 'alternate':
            return item['href']

    # ... otherwise fall back to the first usable link, if any.
    if http_links:
        return http_links[0]['href']

    return None
|
|
|
|
|
|
def extract_article_summary(entry):
    """
    Extract article summary/description from RSS entry.

    The fields are tried in order: 'summary', then 'description', then the
    'value' of the first item in 'content'. The first non-empty value wins.

    Args:
        entry: feedparser entry object (anything exposing a dict-like
            ``get`` method)

    Returns:
        str: Article summary or empty string
    """
    summary = entry.get('summary')
    if summary:
        return summary

    description = entry.get('description')
    if description:
        return description

    # 'content' is usually a list of dicts; only the first item is consulted.
    content = entry.get('content')
    if isinstance(content, (list, tuple)) and content:
        first_item = content[0]
        if isinstance(first_item, dict):
            # A missing or None 'value' must still yield '' so the documented
            # return contract holds.
            return first_item.get('value') or ''

    return ''
|
|
|
|
|
|
def extract_published_date(entry):
    """
    Extract published date from RSS entry.

    Looks at 'published', 'updated' and 'created', in that order, and
    returns the first one holding a non-empty value.

    Args:
        entry: feedparser entry object

    Returns:
        str: Published date or empty string
    """
    # Field names in priority order.
    for field_name in ('published', 'updated', 'created'):
        date_value = entry.get(field_name)
        if date_value:
            return date_value

    return ''
|