This commit is contained in:
2025-11-10 19:13:33 +01:00
commit ac5738c29d
64 changed files with 9445 additions and 0 deletions

View File

@@ -0,0 +1,98 @@
"""
Utility functions for RSS feed processing
"""
def _as_http_url(value):
    """Return *value* if it is a string starting with 'http', else None."""
    if isinstance(value, str) and value.startswith('http'):
        return value
    return None


def extract_article_url(entry):
    """
    Extract article URL from RSS entry.

    Different RSS feeds use different fields for the article URL, so the
    candidates are tried in a fixed priority order and the first value that
    is a string starting with 'http' wins:

    1. 'link' (most common)
    2. 'guid', either a plain string or a dict carrying 'href'
    3. 'id'
    4. 'links' items (dicts with 'href'); an item whose 'type' is
       'text/html' or whose 'rel' is 'alternate' is preferred, otherwise
       the first http link in the list is used.

    Candidates that are missing, None, or not strings are skipped rather
    than raising, so one malformed field does not hide a usable URL in a
    later field.

    Args:
        entry: feedparser entry object (anything exposing a dict-like
            ``get``)

    Returns:
        str: Article URL or None if not found
    """
    # Try 'link' first (most common)
    url = _as_http_url(entry.get('link'))
    if url is not None:
        return url

    # Try 'guid': either the URL itself or a dict with 'href'
    guid = entry.get('guid')
    if isinstance(guid, dict):
        guid = guid.get('href')
    url = _as_http_url(guid)
    if url is not None:
        return url

    # Try 'id' if it's a valid URL
    url = _as_http_url(entry.get('id'))
    if url is not None:
        return url

    # Try 'links' array (some feeds have multiple links). A single pass is
    # enough: return immediately on a preferred link, and remember the first
    # plain http link in case no preferred one shows up.
    fallback = None
    for link in entry.get('links') or []:
        if not isinstance(link, dict):
            continue
        href = _as_http_url(link.get('href'))
        if href is None:
            continue
        if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
            return href
        if fallback is None:
            fallback = href
    return fallback
def extract_article_summary(entry):
    """
    Extract article summary/description from RSS entry.

    Fields are tried in order: 'summary', then 'description', then the
    'value' of the first item in the 'content' list. The first non-empty
    one is returned.

    Args:
        entry: feedparser entry object (anything exposing a dict-like
            ``get``)

    Returns:
        str: Article summary or empty string
    """
    for field in ('summary', 'description'):
        text = entry.get(field)
        if text:
            return text

    # content is usually a list of dicts; only the first item is consulted.
    content = entry.get('content')
    if isinstance(content, list) and content:
        first = content[0]
        # Guard against malformed items (non-dict, or 'value' set to None)
        # so the documented "empty string" fallback always holds.
        if isinstance(first, dict):
            return first.get('value') or ''
    return ''
def extract_published_date(entry):
    """
    Extract published date from RSS entry.

    Looks at 'published', 'updated' and 'created', in that order, and
    returns the first one holding a non-empty value.

    Args:
        entry: feedparser entry object

    Returns:
        str: Published date or empty string
    """
    date_fields = ('published', 'updated', 'created')
    candidates = (entry.get(name) for name in date_fields)
    # First truthy candidate wins; fall back to '' when none is set.
    return next((stamp for stamp in candidates if stamp), '')