update
news_crawler/rss_utils.py | 98 (new file)
@@ -0,0 +1,98 @@
"""
Utility functions for RSS feed processing
"""


def extract_article_url(entry):
    """
    Extract article URL from RSS entry.
    Different RSS feeds use different fields for the article URL.

    Args:
        entry: feedparser entry object

    Returns:
        str: Article URL or None if not found

    Examples:
        - Most feeds use 'link'
        - Some use 'guid' as the URL
        - Some use 'id' as the URL
        - Some have guid as a dict with 'href'
    """
    # Try 'link' first (most common)
    if entry.get('link') and entry.get('link', '').startswith('http'):
        return entry.get('link')

    # Try 'guid' if it's a valid URL
    if entry.get('guid'):
        guid = entry.get('guid')
        # guid can be a string
        if isinstance(guid, str) and guid.startswith('http'):
            return guid
        # or a dict with 'href'
        elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
            return guid.get('href')

    # Try 'id' if it's a valid URL
    if entry.get('id') and entry.get('id', '').startswith('http'):
        return entry.get('id')

    # Try 'links' array (some feeds have multiple links)
    if entry.get('links'):
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                # Prefer 'alternate' type, but accept any http link
                if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
                    return link.get('href')
        # If no alternate found, return first http link
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                return link.get('href')

    return None


def extract_article_summary(entry):
    """
    Extract article summary/description from RSS entry.

    Args:
        entry: feedparser entry object

    Returns:
        str: Article summary or empty string
    """
    # Try different fields
    if entry.get('summary'):
        return entry.get('summary', '')
    elif entry.get('description'):
        return entry.get('description', '')
    elif entry.get('content'):
        # content is usually a list of dicts
        content = entry.get('content', [])
        if content and isinstance(content, list) and len(content) > 0:
            return content[0].get('value', '')

    return ''


def extract_published_date(entry):
    """
    Extract published date from RSS entry.

    Args:
        entry: feedparser entry object

    Returns:
        str: Published date or empty string
    """
    # Try different fields
    if entry.get('published'):
        return entry.get('published', '')
    elif entry.get('updated'):
        return entry.get('updated', '')
    elif entry.get('created'):
        return entry.get('created', '')

    return ''
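
For reference, a minimal usage sketch of these helpers (not part of the commit). It assumes feedparser is installed, that news_crawler is importable as a package, and that the feed URL below is only a placeholder.

# Usage sketch (not part of this commit); feed URL is a placeholder.
import feedparser

from news_crawler.rss_utils import (
    extract_article_url,
    extract_article_summary,
    extract_published_date,
)

feed = feedparser.parse("https://example.com/rss")
for entry in feed.entries:
    url = extract_article_url(entry)
    if url is None:
        continue  # no usable article URL in this entry
    print(url)
    print(extract_published_date(entry) or "(no date)")
    print(extract_article_summary(entry)[:120])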