update
news_crawler/rss_utils.py | 98 (new file)
@@ -0,0 +1,98 @@
"""
Utility functions for RSS feed processing
"""


def extract_article_url(entry):
    """
    Extract article URL from RSS entry.
    Different RSS feeds use different fields for the article URL.

    Args:
        entry: feedparser entry object

    Returns:
        str: Article URL or None if not found

    Examples:
        - Most feeds use 'link'
        - Some use 'guid' as the URL
        - Some use 'id' as the URL
        - Some have guid as a dict with 'href'
    """
    # Try 'link' first (most common)
    if entry.get('link') and entry.get('link', '').startswith('http'):
        return entry.get('link')

    # Try 'guid' if it's a valid URL
    if entry.get('guid'):
        guid = entry.get('guid')
        # guid can be a string
        if isinstance(guid, str) and guid.startswith('http'):
            return guid
        # or a dict with 'href'
        elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
            return guid.get('href')

    # Try 'id' if it's a valid URL
    if entry.get('id') and entry.get('id', '').startswith('http'):
        return entry.get('id')

    # Try 'links' array (some feeds have multiple links)
    if entry.get('links'):
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                # Prefer 'alternate' type, but accept any http link
                if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
                    return link.get('href')
        # If no alternate found, return first http link
        for link in entry.get('links', []):
            if isinstance(link, dict) and link.get('href', '').startswith('http'):
                return link.get('href')

    return None


def extract_article_summary(entry):
    """
    Extract article summary/description from RSS entry.

    Args:
        entry: feedparser entry object

    Returns:
        str: Article summary or empty string
    """
    # Try different fields
    if entry.get('summary'):
        return entry.get('summary', '')
    elif entry.get('description'):
        return entry.get('description', '')
    elif entry.get('content'):
        # content is usually a list of dicts
        content = entry.get('content', [])
        if content and isinstance(content, list) and len(content) > 0:
            return content[0].get('value', '')

    return ''


def extract_published_date(entry):
    """
    Extract published date from RSS entry.

    Args:
        entry: feedparser entry object

    Returns:
        str: Published date or empty string
    """
    # Try different fields
    if entry.get('published'):
        return entry.get('published', '')
    elif entry.get('updated'):
        return entry.get('updated', '')
    elif entry.get('created'):
        return entry.get('created', '')

    return ''
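
For reference, a minimal usage sketch of these helpers (not part of the commit). It assumes feedparser is installed, that news_crawler is importable as a package, and that the feed URL below is only a placeholder.

# Usage sketch (not part of this commit); feed URL is a placeholder.
import feedparser

from news_crawler.rss_utils import (
    extract_article_url,
    extract_article_summary,
    extract_published_date,
)

feed = feedparser.parse("https://example.com/rss")
for entry in feed.entries:
    url = extract_article_url(entry)
    if url is None:
        continue  # no usable article URL in this entry
    print(url)
    print(extract_published_date(entry) or "(no date)")
    print(extract_article_summary(entry)[:120])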