slight update
This commit is contained in:
@@ -54,7 +54,8 @@ def get_active_rss_feeds():
|
||||
feeds.append({
|
||||
'id': str(feed['_id']),
|
||||
'name': feed.get('name', ''),
|
||||
'url': feed.get('url', '')
|
||||
'url': feed.get('url', ''),
|
||||
'category': feed.get('category', 'general')
|
||||
})
|
||||
return feeds
|
||||
|
||||
@@ -303,7 +304,7 @@ def extract_main_content(soup):
|
||||
return content_text
|
||||
|
||||
|
||||
def crawl_rss_feed(feed_url, feed_name, max_articles=10):
|
||||
def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10):
|
||||
"""
|
||||
Crawl articles from an RSS feed
|
||||
Returns: dict with statistics
|
||||
@@ -369,6 +370,7 @@ def crawl_rss_feed(feed_url, feed_name, max_articles=10):
|
||||
'word_count': article_data.get('word_count', 0),
|
||||
'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
|
||||
'source': feed_name,
|
||||
'category': feed_category,
|
||||
'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
|
||||
'crawled_at': article_data.get('crawled_at'),
|
||||
'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
|
||||
@@ -442,6 +444,7 @@ def crawl_all_feeds(max_articles_per_feed=10):
|
||||
result = crawl_rss_feed(
|
||||
feed['url'],
|
||||
feed['name'],
|
||||
feed['category'],
|
||||
max_articles=max_articles_per_feed
|
||||
)
|
||||
total_crawled += result['crawled']
|
||||
|
||||
Reference in New Issue
Block a user