slight update

2025-11-11 14:34:43 +01:00
parent 760a458e66
commit f23f4b71d8
7 changed files with 748 additions and 34 deletions

@@ -54,7 +54,8 @@ def get_active_rss_feeds():
         feeds.append({
             'id': str(feed['_id']),
             'name': feed.get('name', ''),
-            'url': feed.get('url', '')
+            'url': feed.get('url', ''),
+            'category': feed.get('category', 'general')
         })
     return feeds
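
A minimal sketch of the shape get_active_rss_feeds() now returns per feed, assuming a MongoDB-style feed document; the document values below are illustrative, not from this repo:

# Hypothetical stored feed document; 'category' may be missing on
# documents created before this commit.
feed_doc = {
    '_id': 'feed-object-id',
    'name': 'Example Tech News',
    'url': 'https://example.com/rss.xml',
}

# Mirrors the updated append: .get() falls back to 'general', so
# downstream code always sees a category value.
feed = {
    'id': str(feed_doc['_id']),
    'name': feed_doc.get('name', ''),
    'url': feed_doc.get('url', ''),
    'category': feed_doc.get('category', 'general'),
}
print(feed['category'])  # -> 'general' for legacy feeds without a category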
@@ -303,7 +304,7 @@ def extract_main_content(soup):
     return content_text
 
 
-def crawl_rss_feed(feed_url, feed_name, max_articles=10):
+def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10):
     """
     Crawl articles from an RSS feed
     Returns: dict with statistics
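
Because the new feed_category parameter defaults to 'general', existing two-argument call sites keep working unchanged. A sketch with a stub body (the stub is hypothetical; the real function crawls and summarizes the feed):

def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10):
    # Stub: return enough to show which category the call landed in.
    return {'feed': feed_name, 'category': feed_category}

# Old-style call still works and falls into the 'general' bucket:
print(crawl_rss_feed('https://example.com/rss.xml', 'Example'))
# New callers can pass the feed's own category:
print(crawl_rss_feed('https://example.com/rss.xml', 'Example', 'technology'))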
@@ -369,6 +370,7 @@ def crawl_rss_feed(feed_url, feed_name, max_articles=10):
             'word_count': article_data.get('word_count', 0),
             'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
             'source': feed_name,
+            'category': feed_category,
             'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
             'crawled_at': article_data.get('crawled_at'),
             'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
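
A hedged sketch of the article record this function now builds: only the keys shown in the hunk come from the diff, and the values here are placeholders:

from datetime import datetime

# Illustrative article document after this change; 'category' is
# inherited from the feed instead of being absent.
article = {
    'word_count': 812,
    'summary_word_count': 120,
    'source': 'Example Tech News',
    'category': 'technology',      # new field, copied from feed_category
    'published_at': '2025-11-11T10:00:00Z',
    'crawled_at': datetime.utcnow(),
    'summarized_at': datetime.utcnow(),
}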
@@ -442,6 +444,7 @@ def crawl_all_feeds(max_articles_per_feed=10):
         result = crawl_rss_feed(
             feed['url'],
             feed['name'],
+            feed['category'],
             max_articles=max_articles_per_feed
         )
         total_crawled += result['crawled']
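
A minimal end-to-end sketch of the updated crawl_all_feeds loop, with get_active_rss_feeds() replaced by a hard-coded list and crawl_rss_feed by a stub; only the argument threading matches the diff:

def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10):
    # Stub standing in for the real crawler.
    print(f"crawling {feed_name} ({feed_category}): {feed_url}")
    return {'crawled': 0}

def crawl_all_feeds(max_articles_per_feed=10):
    # Stand-in for get_active_rss_feeds(); each dict now carries 'category'.
    feeds = [
        {'url': 'https://example.com/a.xml', 'name': 'Feed A', 'category': 'technology'},
        {'url': 'https://example.com/b.xml', 'name': 'Feed B', 'category': 'general'},
    ]
    total_crawled = 0
    for feed in feeds:
        result = crawl_rss_feed(
            feed['url'],
            feed['name'],
            feed['category'],  # new: the category rides along with the feed
            max_articles=max_articles_per_feed,
        )
        total_crawled += result['crawled']
    return total_crawled

crawl_all_feeds()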