update
news_crawler/test_rss_feeds.py (new file, 154 lines)
@@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction.
Tests actual feeds from the database.
"""
import os
import sys

import feedparser
from dotenv import load_dotenv
from pymongo import MongoClient

from rss_utils import extract_article_url, extract_article_summary, extract_published_date
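
# The rss_utils helpers are assumed (from how they are called below) to take
# a single feedparser entry and return the extracted value, or None when
# nothing usable is found. As a rough sketch, extract_article_url likely
# falls back through the candidate fields this script prints on failure:
#
#     def extract_article_url(entry):
#         for field in ('link', 'guid', 'id'):
#             value = entry.get(field)
#             if value and value.startswith(('http://', 'https://')):
#                 return value
#         return None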

# Load environment variables
load_dotenv(dotenv_path='../.env')

# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'

client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']
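# Documents in rss_feeds are assumed to carry at least
# {'name': str, 'url': str, 'active': bool}; these are the fields
# main() reads below.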


def test_feed(feed_name, feed_url):
    """Test a single RSS feed"""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)

    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)
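        # feedparser.parse() does not raise on fetch or parse errors; it
        # sets feed.bozo (and feed.bozo_exception) instead, so a bad URL
        # usually surfaces below as an empty entries list.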

        if not feed.entries:
            print("❌ No entries found in feed")
            return False

        print(f"✓ Found {len(feed.entries)} entries\n")

        # Test first 5 entries
        success_count = 0
        fail_count = 0

        for i, entry in enumerate(feed.entries[:5], 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                print("❌ No valid URL found")
                print(f"   Available fields: {list(entry.keys())}")
                print(f"   link: {entry.get('link', 'N/A')}")
                print(f"   guid: {entry.get('guid', 'N/A')}")
                print(f"   id: {entry.get('id', 'N/A')}")
                fail_count += 1

            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print("⚠ No summary found")

            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print("⚠ No published date found")

        # Report against the number actually tested, not a hardcoded 5,
        # since a feed may have fewer than 5 entries.
        tested = success_count + fail_count
        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f"  ✓ Success: {success_count}/{tested}")
        print(f"  ❌ Failed: {fail_count}/{tested}")
        print('='*70)

        return fail_count == 0

    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False


def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)

    # Get all RSS feeds from the database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())

    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1

    print(f"✓ Found {len(feeds)} feed(s) in database\n")

    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)

        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue

        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue

        results[feed_name] = test_feed(feed_name, feed_url)

    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)

    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")

    # Require at least one tested feed; otherwise an all-inactive database
    # would be reported as success.
    if total > 0 and passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print("  python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())