#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction.
Tests actual feeds from the database.
"""
import os
import sys

import feedparser
from dotenv import load_dotenv
from pymongo import MongoClient

from rss_utils import extract_article_url, extract_article_summary, extract_published_date

# Load environment variables
load_dotenv(dotenv_path='../.env')

# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'

client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']


def test_feed(feed_name, feed_url):
    """Test a single RSS feed."""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)

    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)

        if not feed.entries:
            print("❌ No entries found in feed")
            return False

        print(f"✓ Found {len(feed.entries)} entries\n")

        # Test up to the first 5 entries
        success_count = 0
        fail_count = 0

        for i, entry in enumerate(feed.entries[:5], 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                # Dump the fields that usually carry the link, to aid debugging
                print("❌ No valid URL found")
                print(f"   Available fields: {list(entry.keys())}")
                print(f"   link: {entry.get('link', 'N/A')}")
                print(f"   guid: {entry.get('guid', 'N/A')}")
                print(f"   id: {entry.get('id', 'N/A')}")
                fail_count += 1

            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print("⚠ No summary found")

            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print("⚠ No published date found")

        # Report against the number of entries actually tested
        # (a feed may have fewer than 5 entries)
        tested = success_count + fail_count

        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f"  ✓ Success: {success_count}/{tested}")
        print(f"  ❌ Failed: {fail_count}/{tested}")
        print('='*70)

        return fail_count == 0

    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False


def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)

    # Get all RSS feeds from database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())

    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1

    print(f"✓ Found {len(feeds)} feed(s) in database\n")

    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)

        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue

        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue

        results[feed_name] = test_feed(feed_name, feed_url)

    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)

    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")

    if passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print("  python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())