#!/usr/bin/env python
"""
Test RSS feed URL extraction

Run from backend directory with venv activated:
    cd backend
    source venv/bin/activate  # or venv\Scripts\activate on Windows
    python test_rss_extraction.py
"""
from pymongo import MongoClient
from config import Config
import feedparser
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date

print("\n" + "="*80)
print("RSS Feed URL Extraction Test")
print("="*80)

# Connect to database
print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]

# Get RSS feeds
print("Fetching RSS feeds from database...")
feeds = list(db['rss_feeds'].find())

if not feeds:
    print("\n❌ No RSS feeds in database!")
    print("\nAdd a feed first:")
    print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
    print("    -H 'Content-Type: application/json' \\")
    print("    -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
    exit(1)

print(f"✓ Found {len(feeds)} feed(s)\n")

# Test each feed
total_success = 0
total_fail = 0

for feed_doc in feeds:
    name = feed_doc.get('name', 'Unknown')
    url = feed_doc.get('url', '')
    active = feed_doc.get('active', True)

    print("\n" + "="*80)
    print(f"Feed: {name}")
    print(f"URL: {url}")
    print(f"Active: {'Yes' if active else 'No'}")
    print("="*80)

    if not active:
        print("⏭ Skipping (inactive)")
        continue

    try:
        # Parse RSS
        print("\nFetching RSS feed...")
        feed = feedparser.parse(url)

        if not feed.entries:
            print("❌ No entries found in feed")
            continue

        print(f"✓ Found {len(feed.entries)} entries")

        # Test first 3 entries
        print("\nTesting first 3 entries:")
        print("-" * 80)

        for i, entry in enumerate(feed.entries[:3], 1):
            print(f"\n📰 Entry {i}:")

            # Title
            title = entry.get('title', 'No title')
            print(f"  Title: {title[:65]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"  ✓ URL: {article_url}")
                total_success += 1
            else:
                print("  ❌ Could not extract URL")
                print(f"     Available fields: {list(entry.keys())[:10]}")
                print(f"     link: {entry.get('link', 'N/A')}")
                print(f"     guid: {entry.get('guid', 'N/A')}")
                print(f"     id: {entry.get('id', 'N/A')}")
                total_fail += 1

            # Test summary
            summary = extract_article_summary(entry)
            if summary:
                print(f"  ✓ Summary: {summary[:70]}...")
            else:
                print("  ⚠ No summary")

            # Test date
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"  ✓ Date: {pub_date}")
            else:
                print("  ⚠ No date")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total URLs tested: {total_success + total_fail}")
print(f"✓ Successfully extracted: {total_success}")
print(f"❌ Failed to extract: {total_fail}")

if total_fail == 0:
    print("\n🎉 All URLs extracted successfully!")
    print("\nYou can now run the crawler:")
    print("  cd ../news_crawler")
    print("  pip install -r requirements.txt")
    print("  python crawler_service.py 5")
else:
    print(f"\n⚠ {total_fail} URL(s) could not be extracted")
    print("Check the output above for details")

print("="*80 + "\n")
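
# ---------------------------------------------------------------------------
# For reference only: a minimal sketch of what extract_article_url (imported
# above from utils.rss_utils) might look like, based on the fields this script
# falls back to printing on failure (link, guid, id). This is an assumption
# for illustration; the real implementation lives in utils/rss_utils.py and
# may differ.
#
#     def extract_article_url(entry):
#         """Prefer entry.link, then any guid/id value that looks like a URL."""
#         for key in ('link', 'id', 'guid'):
#             value = entry.get(key)
#             if value and str(value).startswith(('http://', 'https://')):
#                 return str(value)
#         return None
# ---------------------------------------------------------------------------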