Munich-news/tests/backend/test_rss_extraction.py

#!/usr/bin/env python
"""
Test RSS feed URL extraction
Run from backend directory with venv activated:
cd backend
source venv/bin/activate # or venv\Scripts\activate on Windows
python test_rss_extraction.py
"""
from pymongo import MongoClient
from config import Config
import feedparser
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date

print("\n" + "="*80)
print("RSS Feed URL Extraction Test")
print("="*80)

# Connect to database
print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]

# Get RSS feeds
print("Fetching RSS feeds from database...")
feeds = list(db['rss_feeds'].find())
if not feeds:
    print("\n❌ No RSS feeds in database!")
    print("\nAdd a feed first:")
    print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
    print(" -H 'Content-Type: application/json' \\")
    print(" -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
    exit(1)

print(f"✓ Found {len(feeds)} feed(s)\n")
# Test each feed
total_success = 0
total_fail = 0

for feed_doc in feeds:
    name = feed_doc.get('name', 'Unknown')
    url = feed_doc.get('url', '')
    active = feed_doc.get('active', True)

    print("\n" + "="*80)
    print(f"Feed: {name}")
    print(f"URL: {url}")
    print(f"Active: {'Yes' if active else 'No'}")
    print("="*80)

    if not active:
        print("⏭ Skipping (inactive)")
        continue

    try:
        # Parse RSS
        print("\nFetching RSS feed...")
        feed = feedparser.parse(url)

        if not feed.entries:
            print("❌ No entries found in feed")
            continue

        print(f"✓ Found {len(feed.entries)} entries")

        # Test first 3 entries
        print("\nTesting first 3 entries:")
        print("-" * 80)

        for i, entry in enumerate(feed.entries[:3], 1):
            print(f"\n📰 Entry {i}:")

            # Title
            title = entry.get('title', 'No title')
            print(f" Title: {title[:65]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f" ✓ URL: {article_url}")
                total_success += 1
            else:
                print(" ❌ Could not extract URL")
                print(f" Available fields: {list(entry.keys())[:10]}")
                print(f" link: {entry.get('link', 'N/A')}")
                print(f" guid: {entry.get('guid', 'N/A')}")
                print(f" id: {entry.get('id', 'N/A')}")
                total_fail += 1

            # Test summary
            summary = extract_article_summary(entry)
            if summary:
                print(f" ✓ Summary: {summary[:70]}...")
            else:
                print(" ⚠ No summary")

            # Test date
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f" ✓ Date: {pub_date}")
            else:
                print(" ⚠ No date")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total URLs tested: {total_success + total_fail}")
print(f"✓ Successfully extracted: {total_success}")
print(f"❌ Failed to extract: {total_fail}")

if total_fail == 0:
    print("\n🎉 All URLs extracted successfully!")
    print("\nYou can now run the crawler:")
    print(" cd ../news_crawler")
    print(" pip install -r requirements.txt")
    print(" python crawler_service.py 5")
else:
    print(f"\n{total_fail} URL(s) could not be extracted")
    print("Check the output above for details")

print("="*80 + "\n")