Munich-news/tests/backend/test_rss_extraction.py

#!/usr/bin/env python
"""
Test RSS feed URL extraction
Run from backend directory with venv activated:
cd backend
source venv/bin/activate # or venv\Scripts\activate on Windows
python test_rss_extraction.py
"""
from pymongo import MongoClient
from config import Config
import feedparser
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date

print("\n" + "="*80)
print("RSS Feed URL Extraction Test")
print("="*80)

# Connect to database
print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
client = MongoClient(Config.MONGODB_URI)
db = client[Config.DB_NAME]

# Get RSS feeds
print("Fetching RSS feeds from database...")
feeds = list(db['rss_feeds'].find())
if not feeds:
    print("\n❌ No RSS feeds in database!")
    print("\nAdd a feed first:")
    print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
    print(" -H 'Content-Type: application/json' \\")
    print(" -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
    exit(1)

print(f"✓ Found {len(feeds)} feed(s)\n")
# Test each feed
total_success = 0
total_fail = 0

for feed_doc in feeds:
    name = feed_doc.get('name', 'Unknown')
    url = feed_doc.get('url', '')
    active = feed_doc.get('active', True)

    print("\n" + "="*80)
    print(f"Feed: {name}")
    print(f"URL: {url}")
    print(f"Active: {'Yes' if active else 'No'}")
    print("="*80)

    if not active:
        print("⏭ Skipping (inactive)")
        continue

    try:
        # Parse RSS
        print("\nFetching RSS feed...")
        feed = feedparser.parse(url)

        if not feed.entries:
            print("❌ No entries found in feed")
            continue

        print(f"✓ Found {len(feed.entries)} entries")

        # Test first 3 entries
        print("\nTesting first 3 entries:")
        print("-" * 80)

        for i, entry in enumerate(feed.entries[:3], 1):
            print(f"\n📰 Entry {i}:")

            # Title
            title = entry.get('title', 'No title')
            print(f" Title: {title[:65]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f" ✓ URL: {article_url}")
                total_success += 1
            else:
                print(" ❌ Could not extract URL")
                print(f" Available fields: {list(entry.keys())[:10]}")
                print(f" link: {entry.get('link', 'N/A')}")
                print(f" guid: {entry.get('guid', 'N/A')}")
                print(f" id: {entry.get('id', 'N/A')}")
                total_fail += 1

            # Test summary
            summary = extract_article_summary(entry)
            if summary:
                print(f" ✓ Summary: {summary[:70]}...")
            else:
                print(" ⚠ No summary")

            # Test date
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f" ✓ Date: {pub_date}")
            else:
                print(" ⚠ No date")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

# Summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total URLs tested: {total_success + total_fail}")
print(f"✓ Successfully extracted: {total_success}")
print(f"❌ Failed to extract: {total_fail}")

if total_fail == 0:
    print("\n🎉 All URLs extracted successfully!")
    print("\nYou can now run the crawler:")
    print(" cd ../news_crawler")
    print(" pip install -r requirements.txt")
    print(" python crawler_service.py 5")
else:
    print(f"\n{total_fail} URL(s) could not be extracted")
    print("Check the output above for details")

print("="*80 + "\n")