update
This commit is contained in:
128
backend/test_rss_extraction.py
Normal file
128
backend/test_rss_extraction.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test RSS feed URL extraction
|
||||
Run from backend directory with venv activated:
|
||||
cd backend
|
||||
source venv/bin/activate # or venv\Scripts\activate on Windows
|
||||
python test_rss_extraction.py
|
||||
"""
|
||||
from pymongo import MongoClient
|
||||
from config import Config
|
||||
import feedparser
|
||||
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("RSS Feed URL Extraction Test")
|
||||
print("="*80)
|
||||
|
||||
# Connect to database
|
||||
print(f"\nConnecting to MongoDB: {Config.MONGODB_URI}")
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
# Get RSS feeds
|
||||
print("Fetching RSS feeds from database...")
|
||||
feeds = list(db['rss_feeds'].find())
|
||||
|
||||
if not feeds:
|
||||
print("\n❌ No RSS feeds in database!")
|
||||
print("\nAdd a feed first:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Süddeutsche Politik\", \"url\": \"https://rss.sueddeutsche.de/rss/Politik\"}'")
|
||||
exit(1)
|
||||
|
||||
print(f"✓ Found {len(feeds)} feed(s)\n")
|
||||
|
||||
# Test each feed
|
||||
total_success = 0
|
||||
total_fail = 0
|
||||
|
||||
for feed_doc in feeds:
|
||||
name = feed_doc.get('name', 'Unknown')
|
||||
url = feed_doc.get('url', '')
|
||||
active = feed_doc.get('active', True)
|
||||
|
||||
print("\n" + "="*80)
|
||||
print(f"Feed: {name}")
|
||||
print(f"URL: {url}")
|
||||
print(f"Active: {'Yes' if active else 'No'}")
|
||||
print("="*80)
|
||||
|
||||
if not active:
|
||||
print("⏭ Skipping (inactive)")
|
||||
continue
|
||||
|
||||
try:
|
||||
# Parse RSS
|
||||
print("\nFetching RSS feed...")
|
||||
feed = feedparser.parse(url)
|
||||
|
||||
if not feed.entries:
|
||||
print("❌ No entries found in feed")
|
||||
continue
|
||||
|
||||
print(f"✓ Found {len(feed.entries)} entries")
|
||||
|
||||
# Test first 3 entries
|
||||
print(f"\nTesting first 3 entries:")
|
||||
print("-" * 80)
|
||||
|
||||
for i, entry in enumerate(feed.entries[:3], 1):
|
||||
print(f"\n📰 Entry {i}:")
|
||||
|
||||
# Title
|
||||
title = entry.get('title', 'No title')
|
||||
print(f" Title: {title[:65]}")
|
||||
|
||||
# Test URL extraction
|
||||
article_url = extract_article_url(entry)
|
||||
if article_url:
|
||||
print(f" ✓ URL: {article_url}")
|
||||
total_success += 1
|
||||
else:
|
||||
print(f" ❌ Could not extract URL")
|
||||
print(f" Available fields: {list(entry.keys())[:10]}")
|
||||
print(f" link: {entry.get('link', 'N/A')}")
|
||||
print(f" guid: {entry.get('guid', 'N/A')}")
|
||||
print(f" id: {entry.get('id', 'N/A')}")
|
||||
total_fail += 1
|
||||
|
||||
# Test summary
|
||||
summary = extract_article_summary(entry)
|
||||
if summary:
|
||||
print(f" ✓ Summary: {summary[:70]}...")
|
||||
else:
|
||||
print(f" ⚠ No summary")
|
||||
|
||||
# Test date
|
||||
pub_date = extract_published_date(entry)
|
||||
if pub_date:
|
||||
print(f" ✓ Date: {pub_date}")
|
||||
else:
|
||||
print(f" ⚠ No date")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*80)
|
||||
print("SUMMARY")
|
||||
print("="*80)
|
||||
print(f"Total URLs tested: {total_success + total_fail}")
|
||||
print(f"✓ Successfully extracted: {total_success}")
|
||||
print(f"❌ Failed to extract: {total_fail}")
|
||||
|
||||
if total_fail == 0:
|
||||
print("\n🎉 All URLs extracted successfully!")
|
||||
print("\nYou can now run the crawler:")
|
||||
print(" cd ../news_crawler")
|
||||
print(" pip install -r requirements.txt")
|
||||
print(" python crawler_service.py 5")
|
||||
else:
|
||||
print(f"\n⚠ {total_fail} URL(s) could not be extracted")
|
||||
print("Check the output above for details")
|
||||
|
||||
print("="*80 + "\n")
|
||||
Reference in New Issue
Block a user