commit ac5738c29d
2025-11-10 19:13:33 +01:00
64 changed files with 9445 additions and 0 deletions

@@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction.
Tests actual feeds from the database.
"""
import os
import sys

import feedparser
from dotenv import load_dotenv
from pymongo import MongoClient

from rss_utils import extract_article_url, extract_article_summary, extract_published_date

# Load environment variables
load_dotenv(dotenv_path='../.env')

# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'

client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']


def test_feed(feed_name, feed_url):
    """Test a single RSS feed."""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)

    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)

        if not feed.entries:
            print("❌ No entries found in feed")
            return False

        print(f"✓ Found {len(feed.entries)} entries\n")

        # Test the first 5 entries (or fewer if the feed is shorter)
        sample = feed.entries[:5]
        success_count = 0
        fail_count = 0

        for i, entry in enumerate(sample, 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                print("❌ No valid URL found")
                print(f"  Available fields: {list(entry.keys())}")
                print(f"  link: {entry.get('link', 'N/A')}")
                print(f"  guid: {entry.get('guid', 'N/A')}")
                print(f"  id: {entry.get('id', 'N/A')}")
                fail_count += 1

            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print("⚠ No summary found")

            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print("⚠ No published date found")

        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f"  ✓ Success: {success_count}/{len(sample)}")
        print(f"  ❌ Failed: {fail_count}/{len(sample)}")
        print('='*70)

        return fail_count == 0

    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False


def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)

    # Get all RSS feeds from the database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())

    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1

    print(f"✓ Found {len(feeds)} feed(s) in database\n")

    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)

        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue

        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue

        results[feed_name] = test_feed(feed_name, feed_url)

    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)
    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")

    if total and passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print("  python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())
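
The script exercises exactly the three helpers it imports from rss_utils; that module ships elsewhere in this commit and is not shown in this excerpt. For orientation, here is a minimal sketch of what they could look like, assuming a link/guid/id fallback order (the same fields the failure branch above prints). The function names match the imports, but the bodies are illustrative guesses, not the committed implementation.

import calendar
from datetime import datetime, timezone


def extract_article_url(entry):
    """Return the first plausible http(s) URL from a feed entry, or None."""
    # Assumed fallback order: link, then guid, then id (illustrative).
    for field in ('link', 'guid', 'id'):
        value = entry.get(field)
        if isinstance(value, str) and value.startswith(('http://', 'https://')):
            return value
    return None


def extract_article_summary(entry):
    """Return the entry's summary/description text, or None."""
    text = entry.get('summary') or entry.get('description')
    return text.strip() if text else None


def extract_published_date(entry):
    """Return the published date as a UTC datetime, or None."""
    # feedparser exposes parsed dates as UTC time.struct_time values
    parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    if parsed:
        return datetime.fromtimestamp(calendar.timegm(parsed), tz=timezone.utc)
    return None

With helpers shaped like this, the test's per-entry checks map one-to-one onto the failure modes it reports: a missing or non-HTTP link fails the URL check, while absent summary or date fields only produce warnings.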