update
news_crawler/test_rss_feeds.py (new file, 154 lines)
@@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction.
Tests actual feeds from the database.
"""
import os
import sys

import feedparser
from dotenv import load_dotenv
from pymongo import MongoClient

from rss_utils import extract_article_url, extract_article_summary, extract_published_date
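
# The rss_utils helpers are assumed (from how they are called below) to take
# a single feedparser entry and return the extracted value, or None when
# nothing usable is found. As a rough sketch, extract_article_url likely
# falls back through the candidate fields this script prints on failure:
#
#     def extract_article_url(entry):
#         for field in ('link', 'guid', 'id'):
#             value = entry.get(field)
#             if value and value.startswith(('http://', 'https://')):
#                 return value
#         return None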

# Load environment variables
load_dotenv(dotenv_path='../.env')

# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'

client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']
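# Documents in rss_feeds are assumed to carry at least
# {'name': str, 'url': str, 'active': bool}; these are the fields
# main() reads below.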


def test_feed(feed_name, feed_url):
    """Test a single RSS feed"""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)

    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)
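        # feedparser.parse() does not raise on fetch or parse errors; it
        # sets feed.bozo (and feed.bozo_exception) instead, so a bad URL
        # usually surfaces below as an empty entries list.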

        if not feed.entries:
            print("❌ No entries found in feed")
            return False

        print(f"✓ Found {len(feed.entries)} entries\n")

        # Test first 5 entries
        success_count = 0
        fail_count = 0

        for i, entry in enumerate(feed.entries[:5], 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                print("❌ No valid URL found")
                print(f"   Available fields: {list(entry.keys())}")
                print(f"   link: {entry.get('link', 'N/A')}")
                print(f"   guid: {entry.get('guid', 'N/A')}")
                print(f"   id: {entry.get('id', 'N/A')}")
                fail_count += 1

            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print("⚠ No summary found")

            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print("⚠ No published date found")

        # Report against the number actually tested, not a hardcoded 5,
        # since a feed may have fewer than 5 entries.
        tested = success_count + fail_count
        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f"  ✓ Success: {success_count}/{tested}")
        print(f"  ❌ Failed: {fail_count}/{tested}")
        print('='*70)

        return fail_count == 0

    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False


def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)

    # Get all RSS feeds from the database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())

    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1

    print(f"✓ Found {len(feeds)} feed(s) in database\n")

    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)

        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue

        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue

        results[feed_name] = test_feed(feed_name, feed_url)

    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)

    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")

    # Require at least one tested feed; otherwise an all-inactive database
    # would be reported as success.
    if total > 0 and passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print("  python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())