2025-11-11 14:09:21 +01:00
parent bcd0a10576
commit 1075a91eac
57 changed files with 5598 additions and 1366 deletions


@@ -0,0 +1,83 @@
#!/usr/bin/env python
"""
Test script to verify crawler functionality
"""
import sys

from crawler_service import extract_article_content, get_active_rss_feeds


def test_content_extraction():
    """Test content extraction from a sample URL"""
    print("Testing content extraction...")

    # Test with a simple news site
    test_url = "https://www.bbc.com/news"
    print(f"Extracting content from: {test_url}")

    result = extract_article_content(test_url, timeout=10)

    if result:
        print("✓ Content extraction successful!")
        print(f"  Title: {result.get('title', 'N/A')[:50]}...")
        print(f"  Content length: {len(result.get('content', ''))} chars")
        print(f"  Word count: {result.get('word_count', 0)}")
        return True
    else:
        print("✗ Content extraction failed")
        return False


def test_database_connection():
    """Test MongoDB connection"""
    print("\nTesting database connection...")
    try:
        feeds = get_active_rss_feeds()
        print("✓ Database connection successful!")
        print(f"  Found {len(feeds)} active RSS feed(s)")
        if feeds:
            print("\n  Active feeds:")
            for feed in feeds:
                print(f"    - {feed['name']}: {feed['url']}")
        else:
            print("\n  ⚠ No active feeds found. Add feeds via the backend API:")
            print("    curl -X POST http://localhost:5001/api/rss-feeds \\")
            print("      -H 'Content-Type: application/json' \\")
            print("      -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'")
        return True
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return False


def main():
    print("="*60)
    print("News Crawler - Test Suite")
    print("="*60 + "\n")

    # Test database connection
    db_ok = test_database_connection()

    # Test content extraction
    extract_ok = test_content_extraction()

    print("\n" + "="*60)
    print("Test Results:")
    print(f"  Database Connection: {'✓ PASS' if db_ok else '✗ FAIL'}")
    print(f"  Content Extraction: {'✓ PASS' if extract_ok else '✗ FAIL'}")
    print("="*60 + "\n")

    if db_ok and extract_ok:
        print("✓ All tests passed! Crawler is ready to use.")
        print("\nRun the crawler with:")
        print("  python crawler_service.py")
        return 0
    else:
        print("✗ Some tests failed. Please check the errors above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())
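
For reference, test_content_extraction() relies only on the contract of crawler_service.extract_article_content: a dict with title, content, and word_count keys, or None on failure. A minimal sketch of that contract, assuming a requests + BeautifulSoup implementation (the real crawler_service may work differently):

# Hypothetical sketch of the extract_article_content contract the test
# above exercises. Only the return shape is taken from the test script;
# the fetching and parsing strategy here is an assumption.
import requests
from bs4 import BeautifulSoup

def extract_article_content(url, timeout=10):
    try:
        resp = requests.get(url, timeout=timeout,
                            headers={'User-Agent': 'Mozilla/5.0'})
        resp.raise_for_status()
    except requests.RequestException:
        return None  # the test treats None as a failed extraction
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = soup.title.string.strip() if soup.title and soup.title.string else ''
    # Crude body extraction: join the text of all paragraph tags
    content = '\n'.join(p.get_text(' ', strip=True) for p in soup.find_all('p'))
    return {'title': title, 'content': content, 'word_count': len(content.split())}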


@@ -0,0 +1,129 @@
#!/usr/bin/env python
"""
Test script for Ollama integration
Tests connection, configuration, and summarization
"""
import sys

from config import Config
from ollama_client import OllamaClient

print("\n" + "="*70)
print("Ollama Integration Test")
print("="*70)

# Print configuration
Config.print_config()

# Validate configuration
issues = Config.validate()
if issues:
    print("⚠ Configuration Issues:")
    for issue in issues:
        print(f"  - {issue}")
    print()

# Initialize client
client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
)

# Test 1: Check if Ollama is enabled
print("Test 1: Configuration Check")
print(f"  Ollama Enabled: {Config.OLLAMA_ENABLED}")
if not Config.OLLAMA_ENABLED:
    print("  ⚠ Ollama is disabled. Set OLLAMA_ENABLED=true in .env to enable.")
    print("\n" + "="*70)
    sys.exit(0)

# Test 2: Test connection
print("\nTest 2: Connection Test")
conn_result = client.test_connection()
print(f"  Available: {conn_result['available']}")
print(f"  Current Model: {conn_result['current_model']}")
if conn_result['available']:
    print("  ✓ Connected to Ollama server")
    if conn_result['models']:
        print(f"  Available models: {', '.join(conn_result['models'][:5])}")
        if conn_result['current_model'] not in conn_result['models']:
            print(f"  ⚠ Warning: Model '{conn_result['current_model']}' not found in available models")
else:
    print(f"  ✗ Connection failed: {conn_result['error']}")
    print("\n" + "="*70)
    sys.exit(1)

# Test 3: Test summarization with a sample German article
print("\nTest 3: Summarization Test")
print("  Testing with sample German article...")

sample_article = """
Die neue U-Bahn-Linie, die das Münchner Stadtzentrum mit dem Flughafen verbindet, wurde heute eröffnet.
Oberbürgermeister Dieter Reiter nahm zusammen mit hunderten Anwohnern an der Eröffnungszeremonie teil.
Die Linie wird die Reisezeit zwischen dem Flughafen und der Münchner Innenstadt erheblich verkürzen.
Der Bau dauerte fünf Jahre und kostete etwa 2 Milliarden Euro. Die neue Linie umfasst 10 Stationen
und verkehrt während der Hauptverkehrszeiten alle 10 Minuten. Experten erwarten, dass die neue Verbindung
den Verkehr in der Stadt deutlich entlasten wird. Die Münchner Verkehrsgesellschaft rechnet mit täglich
über 50.000 Fahrgästen auf der neuen Strecke.
"""

result = client.summarize_article(sample_article, max_words=Config.SUMMARY_MAX_WORDS)

print(f"\n  Success: {result['success']}")
if result['success']:
    print("  ✓ Summarization successful!")
    print(f"\n  Original word count: {result['original_word_count']}")
    print(f"  Summary word count: {result['summary_word_count']}")
    print(f"  Compression ratio: {result['original_word_count'] / max(result['summary_word_count'], 1):.1f}x")
    print(f"  Duration: {result['duration']:.2f}s")
    print("\n  Summary (English):")
    print(f"  {'-'*70}")
    print(f"  {result['summary']}")
    print(f"  {'-'*70}")
else:
    print(f"  ✗ Summarization failed: {result['error']}")

# Test 4: Test with an English article
print("\nTest 4: English Article Test")
print("  Testing with English article...")

english_article = """
The city council approved a new bike lane network spanning 50 kilometers across Munich.
The project aims to promote sustainable transportation and reduce car traffic in the city center.
Construction will begin next month and is expected to be completed within two years.
The bike lanes will connect major residential areas with business districts and public transport hubs.
Environmental groups have praised the initiative as a significant step toward carbon neutrality.
"""

result2 = client.summarize_article(english_article, max_words=50)

print(f"\n  Success: {result2['success']}")
if result2['success']:
    print("  ✓ Summarization successful!")
    print(f"  Original: {result2['original_word_count']} words → Summary: {result2['summary_word_count']} words")
    print(f"  Duration: {result2['duration']:.2f}s")
    print("\n  Summary:")
    print(f"  {result2['summary']}")
else:
    print(f"  ✗ Summarization failed: {result2['error']}")

# Summary
print("\n" + "="*70)
print("Test Summary")
print("="*70)
print(f"Configuration:   {'✓ Valid' if not issues else '⚠ Has issues'}")
print(f"Connection:      {'✓ Success' if conn_result['available'] else '✗ Failed'}")
print(f"German→English:  {'✓ Success' if result['success'] else '✗ Failed'}")
print(f"English→English: {'✓ Success' if result2['success'] else '✗ Failed'}")
print("="*70)

if result['success'] and result2['success']:
    print("\n🎉 All tests passed! Ollama integration is working correctly.")
    print("\nYou can now run the crawler with AI summarization:")
    print("  python crawler_service.py 5")
else:
    print("\n⚠ Some tests failed. Check the errors above.")
print()
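
For reference, a hedged sketch of the OllamaClient surface this script exercises, built on Ollama's /api/generate endpoint. The result keys (success, summary, duration, and the word counts) are inferred from how the test reads them; the real ollama_client module may differ:

# Assumed shape of OllamaClient, reconstructed from this test script.
import time
import requests

class OllamaClient:
    def __init__(self, base_url, model, api_key=None, enabled=True, timeout=60):
        self.base_url = base_url.rstrip('/')
        self.model = model
        self.api_key = api_key
        self.enabled = enabled
        self.timeout = timeout

    def summarize_article(self, text, max_words=100):
        prompt = (f"Summarize the following article in English, "
                  f"in at most {max_words} words:\n\n{text}")
        headers = {'Authorization': f'Bearer {self.api_key}'} if self.api_key else {}
        start = time.time()
        try:
            resp = requests.post(
                f"{self.base_url}/api/generate",
                json={'model': self.model, 'prompt': prompt, 'stream': False},
                headers=headers,
                timeout=self.timeout,
            )
            resp.raise_for_status()
            summary = resp.json().get('response', '').strip()
        except requests.RequestException as e:
            return {'success': False, 'error': str(e)}
        return {
            'success': True,
            'summary': summary,
            'original_word_count': len(text.split()),
            'summary_word_count': len(summary.split()),
            'duration': time.time() - start,
        }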


@@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction
Tests actual feeds from the database
"""
import os
import sys

import feedparser
from dotenv import load_dotenv
from pymongo import MongoClient

from rss_utils import extract_article_url, extract_article_summary, extract_published_date

# Load environment variables
load_dotenv(dotenv_path='../.env')

# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'

client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']


def test_feed(feed_name, feed_url):
    """Test a single RSS feed"""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)

    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)

        if not feed.entries:
            print("❌ No entries found in feed")
            return False

        print(f"✓ Found {len(feed.entries)} entries\n")

        # Test the first five entries (or fewer if the feed is short)
        entries_to_test = feed.entries[:5]
        success_count = 0
        fail_count = 0

        for i, entry in enumerate(entries_to_test, 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                print("❌ No valid URL found")
                print(f"  Available fields: {list(entry.keys())}")
                print(f"  link: {entry.get('link', 'N/A')}")
                print(f"  guid: {entry.get('guid', 'N/A')}")
                print(f"  id: {entry.get('id', 'N/A')}")
                fail_count += 1

            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print("⚠ No summary found")

            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print("⚠ No published date found")

        tested = len(entries_to_test)
        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f"  ✓ Success: {success_count}/{tested}")
        print(f"  ❌ Failed: {fail_count}/{tested}")
        print('='*70)

        return fail_count == 0

    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False


def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)

    # Get all RSS feeds from database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())

    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1

    print(f"✓ Found {len(feeds)} feed(s) in database\n")

    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)

        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue

        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue

        results[feed_name] = test_feed(feed_name, feed_url)

    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)
    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)
    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")

    # total == 0 means every feed was skipped or missing, which is not a pass
    if total > 0 and passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print("  python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())
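
For reference, the failure branch in test_feed() prints the entry's link, guid, and id fields, which suggests rss_utils.extract_article_url tries those fields in order and validates each candidate. A plausible sketch under that assumption (the actual helper may apply different rules):

# Hypothetical reconstruction of extract_article_url, inferred from the
# fallback fields this test prints on failure; not the confirmed helper.
from urllib.parse import urlparse

def _is_http_url(value):
    """Accept only absolute http(s) URLs."""
    if not isinstance(value, str):
        return False
    parsed = urlparse(value)
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

def extract_article_url(entry):
    # Try the common feedparser fields in order of reliability.
    for field in ('link', 'guid', 'id'):
        candidate = entry.get(field)
        if _is_http_url(candidate):
            return candidate
    return None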