update
83  tests/crawler/test_crawler.py  Normal file
@@ -0,0 +1,83 @@
#!/usr/bin/env python
"""
Test script to verify crawler functionality
"""
from crawler_service import extract_article_content, get_active_rss_feeds
import sys


def test_content_extraction():
    """Test content extraction from a sample URL"""
    print("Testing content extraction...")

    # Test with a simple news site
    test_url = "https://www.bbc.com/news"

    print(f"Extracting content from: {test_url}")
    result = extract_article_content(test_url, timeout=10)

    if result:
        print("✓ Content extraction successful!")
        print(f"  Title: {result.get('title', 'N/A')[:50]}...")
        print(f"  Content length: {len(result.get('content', ''))} chars")
        print(f"  Word count: {result.get('word_count', 0)}")
        return True
    else:
        print("✗ Content extraction failed")
        return False


def test_database_connection():
    """Test MongoDB connection"""
    print("\nTesting database connection...")

    try:
        feeds = get_active_rss_feeds()
        print("✓ Database connection successful!")
        print(f"  Found {len(feeds)} active RSS feed(s)")

        if feeds:
            print("\n  Active feeds:")
            for feed in feeds:
                print(f"    - {feed['name']}: {feed['url']}")
        else:
            print("\n  ⚠ No active feeds found. Add feeds via the backend API:")
            print("    curl -X POST http://localhost:5001/api/rss-feeds \\")
            print("      -H 'Content-Type: application/json' \\")
            print("      -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'")

        return True
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return False


def main():
    print("="*60)
    print("News Crawler - Test Suite")
    print("="*60 + "\n")

    # Test database connection
    db_ok = test_database_connection()

    # Test content extraction
    extract_ok = test_content_extraction()

    print("\n" + "="*60)
    print("Test Results:")
    print(f"  Database Connection: {'✓ PASS' if db_ok else '✗ FAIL'}")
    print(f"  Content Extraction: {'✓ PASS' if extract_ok else '✗ FAIL'}")
    print("="*60 + "\n")

    if db_ok and extract_ok:
        print("✓ All tests passed! Crawler is ready to use.")
        print("\nRun the crawler with:")
        print("  python crawler_service.py")
        return 0
    else:
        print("✗ Some tests failed. Please check the errors above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())
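Review note: test_crawler.py exercises only the happy path, and the shape of `result` is left implicit. From how the test consumes it, `extract_article_content` presumably returns a dict with `title`, `content`, and `word_count` keys, or a falsy value on failure, and `get_active_rss_feeds` returns a list of feed dicts with `name` and `url` keys. A minimal sketch of that assumed contract follows; the requests/BeautifulSoup internals are illustrative guesses, not the actual crawler_service implementation.

# Hypothetical sketch of the contract the test above relies on.
# Only the return shape is inferred from the test; the extraction
# logic is a placeholder, not the real crawler_service code.
import requests
from bs4 import BeautifulSoup

def extract_article_content(url, timeout=10):
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException:
        return None  # the test treats any falsy result as a failure
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = soup.title.get_text(strip=True) if soup.title else ''
    # Crude body extraction: join the text of all paragraph tags
    content = ' '.join(p.get_text(' ', strip=True) for p in soup.find_all('p'))
    return {
        'title': title,
        'content': content,
        'word_count': len(content.split()),
    }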
129  tests/crawler/test_ollama.py  Normal file
@@ -0,0 +1,129 @@
#!/usr/bin/env python
"""
Test script for Ollama integration
Tests connection, configuration, and summarization
"""
from config import Config
from ollama_client import OllamaClient

print("\n" + "="*70)
print("Ollama Integration Test")
print("="*70)

# Print configuration
Config.print_config()

# Validate configuration
issues = Config.validate()
if issues:
    print("⚠ Configuration Issues:")
    for issue in issues:
        print(f"  - {issue}")
    print()

# Initialize client
client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
)

# Test 1: Check if Ollama is enabled
print("Test 1: Configuration Check")
print(f"  Ollama Enabled: {Config.OLLAMA_ENABLED}")
if not Config.OLLAMA_ENABLED:
    print("  ⚠ Ollama is disabled. Set OLLAMA_ENABLED=true in .env to enable.")
    print("\n" + "="*70)
    exit(0)

# Test 2: Test connection
print("\nTest 2: Connection Test")
conn_result = client.test_connection()
print(f"  Available: {conn_result['available']}")
print(f"  Current Model: {conn_result['current_model']}")

if conn_result['available']:
    print("  ✓ Connected to Ollama server")
    if conn_result['models']:
        print(f"  Available models: {', '.join(conn_result['models'][:5])}")
        if conn_result['current_model'] not in conn_result['models']:
            print(f"  ⚠ Warning: Model '{conn_result['current_model']}' not found in available models")
else:
    print(f"  ✗ Connection failed: {conn_result['error']}")
    print("\n" + "="*70)
    exit(1)

# Test 3: Test summarization with sample article
print("\nTest 3: Summarization Test")
print("  Testing with sample German article...")

sample_article = """
Die neue U-Bahn-Linie, die das Münchner Stadtzentrum mit dem Flughafen verbindet, wurde heute eröffnet.
Oberbürgermeister Dieter Reiter nahm zusammen mit hunderten Anwohnern an der Eröffnungszeremonie teil.
Die Linie wird die Reisezeit zwischen dem Flughafen und der Münchner Innenstadt erheblich verkürzen.
Der Bau dauerte fünf Jahre und kostete etwa 2 Milliarden Euro. Die neue Linie umfasst 10 Stationen
und verkehrt während der Hauptverkehrszeiten alle 10 Minuten. Experten erwarten, dass die neue Verbindung
den Verkehr in der Stadt deutlich entlasten wird. Die Münchner Verkehrsgesellschaft rechnet mit täglich
über 50.000 Fahrgästen auf der neuen Strecke.
"""

result = client.summarize_article(sample_article, max_words=Config.SUMMARY_MAX_WORDS)

print(f"\n  Success: {result['success']}")
if result['success']:
    print("  ✓ Summarization successful!")
    print(f"\n  Original word count: {result['original_word_count']}")
    print(f"  Summary word count: {result['summary_word_count']}")
    print(f"  Compression ratio: {result['original_word_count'] / max(result['summary_word_count'], 1):.1f}x")
    print(f"  Duration: {result['duration']:.2f}s")
    print("\n  Summary (English):")
    print(f"  {'-'*70}")
    print(f"  {result['summary']}")
    print(f"  {'-'*70}")
else:
    print(f"  ✗ Summarization failed: {result['error']}")

# Test 4: Test with English article
print("\nTest 4: English Article Test")
print("  Testing with English article...")

english_article = """
The city council approved a new bike lane network spanning 50 kilometers across Munich.
The project aims to promote sustainable transportation and reduce car traffic in the city center.
Construction will begin next month and is expected to be completed within two years.
The bike lanes will connect major residential areas with business districts and public transport hubs.
Environmental groups have praised the initiative as a significant step toward carbon neutrality.
"""

result2 = client.summarize_article(english_article, max_words=50)

print(f"\n  Success: {result2['success']}")
if result2['success']:
    print("  ✓ Summarization successful!")
    print(f"  Original: {result2['original_word_count']} words → Summary: {result2['summary_word_count']} words")
    print(f"  Duration: {result2['duration']:.2f}s")
    print("\n  Summary:")
    print(f"  {result2['summary']}")
else:
    print(f"  ✗ Summarization failed: {result2['error']}")

# Summary
print("\n" + "="*70)
print("Test Summary")
print("="*70)
print("✓ Configuration: Valid")
print(f"✓ Connection: {'Success' if conn_result['available'] else 'Failed'}")
print(f"✓ German→English: {'Success' if result['success'] else 'Failed'}")
print(f"✓ English→English: {'Success' if result2['success'] else 'Failed'}")
print("="*70)

if result['success'] and result2['success']:
    print("\n🎉 All tests passed! Ollama integration is working correctly.")
    print("\nYou can now run the crawler with AI summarization:")
    print("  python crawler_service.py 5")
else:
    print("\n⚠ Some tests failed. Check the errors above.")

print()
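Review note: test_ollama.py pins down the result contract of `OllamaClient.summarize_article` without showing it: a dict with `success`, `summary`, `original_word_count`, `summary_word_count`, `duration`, and `error` keys. Below is a minimal sketch of a client satisfying that contract against Ollama's documented `/api/generate` endpoint; everything beyond the keys the test reads (prompt wording, HTTP details, the `api_key` handling) is an assumption, not the real ollama_client code.

# Hypothetical sketch; only the returned keys are taken from the test.
import time
import requests

class OllamaClient:
    def __init__(self, base_url, model, api_key=None, enabled=True, timeout=120):
        self.base_url = base_url.rstrip('/')
        self.model = model
        self.api_key = api_key  # assumed: sent as a Bearer token when set
        self.enabled = enabled
        self.timeout = timeout

    def summarize_article(self, text, max_words=100):
        start = time.time()
        prompt = (f"Summarize the following article in English "
                  f"in at most {max_words} words:\n\n{text}")
        headers = {'Authorization': f'Bearer {self.api_key}'} if self.api_key else {}
        try:
            resp = requests.post(
                f"{self.base_url}/api/generate",
                json={'model': self.model, 'prompt': prompt, 'stream': False},
                headers=headers,
                timeout=self.timeout,
            )
            resp.raise_for_status()
            summary = resp.json().get('response', '').strip()
            error = None
        except requests.RequestException as e:
            summary, error = '', str(e)
        return {
            'success': error is None,
            'summary': summary,
            'original_word_count': len(text.split()),
            'summary_word_count': len(summary.split()),
            'duration': time.time() - start,
            'error': error,
        }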
154  tests/crawler/test_rss_feeds.py  Normal file
@@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction
Tests actual feeds from the database
"""
import feedparser
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date

# Load environment variables
load_dotenv(dotenv_path='../.env')

# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'

client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']


def test_feed(feed_name, feed_url):
    """Test a single RSS feed"""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)

    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)

        if not feed.entries:
            print("❌ No entries found in feed")
            return False

        print(f"✓ Found {len(feed.entries)} entries\n")

        # Test first 5 entries
        success_count = 0
        fail_count = 0

        for i, entry in enumerate(feed.entries[:5], 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                print("❌ No valid URL found")
                print(f"  Available fields: {list(entry.keys())}")
                print(f"  link: {entry.get('link', 'N/A')}")
                print(f"  guid: {entry.get('guid', 'N/A')}")
                print(f"  id: {entry.get('id', 'N/A')}")
                fail_count += 1

            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print("⚠ No summary found")

            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print("⚠ No published date found")

        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f"  ✓ Success: {success_count}/5")
        print(f"  ❌ Failed: {fail_count}/5")
        print('='*70)

        return fail_count == 0

    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False


def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)

    # Get all RSS feeds from database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())

    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print("  curl -X POST http://localhost:5001/api/rss-feeds \\")
        print("    -H 'Content-Type: application/json' \\")
        print("    -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1

    print(f"✓ Found {len(feeds)} feed(s) in database\n")

    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)

        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue

        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue

        results[feed_name] = test_feed(feed_name, feed_url)

    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)

    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")

    if passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print("  python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1


if __name__ == '__main__':
    import sys
    sys.exit(main())
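Review note: the failure branch in test_feed dumps the entry's `link`, `guid`, and `id` fields, which suggests `rss_utils.extract_article_url` tries those fields in order and checks that the value looks like an HTTP(S) URL. A minimal sketch of that assumed fallback (not the actual rss_utils code):

# Hypothetical sketch of the fallback order the debug output hints at.
def extract_article_url(entry):
    for field in ('link', 'guid', 'id'):
        value = entry.get(field)
        if isinstance(value, str) and value.startswith(('http://', 'https://')):
            return value
    return None

If a feed fails here, the dumped field list makes it easy to see which key actually carries the URL and to extend the fallback accordingly.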