84 lines
2.5 KiB
Python
84 lines
2.5 KiB
Python
#!/usr/bin/env python
|
|
"""
Test script to verify crawler functionality
"""
|
|
from crawler_service import extract_article_content, get_active_rss_feeds
|
|
import sys
|
|
|
|
|
|
def test_content_extraction():
    """Test content extraction from a sample URL.

    Calls ``extract_article_content`` on a fixed URL with a 10-second
    timeout and prints a short summary of whatever comes back.

    Returns:
        bool: True when the extractor returns a truthy result; False when
        it returns a falsy result or raises an exception.
    """
    print("Testing content extraction...")

    # Test with a simple news site
    test_url = "https://www.bbc.com/news"
    print(f"Extracting content from: {test_url}")

    # Report a raised error as a failed check, the same way
    # test_database_connection does, so one failing check does not abort
    # the run before the results summary is printed.
    try:
        result = extract_article_content(test_url, timeout=10)
    except Exception as e:
        print(f"✗ Content extraction failed: {e}")
        return False

    if not result:
        print("✗ Content extraction failed")
        return False

    # A .get() default only covers a *missing* key. Fall back with `or` so
    # a key that is present but None (or empty) cannot break the slicing
    # and len() calls below.
    title = str(result.get('title') or 'N/A')
    content = result.get('content') or ''
    word_count = result.get('word_count') or 0

    print("✓ Content extraction successful!")
    print(f" Title: {title[:50]}...")
    print(f" Content length: {len(content)} chars")
    print(f" Word count: {word_count}")
    return True
|
|
|
|
|
|
def test_database_connection():
    """Check that the active RSS feeds can be fetched.

    Calls ``get_active_rss_feeds`` and prints either the feeds it returned
    or a hint showing how to add one through the backend API.

    Returns:
        bool: True when the feeds were fetched and listed without an
        exception, False when any exception was raised along the way.
    """
    print("\nTesting database connection...")

    try:
        active_feeds = get_active_rss_feeds()
        print("✓ Database connection successful!")
        print(f" Found {len(active_feeds)} active RSS feed(s)")

        if not active_feeds:
            # Nothing configured yet: show an example request instead.
            hint_lines = (
                "\n ⚠ No active feeds found. Add feeds via the backend API:",
                " curl -X POST http://localhost:5001/api/rss-feeds \\",
                " -H 'Content-Type: application/json' \\",
                " -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'",
            )
            for hint in hint_lines:
                print(hint)
        else:
            print("\n Active feeds:")
            for entry in active_feeds:
                print(f" - {entry['name']}: {entry['url']}")
    except Exception as exc:
        # Any error, including one raised while listing the feeds, is
        # reported as a connection failure.
        print(f"✗ Database connection failed: {exc}")
        return False

    return True
|
|
|
|
|
|
def main():
    """Run both crawler checks and print a pass/fail summary.

    Returns:
        int: 0 when both checks pass, 1 otherwise.
    """
    rule = "=" * 60

    def verdict(passed):
        # Label shown next to each check in the summary.
        return '✓ PASS' if passed else '✗ FAIL'

    print(rule)
    print("News Crawler - Test Suite")
    print(rule + "\n")

    # The database check runs first, then content extraction.
    db_ok = test_database_connection()
    extract_ok = test_content_extraction()

    print("\n" + rule)
    print("Test Results:")
    print(f" Database Connection: {verdict(db_ok)}")
    print(f" Content Extraction: {verdict(extract_ok)}")
    print(rule + "\n")

    if not (db_ok and extract_ok):
        print("✗ Some tests failed. Please check the errors above.")
        return 1

    print("✓ All tests passed! Crawler is ready to use.")
    print("\nRun the crawler with:")
    print(" python crawler_service.py")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
|