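"""Unit tests for the Crawl4AICrawler web crawler loader."""
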
import sys
import unittest
import warnings
from unittest.mock import patch, MagicMock

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler import Crawl4AICrawler


class TestCrawl4AICrawler(unittest.TestCase):
    """Tests for the Crawl4AICrawler class."""

    def setUp(self):
        """Set up test fixtures."""
        # Silence the "coroutine ... never awaited" warnings triggered by mocking async code
        warnings.filterwarnings('ignore', message='coroutine.*never awaited')

        # Replace the crawl4ai module with a mock so the tests run without the real dependency
        self.crawl4ai_patcher = patch.dict('sys.modules', {'crawl4ai': MagicMock()})
        self.crawl4ai_patcher.start()

        # Create mocks for the classes
        self.mock_async_web_crawler = MagicMock()
        self.mock_browser_config = MagicMock()

        # Set up the from_kwargs factory method
        self.mock_config_instance = MagicMock()
        self.mock_browser_config.from_kwargs.return_value = self.mock_config_instance

        # Attach the mocks to the mocked crawl4ai module
        sys.modules['crawl4ai'].AsyncWebCrawler = self.mock_async_web_crawler
        sys.modules['crawl4ai'].BrowserConfig = self.mock_browser_config

        # Set up mock instances
        self.mock_crawler_instance = MagicMock()
        self.mock_async_web_crawler.return_value = self.mock_crawler_instance

        # Support async context manager usage (async with AsyncWebCrawler(...) as crawler)
        self.mock_crawler_instance.__aenter__.return_value = self.mock_crawler_instance
        self.mock_crawler_instance.__aexit__.return_value = None

        # Create a test browser_config
        self.test_browser_config = {"headless": True}

        # Create the crawler under test
        self.crawler = Crawl4AICrawler(browser_config=self.test_browser_config)

    def tearDown(self):
        """Clean up test fixtures."""
        self.crawl4ai_patcher.stop()

    def test_init(self):
        """Test initialization."""
        # Verify that the browser_config was stored
        self.assertEqual(self.crawler.browser_config, self.test_browser_config)

        # Verify that the underlying crawler is not created until first use
        self.assertIsNone(self.crawler.crawler)

    def test_lazy_init(self):
        """Test the lazy initialization of the crawler."""
        # Call the _lazy_init method
        self.crawler._lazy_init()

        # Verify BrowserConfig.from_kwargs was called with the stored config
        self.mock_browser_config.from_kwargs.assert_called_once_with(self.test_browser_config)

        # Verify AsyncWebCrawler was initialized with the resulting config
        self.mock_async_web_crawler.assert_called_once_with(config=self.mock_config_instance)

        # Verify that the crawler is now set
        self.assertEqual(self.crawler.crawler, self.mock_crawler_instance)

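    # Hedged extra check, not in the original suite: assuming _lazy_init only
    # constructs the crawler when self.crawler is still None, a second call
    # should not create another AsyncWebCrawler instance.
    def test_lazy_init_idempotent(self):
        """Sketch: repeated _lazy_init calls reuse the existing crawler (assumed behavior)."""
        self.crawler._lazy_init()
        self.crawler._lazy_init()
        self.mock_async_web_crawler.assert_called_once()
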
    @patch('deepsearcher.loader.web_crawler.crawl4ai_crawler.asyncio.run')
    def test_crawl_url(self, mock_asyncio_run):
        """Test crawling a single URL."""
        url = "https://example.com"

        # Set up a mock document
        mock_document = Document(
            page_content="# Example Page\nThis is a test page.",
            metadata={"reference": url, "title": "Example Page"},
        )

        # Configure asyncio.run to return the document
        mock_asyncio_run.return_value = mock_document

        # Call the method
        documents = self.crawler.crawl_url(url)

        # Verify asyncio.run was called with the _async_crawl coroutine
        mock_asyncio_run.assert_called_once()

        # Check results
        self.assertEqual(len(documents), 1)
        self.assertEqual(documents[0], mock_document)

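    # Hedged sketch, not in the original suite: crawl_url presumably hands the
    # _async_crawl coroutine to asyncio.run, so the first positional argument
    # of the mocked call should be a coroutine object (assumed behavior).
    @patch('deepsearcher.loader.web_crawler.crawl4ai_crawler.asyncio.run')
    def test_crawl_url_passes_coroutine(self, mock_asyncio_run):
        """Sketch: crawl_url passes a coroutine to asyncio.run (assumed behavior)."""
        import inspect  # local import; only this sketch needs it

        self.crawler.crawl_url("https://example.com")

        args, _ = mock_asyncio_run.call_args
        self.assertTrue(inspect.iscoroutine(args[0]))
        args[0].close()  # close the never-awaited coroutine to avoid a RuntimeWarning
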
    @patch('deepsearcher.loader.web_crawler.crawl4ai_crawler.asyncio.run')
    def test_crawl_url_error(self, mock_asyncio_run):
        """Test error handling when crawling a URL."""
        url = "https://example.com"

        # Configure asyncio.run to raise an exception
        mock_asyncio_run.side_effect = Exception("Test error")

        # Call the method
        documents = self.crawler.crawl_url(url)

        # Should return an empty list on error
        self.assertEqual(documents, [])

    @patch('deepsearcher.loader.web_crawler.crawl4ai_crawler.asyncio.run')
    def test_crawl_urls(self, mock_asyncio_run):
        """Test crawling multiple URLs."""
        urls = ["https://example.com", "https://example.org"]

        # Set up mock documents
        mock_documents = [
            Document(
                page_content="# Example Page 1\nThis is test page 1.",
                metadata={"reference": urls[0], "title": "Example Page 1"},
            ),
            Document(
                page_content="# Example Page 2\nThis is test page 2.",
                metadata={"reference": urls[1], "title": "Example Page 2"},
            ),
        ]

        # Configure asyncio.run to return the documents
        mock_asyncio_run.return_value = mock_documents

        # Call the method
        documents = self.crawler.crawl_urls(urls)

        # Verify asyncio.run was called with the _async_crawl_many coroutine
        mock_asyncio_run.assert_called_once()

        # Check results
        self.assertEqual(documents, mock_documents)

    @patch('deepsearcher.loader.web_crawler.crawl4ai_crawler.asyncio.run')
    def test_crawl_urls_error(self, mock_asyncio_run):
        """Test error handling when crawling multiple URLs."""
        urls = ["https://example.com", "https://example.org"]

        # Configure asyncio.run to raise an exception
        mock_asyncio_run.side_effect = Exception("Test error")

        # Call the method
        documents = self.crawler.crawl_urls(urls)

        # Should return an empty list on error
        self.assertEqual(documents, [])

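    # Hedged extra check, not in the original suite: with asyncio.run mocked to
    # return an empty list, crawl_urls with no URLs should come back empty,
    # matching the contract implied by test_crawl_urls.
    @patch('deepsearcher.loader.web_crawler.crawl4ai_crawler.asyncio.run')
    def test_crawl_urls_empty(self, mock_asyncio_run):
        """Sketch: crawling an empty URL list yields an empty document list."""
        mock_asyncio_run.return_value = []
        documents = self.crawler.crawl_urls([])
        self.assertEqual(documents, [])

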
if __name__ == "__main__":
    unittest.main()