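"""Unit tests for FireCrawlCrawler.

FirecrawlApp and the FIRECRAWL_API_KEY environment variable are both
mocked, so the tests run without network access or a real API key.
"""
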
import unittest
import os
from unittest.mock import patch, MagicMock

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler import FireCrawlCrawler


class TestFireCrawlCrawler(unittest.TestCase):
    """Tests for the FireCrawlCrawler class."""

    def setUp(self):
        """Set up test fixtures."""
        # Patch the environment variable
        self.env_patcher = patch.dict('os.environ', {'FIRECRAWL_API_KEY': 'fake-api-key'})
        self.env_patcher.start()

        # Create a mock for the FirecrawlApp
        self.firecrawl_app_patcher = patch('deepsearcher.loader.web_crawler.firecrawl_crawler.FirecrawlApp')
        self.mock_firecrawl_app = self.firecrawl_app_patcher.start()

        # Set up mock instances
        self.mock_app_instance = MagicMock()
        self.mock_firecrawl_app.return_value = self.mock_app_instance

        # Create the crawler
        self.crawler = FireCrawlCrawler()

    def tearDown(self):
        """Clean up test fixtures."""
        self.env_patcher.stop()
        self.firecrawl_app_patcher.stop()

    def test_init(self):
        """Test initialization."""
        self.assertIsNone(self.crawler.app)

    def test_crawl_url_single_page(self):
        """Test crawling a single URL."""
        url = "https://example.com"

        # Set up mock response for scrape_url
        mock_response = MagicMock()
        mock_response.model_dump.return_value = {
            "markdown": "# Example Page\nThis is a test page.",
            "metadata": {"title": "Example Page", "url": url},
        }
        self.mock_app_instance.scrape_url.return_value = mock_response

        # Call the method
        documents = self.crawler.crawl_url(url)

        # Verify FirecrawlApp was initialized
        self.mock_firecrawl_app.assert_called_once_with(api_key='fake-api-key')

        # Verify scrape_url was called correctly
        self.mock_app_instance.scrape_url.assert_called_once_with(url=url, formats=["markdown"])

        # Check results
        self.assertEqual(len(documents), 1)
        document = documents[0]
        self.assertEqual(document.page_content, "# Example Page\nThis is a test page.")
        self.assertEqual(document.metadata["reference"], url)
        self.assertEqual(document.metadata["title"], "Example Page")

    def test_crawl_url_multiple_pages(self):
        """Test crawling multiple pages recursively."""
        url = "https://example.com"
        max_depth = 3
        limit = 10

        # Set up mock response for crawl_url
        mock_response = MagicMock()
        mock_response.model_dump.return_value = {
            "data": [
                {
                    "markdown": "# Page 1\nContent 1",
                    "metadata": {"title": "Page 1", "url": "https://example.com/page1"},
                },
                {
                    "markdown": "# Page 2\nContent 2",
                    "metadata": {"title": "Page 2", "url": "https://example.com/page2"},
                },
            ]
        }
        self.mock_app_instance.crawl_url.return_value = mock_response

        # Call the method
        documents = self.crawler.crawl_url(url, max_depth=max_depth, limit=limit)

        # Verify FirecrawlApp was initialized
        self.mock_firecrawl_app.assert_called_once_with(api_key='fake-api-key')

        # Verify crawl_url was called correctly
        self.mock_app_instance.crawl_url.assert_called_once()
        call_kwargs = self.mock_app_instance.crawl_url.call_args[1]
        self.assertEqual(call_kwargs['url'], url)
        self.assertEqual(call_kwargs['max_depth'], max_depth)
        self.assertEqual(call_kwargs['limit'], limit)

        # Check results
        self.assertEqual(len(documents), 2)

        # Check first document
        self.assertEqual(documents[0].page_content, "# Page 1\nContent 1")
        self.assertEqual(documents[0].metadata["reference"], "https://example.com/page1")
        self.assertEqual(documents[0].metadata["title"], "Page 1")

        # Check second document
        self.assertEqual(documents[1].page_content, "# Page 2\nContent 2")
        self.assertEqual(documents[1].metadata["reference"], "https://example.com/page2")
        self.assertEqual(documents[1].metadata["title"], "Page 2")

    def test_crawl_url_with_default_params(self):
        """Test crawling with default parameters."""
        url = "https://example.com"

        # Set up mock response for crawl_url
        mock_response = MagicMock()
        mock_response.model_dump.return_value = {"data": []}
        self.mock_app_instance.crawl_url.return_value = mock_response

        # Call the method with only max_depth
        self.crawler.crawl_url(url, max_depth=2)

        # Verify default values were used
        call_kwargs = self.mock_app_instance.crawl_url.call_args[1]
        self.assertEqual(call_kwargs['limit'], 20)  # Default limit
        self.assertEqual(call_kwargs['max_depth'], 2)  # Provided max_depth
        self.assertEqual(call_kwargs['allow_backward_links'], False)  # Default allow_backward_links


if __name__ == "__main__":
    unittest.main()