# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

# 135 lines
# 5.2 KiB

import unittest
import os
from unittest.mock import patch, MagicMock
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler import FireCrawlCrawler
class TestFireCrawlCrawler(unittest.TestCase):
    """Tests for the FireCrawlCrawler class.

    Every test runs with ``FirecrawlApp`` replaced by a mock and with a fake
    ``FIRECRAWL_API_KEY`` in the environment, so the assertions only inspect
    how the crawler drives the (mocked) client and what documents it returns.
    """

    def setUp(self):
        """Set up test fixtures.

        Each patcher's ``stop`` is registered with ``addCleanup`` right after
        it is started. Unlike ``tearDown``, cleanups still run when ``setUp``
        itself raises, so a failure while building the crawler cannot leave
        the patches active for later tests.
        """
        # Patch the environment variable
        self.env_patcher = patch.dict(
            "os.environ", {"FIRECRAWL_API_KEY": "fake-api-key"}
        )
        self.env_patcher.start()
        self.addCleanup(self.env_patcher.stop)

        # Create a mock for the FirecrawlApp
        self.firecrawl_app_patcher = patch(
            "deepsearcher.loader.web_crawler.firecrawl_crawler.FirecrawlApp"
        )
        self.mock_firecrawl_app = self.firecrawl_app_patcher.start()
        self.addCleanup(self.firecrawl_app_patcher.stop)

        # Set up mock instances: instantiating the patched class yields this mock
        self.mock_app_instance = MagicMock()
        self.mock_firecrawl_app.return_value = self.mock_app_instance

        # Create the crawler
        self.crawler = FireCrawlCrawler()

    def test_init(self):
        """Test initialization: ``app`` is still ``None`` before any crawl."""
        self.assertIsNone(self.crawler.app)

    def test_crawl_url_single_page(self):
        """Test crawling a single URL."""
        url = "https://example.com"
        markdown = "# Example Page\nThis is a test page."

        # Set up mock response for scrape_url
        mock_response = MagicMock()
        mock_response.model_dump.return_value = {
            "markdown": markdown,
            "metadata": {"title": "Example Page", "url": url},
        }
        self.mock_app_instance.scrape_url.return_value = mock_response

        # Call the method
        documents = self.crawler.crawl_url(url)

        # Verify FirecrawlApp was initialized
        self.mock_firecrawl_app.assert_called_once_with(api_key="fake-api-key")

        # Verify scrape_url was called correctly
        self.mock_app_instance.scrape_url.assert_called_once_with(
            url=url, formats=["markdown"]
        )

        # Check results
        self.assertEqual(len(documents), 1)
        document = documents[0]
        self.assertEqual(document.page_content, markdown)
        self.assertEqual(document.metadata["reference"], url)
        self.assertEqual(document.metadata["title"], "Example Page")

    def test_crawl_url_multiple_pages(self):
        """Test crawling multiple pages recursively."""
        url = "https://example.com"
        max_depth = 3
        limit = 10

        # (markdown, title, page url) for every page the mocked crawl returns
        pages = [
            ("# Page 1\nContent 1", "Page 1", "https://example.com/page1"),
            ("# Page 2\nContent 2", "Page 2", "https://example.com/page2"),
        ]

        # Set up mock response for crawl_url
        mock_response = MagicMock()
        mock_response.model_dump.return_value = {
            "data": [
                {"markdown": content, "metadata": {"title": title, "url": page_url}}
                for content, title, page_url in pages
            ]
        }
        self.mock_app_instance.crawl_url.return_value = mock_response

        # Call the method
        documents = self.crawler.crawl_url(url, max_depth=max_depth, limit=limit)

        # Verify FirecrawlApp was initialized
        self.mock_firecrawl_app.assert_called_once_with(api_key="fake-api-key")

        # Verify crawl_url was called correctly
        self.mock_app_instance.crawl_url.assert_called_once()
        call_kwargs = self.mock_app_instance.crawl_url.call_args[1]
        self.assertEqual(call_kwargs["url"], url)
        self.assertEqual(call_kwargs["max_depth"], max_depth)
        self.assertEqual(call_kwargs["limit"], limit)

        # Check results; the length check comes first so zip() cannot hide
        # a missing or extra document.
        self.assertEqual(len(documents), 2)
        for document, (content, title, page_url) in zip(documents, pages):
            with self.subTest(page=page_url):
                self.assertEqual(document.page_content, content)
                self.assertEqual(document.metadata["reference"], page_url)
                self.assertEqual(document.metadata["title"], title)

    def test_crawl_url_with_default_params(self):
        """Test crawling with default parameters."""
        url = "https://example.com"

        # Set up mock response for crawl_url
        mock_response = MagicMock()
        mock_response.model_dump.return_value = {"data": []}
        self.mock_app_instance.crawl_url.return_value = mock_response

        # Call the method with only max_depth
        self.crawler.crawl_url(url, max_depth=2)

        # Verify default values were used
        call_kwargs = self.mock_app_instance.crawl_url.call_args[1]
        self.assertEqual(call_kwargs["limit"], 20)  # Default limit
        self.assertEqual(call_kwargs["max_depth"], 2)  # Provided max_depth
        # Default allow_backward_links
        self.assertEqual(call_kwargs["allow_backward_links"], False)
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()