You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
157 lines
5.8 KiB
157 lines
5.8 KiB
import unittest
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from deepsearcher.loader.web_crawler import DoclingCrawler
|
|
|
|
|
|
class TestDoclingCrawler(unittest.TestCase):
    """Tests for the DoclingCrawler class.

    Each test runs with the ``docling`` and ``docling_core`` module names in
    ``sys.modules`` replaced by mocks, so the crawler is built against mock
    ``DocumentConverter`` and ``HierarchicalChunker`` classes.
    """

    def setUp(self):
        """Install the mocked docling modules and build the crawler under test."""
        # Mock classes standing in for docling's converter and chunker.
        self.mock_document_converter = MagicMock()
        self.mock_hierarchical_chunker = MagicMock()

        # Instances handed back when those mock classes are instantiated.
        self.mock_converter_instance = MagicMock()
        self.mock_chunker_instance = MagicMock()
        self.mock_document_converter.return_value = self.mock_converter_instance
        self.mock_hierarchical_chunker.return_value = self.mock_chunker_instance

        # Configure the mock modules up front so they already expose the mock
        # classes when they are placed into sys.modules.
        converter_module = MagicMock()
        converter_module.DocumentConverter = self.mock_document_converter
        chunker_module = MagicMock()
        chunker_module.HierarchicalChunker = self.mock_hierarchical_chunker

        self.docling_patcher = patch.dict(
            "sys.modules",
            {
                "docling": MagicMock(),
                "docling.document_converter": converter_module,
                "docling_core": MagicMock(),
                "docling_core.transforms": MagicMock(),
                "docling_core.transforms.chunker": chunker_module,
            },
        )
        self.docling_patcher.start()
        # Registered as a cleanup (rather than stopped in tearDown) so that
        # sys.modules is restored even if the remainder of setUp raises;
        # unittest skips tearDown when setUp fails but still runs cleanups.
        self.addCleanup(self.docling_patcher.stop)

        # Create the crawler while the mocked modules are in place.
        self.crawler = DoclingCrawler()

    def test_init(self):
        """Test that initialization creates and stores the converter and chunker."""
        # Each mock class must have been instantiated exactly once.
        self.mock_document_converter.assert_called_once()
        self.mock_hierarchical_chunker.assert_called_once()

        # The resulting instances must be exposed on the crawler.
        self.assertEqual(self.crawler.converter, self.mock_converter_instance)
        self.assertEqual(self.crawler.chunker, self.mock_chunker_instance)

    def test_crawl_url(self):
        """Test crawling a single URL that yields three chunks."""
        url = "https://example.com"

        # Conversion result whose ``document`` attribute is passed to the chunker.
        mock_document = MagicMock()
        mock_conversion_result = MagicMock()
        mock_conversion_result.document = mock_document

        # Three chunks, each carrying a distinct ``text`` value.
        expected_texts = [f"Chunk {i} content" for i in range(3)]
        mock_chunks = [MagicMock(text=text) for text in expected_texts]

        self.mock_converter_instance.convert.return_value = mock_conversion_result
        self.mock_chunker_instance.chunk.return_value = mock_chunks

        documents = self.crawler.crawl_url(url)

        # The converter receives the URL; the chunker receives the document.
        self.mock_converter_instance.convert.assert_called_once_with(url)
        self.mock_chunker_instance.chunk.assert_called_once_with(mock_document)

        # One document per chunk, in chunk order.
        self.assertEqual(len(documents), 3)
        for expected_text, document in zip(expected_texts, documents):
            with self.subTest(chunk=expected_text):
                self.assertEqual(document.page_content, expected_text)
                self.assertEqual(document.metadata["reference"], url)
                self.assertEqual(document.metadata["text"], expected_text)

    def test_crawl_url_error(self):
        """Test that a converter failure surfaces from crawl_url as an IOError."""
        url = "https://example.com"

        # Make the converter fail with a generic exception.
        self.mock_converter_instance.convert.side_effect = Exception("Test error")

        with self.assertRaises(IOError):
            self.crawler.crawl_url(url)

    def test_supported_file_types(self):
        """Test that supported_file_types lists exactly the expected extensions."""
        file_types = self.crawler.supported_file_types

        expected_types = [
            "pdf", "docx", "xlsx", "pptx", "md", "adoc", "asciidoc",
            "html", "xhtml", "csv", "png", "jpg", "jpeg", "tif", "tiff", "bmp",
        ]

        # Same elements with the same counts, regardless of order; this covers
        # both "every expected type is present" and "nothing extra is present".
        self.assertCountEqual(file_types, expected_types)

    def test_crawl_urls(self):
        """Test crawling multiple URLs, each yielding a single chunk."""
        urls = ["https://example.com", "https://example.org"]

        # The same conversion result and single chunk are returned for every URL.
        mock_document = MagicMock()
        mock_conversion_result = MagicMock()
        mock_conversion_result.document = mock_document

        mock_chunk = MagicMock()
        mock_chunk.text = "Test chunk content"

        self.mock_converter_instance.convert.return_value = mock_conversion_result
        self.mock_chunker_instance.chunk.return_value = [mock_chunk]

        documents = self.crawler.crawl_urls(urls)

        # Converter and chunker must each have run once per URL.
        self.assertEqual(self.mock_converter_instance.convert.call_count, 2)
        self.assertEqual(self.mock_chunker_instance.chunk.call_count, 2)

        # Each URL should have generated one document (with one chunk).
        self.assertEqual(len(documents), 2)
        for index, document in enumerate(documents):
            with self.subTest(document=index):
                self.assertEqual(document.page_content, "Test chunk content")
                self.assertIn(document.metadata["reference"], urls)
|
|
|
|
|
|
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|