# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

# 157 lines
# 5.8 KiB

import unittest
from unittest.mock import patch, MagicMock
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler import DoclingCrawler
class TestDoclingCrawler(unittest.TestCase):
    """Tests for the DoclingCrawler class.

    Each test runs with the ``sys.modules`` entries for ``docling`` and
    ``docling_core`` (and the submodules listed in ``setUp``) replaced by
    MagicMock objects, so the assertions only observe how the crawler
    interacts with the mocked converter and chunker.
    """

    def setUp(self):
        """Set up test fixtures.

        Patches ``sys.modules`` with mock docling modules, exposes mock
        ``DocumentConverter`` / ``HierarchicalChunker`` classes on them, and
        then instantiates the crawler under test.
        """
        import sys

        # Create mocks for the docling modules
        self.docling_patcher = patch.dict('sys.modules', {
            'docling': MagicMock(),
            'docling.document_converter': MagicMock(),
            'docling_core': MagicMock(),
            'docling_core.transforms': MagicMock(),
            'docling_core.transforms.chunker': MagicMock(),
        })
        self.docling_patcher.start()
        # Register the undo step immediately: unittest skips tearDown() when
        # setUp() raises, but cleanups registered here still run, so a failing
        # DoclingCrawler() below cannot leave the mocks in sys.modules.
        self.addCleanup(self.docling_patcher.stop)

        # Create mocks for the classes
        self.mock_document_converter = MagicMock()
        self.mock_hierarchical_chunker = MagicMock()

        # Add the mocks to the modules
        sys.modules['docling.document_converter'].DocumentConverter = self.mock_document_converter
        sys.modules['docling_core.transforms.chunker'].HierarchicalChunker = self.mock_hierarchical_chunker

        # Set up mock instances returned when the mock classes are called
        self.mock_converter_instance = MagicMock()
        self.mock_chunker_instance = MagicMock()
        self.mock_document_converter.return_value = self.mock_converter_instance
        self.mock_hierarchical_chunker.return_value = self.mock_chunker_instance

        # Create the crawler
        self.crawler = DoclingCrawler()

    def test_init(self):
        """Test initialization."""
        # Verify instances were created
        self.mock_document_converter.assert_called_once()
        self.mock_hierarchical_chunker.assert_called_once()

        # Check that the instances were assigned correctly
        self.assertEqual(self.crawler.converter, self.mock_converter_instance)
        self.assertEqual(self.crawler.chunker, self.mock_chunker_instance)

    def test_crawl_url(self):
        """Test crawling a URL."""
        url = "https://example.com"

        # Set up mock document and conversion result
        mock_document = MagicMock()
        mock_conversion_result = MagicMock()
        mock_conversion_result.document = mock_document

        # Set up three mock chunks
        mock_chunks = [MagicMock(text=f"Chunk {i} content") for i in range(3)]

        # Configure mock converter and chunker
        self.mock_converter_instance.convert.return_value = mock_conversion_result
        self.mock_chunker_instance.chunk.return_value = mock_chunks

        # Call the method
        documents = self.crawler.crawl_url(url)

        # Verify converter was called correctly
        self.mock_converter_instance.convert.assert_called_once_with(url)

        # Verify chunker was called correctly
        self.mock_chunker_instance.chunk.assert_called_once_with(mock_document)

        # Check results
        self.assertEqual(len(documents), 3)

        # Check each document; subTest reports every failing chunk, not just
        # the first one.
        for i, document in enumerate(documents):
            with self.subTest(chunk=i):
                self.assertEqual(document.page_content, f"Chunk {i} content")
                self.assertEqual(document.metadata["reference"], url)
                self.assertEqual(document.metadata["text"], f"Chunk {i} content")

    def test_crawl_url_error(self):
        """Test error handling when crawling a URL."""
        url = "https://example.com"

        # Configure converter to raise an exception
        self.mock_converter_instance.convert.side_effect = Exception("Test error")

        # The generic converter failure is expected to surface as IOError
        with self.assertRaises(IOError):
            self.crawler.crawl_url(url)

    def test_supported_file_types(self):
        """Test the supported_file_types property."""
        file_types = self.crawler.supported_file_types

        # Check that all expected file types are included
        expected_types = [
            "pdf", "docx", "xlsx", "pptx", "md", "adoc", "asciidoc",
            "html", "xhtml", "csv", "png", "jpg", "jpeg", "tif", "tiff", "bmp",
        ]
        for file_type in expected_types:
            self.assertIn(file_type, file_types)

        # Check that the count matches (no unexpected extra entries)
        self.assertEqual(len(file_types), len(expected_types))

    def test_crawl_urls(self):
        """Test crawling multiple URLs."""
        urls = ["https://example.com", "https://example.org"]

        # The same mock conversion result is returned for every URL
        mock_document = MagicMock()
        mock_conversion_result = MagicMock()
        mock_conversion_result.document = mock_document

        # Set up one mock chunk per URL
        mock_chunk = MagicMock()
        mock_chunk.text = "Test chunk content"

        # Configure mock converter and chunker
        self.mock_converter_instance.convert.return_value = mock_conversion_result
        self.mock_chunker_instance.chunk.return_value = [mock_chunk]

        # Call the method
        documents = self.crawler.crawl_urls(urls)

        # Verify converter was called for each URL
        self.assertEqual(self.mock_converter_instance.convert.call_count, 2)

        # Verify chunker was called for each document
        self.assertEqual(self.mock_chunker_instance.chunk.call_count, 2)

        # Check results
        self.assertEqual(len(documents), 2)

        # Each URL should have generated one document (with one chunk)
        for document in documents:
            self.assertEqual(document.page_content, "Test chunk content")
            self.assertIn(document.metadata["reference"], urls)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()