You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
185 lines
7.5 KiB
185 lines
7.5 KiB
import unittest
|
|
import os
|
|
import tempfile
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from deepsearcher.loader.file_loader import DoclingLoader
|
|
|
|
|
|
class TestDoclingLoader(unittest.TestCase):
    """Tests for the DoclingLoader class.

    For the duration of each test the docling modules are replaced with
    ``MagicMock`` objects in ``sys.modules``, and the loader is exercised
    against small files written to a temporary directory.
    """

    def setUp(self):
        """Set up test fixtures.

        Every resource is paired with ``addCleanup`` immediately after it is
        acquired.  Cleanups run even when a later step of ``setUp`` raises
        (``tearDown`` would not), so neither the ``sys.modules`` patch nor the
        temporary directory can leak into other tests.
        """
        import sys

        # Create patches for the docling modules
        self.docling_patcher = patch.dict('sys.modules', {
            'docling': MagicMock(),
            'docling.document_converter': MagicMock(),
            'docling_core': MagicMock(),
            'docling_core.transforms': MagicMock(),
            'docling_core.transforms.chunker': MagicMock(),
        })
        self.docling_patcher.start()
        self.addCleanup(self.docling_patcher.stop)

        # Create mocks for the classes and expose them on the mocked modules
        self.mock_document_converter = MagicMock()
        self.mock_hierarchical_chunker = MagicMock()
        sys.modules['docling.document_converter'].DocumentConverter = (
            self.mock_document_converter
        )
        sys.modules['docling_core.transforms.chunker'].HierarchicalChunker = (
            self.mock_hierarchical_chunker
        )

        # Instances handed out when the mocked classes are called
        self.mock_converter_instance = MagicMock()
        self.mock_chunker_instance = MagicMock()
        self.mock_document_converter.return_value = self.mock_converter_instance
        self.mock_hierarchical_chunker.return_value = self.mock_chunker_instance

        # Create a temporary directory
        self.temp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.temp_dir.cleanup)

        # Create a test markdown file
        self.md_file_path = os.path.join(self.temp_dir.name, "test.md")
        self._write_text(
            self.md_file_path,
            "# Test Markdown\nThis is a test markdown file.",
        )

        # Create a test unsupported file
        self.unsupported_file_path = os.path.join(self.temp_dir.name, "test.xyz")
        self._write_text(
            self.unsupported_file_path,
            "This is an unsupported file type.",
        )

        # Create a subdirectory with a test file
        self.sub_dir = os.path.join(self.temp_dir.name, "subdir")
        os.makedirs(self.sub_dir, exist_ok=True)
        self.sub_file_path = os.path.join(self.sub_dir, "subfile.md")
        self._write_text(
            self.sub_file_path,
            "# Subdir Test\nThis is a test markdown file in a subdirectory.",
        )

        # Create the loader
        self.loader = DoclingLoader()

    @staticmethod
    def _write_text(path, content):
        """Write ``content`` to ``path`` as UTF-8 text."""
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)

    def _configure_pipeline(self, chunks):
        """Wire the mocked converter and chunker together.

        The converter's ``convert`` returns a result whose ``document`` is a
        fresh mock, and the chunker's ``chunk`` returns ``chunks``.

        Returns:
            The mock document, so callers can assert on how it was used.
        """
        mock_document = MagicMock()
        mock_conversion_result = MagicMock()
        mock_conversion_result.document = mock_document
        self.mock_converter_instance.convert.return_value = mock_conversion_result
        self.mock_chunker_instance.chunk.return_value = chunks
        return mock_document

    def test_init(self):
        """Test initialization."""
        # Verify instances were created
        self.mock_document_converter.assert_called_once()
        self.mock_hierarchical_chunker.assert_called_once()

        # Check that the instances were assigned correctly
        self.assertEqual(self.loader.converter, self.mock_converter_instance)
        self.assertEqual(self.loader.chunker, self.mock_chunker_instance)

    def test_supported_file_types(self):
        """Test the supported_file_types property."""
        file_types = self.loader.supported_file_types

        # Check that the common file types are included; subTest reports
        # every missing extension instead of stopping at the first one.
        common_types = ["pdf", "docx", "md", "html", "csv", "jpg"]
        for file_type in common_types:
            with self.subTest(file_type=file_type):
                self.assertIn(file_type, file_types)

    def test_load_file(self):
        """Test loading a single file."""
        # Set up three mock chunks
        mock_chunks = []
        for i in range(3):
            chunk = MagicMock()
            chunk.text = f"Chunk {i} content"
            mock_chunks.append(chunk)

        # Configure mock converter and chunker
        mock_document = self._configure_pipeline(mock_chunks)

        # Call the method
        documents = self.loader.load_file(self.md_file_path)

        # Verify converter was called correctly
        self.mock_converter_instance.convert.assert_called_once_with(self.md_file_path)

        # Verify chunker was called correctly
        self.mock_chunker_instance.chunk.assert_called_once_with(mock_document)

        # Check results
        self.assertEqual(len(documents), 3)

        # Check each document
        for i, document in enumerate(documents):
            self.assertEqual(document.page_content, f"Chunk {i} content")
            self.assertEqual(document.metadata["reference"], self.md_file_path)
            self.assertEqual(document.metadata["text"], f"Chunk {i} content")

    def test_load_file_not_found(self):
        """Test loading a non-existent file."""
        non_existent_file = os.path.join(self.temp_dir.name, "non_existent.md")
        with self.assertRaises(FileNotFoundError):
            self.loader.load_file(non_existent_file)

    def test_load_unsupported_file_type(self):
        """Test loading a file with unsupported extension."""
        with self.assertRaises(ValueError):
            self.loader.load_file(self.unsupported_file_path)

    def test_load_file_error(self):
        """Test error handling when loading a file."""
        # Configure converter to raise an exception
        self.mock_converter_instance.convert.side_effect = Exception("Test error")

        # The converter failure is expected to surface as IOError
        with self.assertRaises(IOError):
            self.loader.load_file(self.md_file_path)

    def test_load_directory(self):
        """Test loading a directory."""
        # Set up a single mock chunk
        mock_chunk = MagicMock()
        mock_chunk.text = "Test chunk content"

        # Configure mock converter and chunker
        self._configure_pipeline([mock_chunk])

        # Load the directory
        documents = self.loader.load_directory(self.temp_dir.name)

        # Verify converter was called twice (once for each MD file)
        self.assertEqual(self.mock_converter_instance.convert.call_count, 2)

        # Verify converter was called with both MD files
        self.mock_converter_instance.convert.assert_any_call(self.md_file_path)
        self.mock_converter_instance.convert.assert_any_call(self.sub_file_path)

        # Check results - should have two documents (one from each MD file)
        self.assertEqual(len(documents), 2)

        # Check each document
        for document in documents:
            self.assertEqual(document.page_content, "Test chunk content")
            self.assertEqual(document.metadata["text"], "Test chunk content")
            self.assertIn(
                document.metadata["reference"],
                [self.md_file_path, self.sub_file_path],
            )

    def test_load_not_a_directory(self):
        """Test loading a path that is not a directory."""
        with self.assertRaises(NotADirectoryError):
            self.loader.load_directory(self.md_file_path)
|
|
|
|
|
|
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
|