You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

203 lines
8.2 KiB

import unittest
import os
import shutil
import tempfile
from unittest.mock import patch, MagicMock
from langchain_core.documents import Document
from deepsearcher.loader.file_loader import UnstructuredLoader
class TestUnstructuredLoader(unittest.TestCase):
"""Tests for the UnstructuredLoader class."""
def setUp(self):
"""Set up test fixtures."""
# Create a temporary directory for tests
self.temp_dir = tempfile.TemporaryDirectory()
# Create a test file
self.test_file_path = os.path.join(self.temp_dir.name, "test.txt")
with open(self.test_file_path, "w", encoding="utf-8") as f:
f.write("This is a test file.")
# Path for mock processed outputs
self.mock_output_dir = os.path.join(self.temp_dir.name, "mock_outputs")
os.makedirs(self.mock_output_dir, exist_ok=True)
# Create a mock JSON output file
self.mock_json_path = os.path.join(self.mock_output_dir, "test_output.json")
with open(self.mock_json_path, "w", encoding="utf-8") as f:
f.write('{"elements": [{"text": "This is extracted text.", "metadata": {"filename": "test.txt"}}]}')
# Set up patches for unstructured modules
self.unstructured_modules = {
'unstructured_ingest': MagicMock(),
'unstructured_ingest.interfaces': MagicMock(),
'unstructured_ingest.pipeline': MagicMock(),
'unstructured_ingest.pipeline.pipeline': MagicMock(),
'unstructured_ingest.processes': MagicMock(),
'unstructured_ingest.processes.connectors': MagicMock(),
'unstructured_ingest.processes.connectors.local': MagicMock(),
'unstructured_ingest.processes.partitioner': MagicMock(),
'unstructured': MagicMock(),
'unstructured.staging': MagicMock(),
'unstructured.staging.base': MagicMock(),
}
self.patches = []
for module_name, mock_module in self.unstructured_modules.items():
patcher = patch.dict('sys.modules', {module_name: mock_module})
patcher.start()
self.patches.append(patcher)
# Create mock Pipeline class
self.mock_pipeline = MagicMock()
self.unstructured_modules['unstructured_ingest.pipeline.pipeline'].Pipeline = self.mock_pipeline
self.mock_pipeline.from_configs.return_value = self.mock_pipeline
# Create mock Element class
self.mock_element = MagicMock()
self.mock_element.text = "This is extracted text."
self.mock_element.metadata = MagicMock()
self.mock_element.metadata.to_dict.return_value = {"filename": "test.txt"}
# Set up elements_from_json mock
self.unstructured_modules['unstructured.staging.base'].elements_from_json = MagicMock()
self.unstructured_modules['unstructured.staging.base'].elements_from_json.return_value = [self.mock_element]
# Patch makedirs and rmtree but don't assert on them
with patch('os.makedirs'):
with patch('shutil.rmtree'):
self.loader = UnstructuredLoader()
def tearDown(self):
"""Clean up test fixtures."""
# Stop all patches
for patcher in self.patches:
patcher.stop()
# Remove temporary directory
self.temp_dir.cleanup()
def test_init(self):
"""Test initialization."""
self.assertEqual(self.loader.directory_with_results, "./pdf_processed_outputs")
def test_supported_file_types(self):
"""Test the supported_file_types property."""
file_types = self.loader.supported_file_types
# Check that common file types are included
common_types = ["pdf", "docx", "txt", "html", "md", "jpg"]
for file_type in common_types:
self.assertIn(file_type, file_types)
# Check total number of supported types (should be extensive)
self.assertGreater(len(file_types), 20)
@patch('os.listdir')
def test_load_file(self, mock_listdir):
"""Test loading a single file."""
# Configure mocks
mock_listdir.return_value = ["test_output.json"]
# Call the method
documents = self.loader.load_file(self.test_file_path)
# Verify Pipeline.from_configs was called
self.mock_pipeline.from_configs.assert_called_once()
self.mock_pipeline.run.assert_called_once()
# Verify elements_from_json was called
self.unstructured_modules['unstructured.staging.base'].elements_from_json.assert_called_once()
# Check results
self.assertEqual(len(documents), 1)
self.assertEqual(documents[0].page_content, "This is extracted text.")
self.assertEqual(documents[0].metadata["reference"], self.test_file_path)
self.assertEqual(documents[0].metadata["filename"], "test.txt")
@patch('os.listdir')
def test_load_directory(self, mock_listdir):
"""Test loading a directory."""
# Configure mocks
mock_listdir.return_value = ["test_output.json"]
# Call the method
documents = self.loader.load_directory(self.temp_dir.name)
# Verify Pipeline.from_configs was called
self.mock_pipeline.from_configs.assert_called_once()
self.mock_pipeline.run.assert_called_once()
# Check results
self.assertEqual(len(documents), 1)
self.assertEqual(documents[0].page_content, "This is extracted text.")
self.assertEqual(documents[0].metadata["reference"], self.temp_dir.name)
@patch('os.listdir')
def test_load_with_api(self, mock_listdir):
"""Test loading with API environment variables."""
# Create a mock for os.environ.get
with patch('os.environ.get') as mock_env_get:
# Configure environment variables
mock_env_get.side_effect = lambda key, default=None: {
"UNSTRUCTURED_API_KEY": "test-key",
"UNSTRUCTURED_API_URL": "https://api.example.com"
}.get(key, default)
# Configure listdir mock
mock_listdir.return_value = ["test_output.json"]
# Create a mock for PartitionerConfig
mock_partitioner_config = MagicMock()
self.unstructured_modules['unstructured_ingest.processes.partitioner'].PartitionerConfig = mock_partitioner_config
# Call the method
documents = self.loader.load_file(self.test_file_path)
# Verify Pipeline.from_configs was called
self.mock_pipeline.from_configs.assert_called_once()
# Check that PartitionerConfig was called with correct parameters
mock_partitioner_config.assert_called_once()
args, kwargs = mock_partitioner_config.call_args
self.assertTrue(kwargs.get('partition_by_api'))
self.assertEqual(kwargs.get('api_key'), "test-key")
self.assertEqual(kwargs.get('partition_endpoint'), "https://api.example.com")
# Check results
self.assertEqual(len(documents), 1)
@patch('os.listdir')
def test_empty_output(self, mock_listdir):
"""Test handling of empty output directory."""
# Configure listdir to return no JSON files
mock_listdir.return_value = []
# Call the method
documents = self.loader.load_file(self.test_file_path)
# Check results
self.assertEqual(len(documents), 0)
@patch('os.listdir')
def test_error_reading_json(self, mock_listdir):
"""Test handling of errors when reading JSON files."""
# Configure listdir mock
mock_listdir.return_value = ["test_output.json"]
# Configure elements_from_json to raise an IOError
self.unstructured_modules['unstructured.staging.base'].elements_from_json.side_effect = IOError("Test error")
# Call the method
documents = self.loader.load_file(self.test_file_path)
# Check results (should be empty)
self.assertEqual(len(documents), 0)
if __name__ == "__main__":
unittest.main()