You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

142 lines
4.8 KiB

import unittest
import os
import tempfile
from unittest.mock import patch, MagicMock
from langchain_core.documents import Document
from deepsearcher.loader.file_loader import PDFLoader
class TestPDFLoader(unittest.TestCase):
"""Tests for the PDFLoader class."""
def setUp(self):
"""Set up the test environment."""
# Create a temporary directory
self.temp_dir = tempfile.TemporaryDirectory()
# Create a text file for testing
self.text_file_path = os.path.join(self.temp_dir.name, "test.txt")
with open(self.text_file_path, "w", encoding="utf-8") as f:
f.write("This is a test text file.")
# Create a markdown file for testing
self.md_file_path = os.path.join(self.temp_dir.name, "test.md")
with open(self.md_file_path, "w", encoding="utf-8") as f:
f.write("# Test Markdown\nThis is a test markdown file.")
# PDF file path (will be mocked)
self.pdf_file_path = os.path.join(self.temp_dir.name, "test.pdf")
# Create the loader
self.loader = PDFLoader()
def tearDown(self):
"""Clean up the test environment."""
self.temp_dir.cleanup()
def test_supported_file_types(self):
"""Test the supported_file_types property."""
file_types = self.loader.supported_file_types
self.assertIsInstance(file_types, list)
self.assertIn("pdf", file_types)
self.assertIn("md", file_types)
self.assertIn("txt", file_types)
def test_load_text_file(self):
"""Test loading a text file."""
documents = self.loader.load_file(self.text_file_path)
# Check that we got one document
self.assertEqual(len(documents), 1)
# Check the document content
document = documents[0]
self.assertEqual(document.page_content, "This is a test text file.")
# Check the metadata
self.assertEqual(document.metadata["reference"], self.text_file_path)
def test_load_markdown_file(self):
"""Test loading a markdown file."""
documents = self.loader.load_file(self.md_file_path)
# Check that we got one document
self.assertEqual(len(documents), 1)
# Check the document content
document = documents[0]
self.assertEqual(document.page_content, "# Test Markdown\nThis is a test markdown file.")
# Check the metadata
self.assertEqual(document.metadata["reference"], self.md_file_path)
@patch("pdfplumber.open")
def test_load_pdf_file(self, mock_pdf_open):
"""Test loading a PDF file."""
# Set up mock PDF pages
mock_page1 = MagicMock()
mock_page1.extract_text.return_value = "Page 1 content"
mock_page2 = MagicMock()
mock_page2.extract_text.return_value = "Page 2 content"
# Set up mock PDF file
mock_pdf = MagicMock()
mock_pdf.pages = [mock_page1, mock_page2]
mock_pdf.__enter__.return_value = mock_pdf
mock_pdf.__exit__.return_value = None
# Configure the mock to return our mock PDF
mock_pdf_open.return_value = mock_pdf
# Create a dummy PDF file
with open(self.pdf_file_path, "w") as f:
f.write("dummy pdf content")
# Load the PDF file
documents = self.loader.load_file(self.pdf_file_path)
# Verify pdfplumber.open was called
mock_pdf_open.assert_called_once_with(self.pdf_file_path)
# Check that we got one document
self.assertEqual(len(documents), 1)
# Check the document content
document = documents[0]
self.assertEqual(document.page_content, "Page 1 content\n\nPage 2 content")
# Check the metadata
self.assertEqual(document.metadata["reference"], self.pdf_file_path)
def test_load_directory(self):
"""Test loading a directory with mixed file types."""
# Create the loader
loader = PDFLoader()
# Mock the load_file method to track calls
original_load_file = loader.load_file
calls = []
def mock_load_file(file_path):
calls.append(file_path)
return original_load_file(file_path)
loader.load_file = mock_load_file
# Load the directory
documents = loader.load_directory(self.temp_dir.name)
# Check that we processed both text and markdown files
self.assertEqual(len(calls), 2) # text and markdown files
self.assertIn(self.text_file_path, calls)
self.assertIn(self.md_file_path, calls)
# Check that we got two documents
self.assertEqual(len(documents), 2)
if __name__ == "__main__":
unittest.main()