You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

124 lines
5.2 KiB

import unittest
import os
import json
import tempfile
from langchain_core.documents import Document
from deepsearcher.loader.file_loader import JsonFileLoader
class TestJsonFileLoader(unittest.TestCase):
    """Tests for the JsonFileLoader class.

    Each test runs against fixture files written into a fresh temporary
    directory by ``setUp``: a valid JSON list, the same records as JSONL,
    a JSON file whose top level is a dict instead of a list, and a JSONL
    file whose first line is not valid JSON.
    """

    def setUp(self):
        """Write the fixture files and build a loader keyed on ``"text"``."""
        # Temporary directory holding every fixture; removed in tearDown.
        self.temp_dir = tempfile.TemporaryDirectory()

        # Sample records shared by the valid JSON and JSONL fixtures.
        self.json_data = [
            {"id": 1, "text": "This is the first document.", "author": "John Doe"},
            {"id": 2, "text": "This is the second document.", "author": "Jane Smith"},
        ]

        # Valid JSON file: a single top-level list of records.
        self.json_file_path = os.path.join(self.temp_dir.name, "test.json")
        with open(self.json_file_path, "w", encoding="utf-8") as f:
            json.dump(self.json_data, f)

        # Valid JSONL file: one record per line.
        self.jsonl_file_path = os.path.join(self.temp_dir.name, "test.jsonl")
        with open(self.jsonl_file_path, "w", encoding="utf-8") as f:
            for item in self.json_data:
                f.write(json.dumps(item) + "\n")

        # Invalid JSON file: the top level is a dict, not a list.
        self.invalid_json_file_path = os.path.join(self.temp_dir.name, "invalid.json")
        with open(self.invalid_json_file_path, "w", encoding="utf-8") as f:
            json.dump({"id": 1, "text": "This is not a list.", "author": "John Doe"}, f)

        # Partially invalid JSONL file: a garbage line followed by a valid record.
        self.invalid_jsonl_file_path = os.path.join(self.temp_dir.name, "invalid.jsonl")
        with open(self.invalid_jsonl_file_path, "w", encoding="utf-8") as f:
            f.write("This is not valid JSON\n")
            f.write(
                json.dumps({"id": 2, "text": "This is valid JSON", "author": "Jane Smith"})
                + "\n"
            )

        # Loader under test; "text" is the key whose value the tests expect
        # to find in each document's page_content.
        self.loader = JsonFileLoader(text_key="text")

        def patched_read_json_file(file_path):
            """Read a JSON file that must contain a top-level list."""
            # Read with the same encoding the fixtures were written in, and
            # close the handle deterministically via the context manager.
            with open(file_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            if not isinstance(json_data, list):
                raise ValueError("JSON file must contain a list of dictionaries.")
            return json_data

        # Shadow the loader's _read_json_file on this instance only. The
        # original intent was "to fix the file handling"; presumably the real
        # method leaves its file handle open -- verify against the loader.
        self.loader._read_json_file = patched_read_json_file

    def tearDown(self):
        """Remove the temporary directory and every fixture inside it."""
        self.temp_dir.cleanup()

    def _assert_sample_documents(self, documents, reference):
        """Assert that ``documents`` mirror ``self.json_data`` in order.

        Args:
            documents: Documents returned by ``self.loader.load_file``.
            reference: Path expected under each document's
                ``metadata["reference"]``.
        """
        # Check that we got the right number of documents.
        self.assertEqual(len(documents), 2)
        # Check the content and metadata of each document.
        for document, expected in zip(documents, self.json_data):
            self.assertEqual(document.page_content, expected["text"])
            self.assertEqual(document.metadata["id"], expected["id"])
            self.assertEqual(document.metadata["author"], expected["author"])
            self.assertEqual(document.metadata["reference"], reference)

    def test_load_json_file(self):
        """Test loading a JSON file."""
        documents = self.loader.load_file(self.json_file_path)
        self._assert_sample_documents(documents, self.json_file_path)

    def test_load_jsonl_file(self):
        """Test loading a JSONL file."""
        documents = self.loader.load_file(self.jsonl_file_path)
        self._assert_sample_documents(documents, self.jsonl_file_path)

    def test_invalid_json_file(self):
        """Test loading an invalid JSON file (not a list)."""
        with self.assertRaises(ValueError):
            self.loader.load_file(self.invalid_json_file_path)

    def test_invalid_jsonl_file(self):
        """Test loading a JSONL file with invalid lines."""
        documents = self.loader.load_file(self.invalid_jsonl_file_path)
        # Only the valid line should be loaded.
        self.assertEqual(len(documents), 1)
        self.assertEqual(documents[0].page_content, "This is valid JSON")

    def test_supported_file_types(self):
        """Test the supported_file_types property."""
        file_types = self.loader.supported_file_types
        self.assertIsInstance(file_types, list)
        # NOTE(review): "txt"/"md" look surprising for a JSON loader; confirm
        # against JsonFileLoader.supported_file_types whether "json"/"jsonl"
        # are the intended values before changing these assertions.
        self.assertIn("txt", file_types)
        self.assertIn("md", file_types)
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()