You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

98 lines
4.1 KiB

import unittest
from langchain_core.documents import Document
from deepsearcher.loader.splitter import Chunk, split_docs_to_chunks, _sentence_window_split
class TestSplitter(unittest.TestCase):
    """Tests for the splitter module.

    Exercises ``Chunk`` construction, ``_sentence_window_split`` and
    ``split_docs_to_chunks``.
    """

    def _assert_valid_chunks(self, chunks, references):
        """Assert each item is a ``Chunk`` with a known reference and ``wider_text``.

        Args:
            chunks: Iterable of objects returned by the splitter.
            references: Collection of reference values a chunk may carry.
        """
        for index, chunk in enumerate(chunks):
            # subTest labels a failure with the index of the offending chunk.
            with self.subTest(chunk_index=index):
                self.assertIsInstance(chunk, Chunk)
                self.assertIn(chunk.reference, references)
                self.assertIn("wider_text", chunk.metadata)

    def test_chunk_init(self):
        """Test initialization of Chunk class."""
        # Only the required arguments: the test expects empty metadata and
        # no embedding in that case.
        chunk = Chunk(text="Test text", reference="test_ref")
        self.assertEqual(chunk.text, "Test text")
        self.assertEqual(chunk.reference, "test_ref")
        self.assertEqual(chunk.metadata, {})
        self.assertIsNone(chunk.embedding)

        # Every argument supplied explicitly.
        metadata = {"key": "value"}
        embedding = [0.1, 0.2, 0.3]
        chunk = Chunk(
            text="Test text",
            reference="test_ref",
            metadata=metadata,
            embedding=embedding,
        )
        self.assertEqual(chunk.text, "Test text")
        self.assertEqual(chunk.reference, "test_ref")
        self.assertEqual(chunk.metadata, metadata)
        self.assertEqual(chunk.embedding, embedding)

    def test_sentence_window_split(self):
        """Test _sentence_window_split function."""
        sentences = [
            "This is a test document.",
            "It has multiple sentences.",
            "This is for testing the splitter.",
        ]
        original_text = " ".join(sentences)

        def make_inputs():
            """Build a fresh (original_doc, split_docs) pair.

            Each call creates new ``Document`` objects and new metadata
            dicts, so one invocation of the function under test cannot
            influence the inputs of the next.
            """
            original = Document(
                page_content=original_text, metadata={"reference": "test_doc"}
            )
            pieces = [
                Document(page_content=sentence, metadata={"reference": "test_doc"})
                for sentence in sentences
            ]
            return original, pieces

        # Default offset.
        original_doc, split_docs = make_inputs()
        chunks = _sentence_window_split(split_docs, original_doc)
        self.assertEqual(len(chunks), 3)
        for i, chunk in enumerate(chunks):
            with self.subTest(offset="default", chunk_index=i):
                self.assertEqual(chunk.text, split_docs[i].page_content)
                self.assertEqual(chunk.reference, "test_doc")
                self.assertIn("wider_text", chunk.metadata)
                # The test document is short, so with the default offset the
                # wider text is expected to be the whole original text.
                self.assertEqual(chunk.metadata["wider_text"], original_text)

        # Smaller offset, on fresh inputs.
        original_doc, split_docs = make_inputs()
        chunks = _sentence_window_split(split_docs, original_doc, offset=10)
        self.assertEqual(len(chunks), 3)
        for i, chunk in enumerate(chunks):
            with self.subTest(offset=10, chunk_index=i):
                # The wider text must never be longer than the original text
                # (it may be equal, hence <= rather than <).
                self.assertLessEqual(
                    len(chunk.metadata["wider_text"]), len(original_text)
                )

    def test_split_docs_to_chunks(self):
        """Test split_docs_to_chunks function."""
        docs = [
            Document(
                page_content="This is document one. It has some content for testing.",
                metadata={"reference": "doc1"},
            ),
            Document(
                page_content="This is document two. It also has content for testing purposes.",
                metadata={"reference": "doc2"},
            ),
        ]
        references = ["doc1", "doc2"]

        # Default parameters: at least one chunk must come back.
        chunks = split_docs_to_chunks(docs)
        self.assertGreater(len(chunks), 0)
        self._assert_valid_chunks(chunks, references)

        # A small chunk size is expected to yield more than one chunk per document.
        chunks = split_docs_to_chunks(docs, chunk_size=10, chunk_overlap=2)
        self.assertGreater(len(chunks), 2)
        self._assert_valid_chunks(chunks, references)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()