"""Unit tests for deepsearcher.loader.splitter.

Covers the Chunk container class, the _sentence_window_split helper, and the
split_docs_to_chunks entry point.
"""
import unittest

from langchain_core.documents import Document

from deepsearcher.loader.splitter import Chunk, split_docs_to_chunks, _sentence_window_split

class TestSplitter(unittest.TestCase):
    """Tests for the splitter module."""

    def test_chunk_init(self):
        """Test initialization of Chunk class."""
        # Minimal construction: optional fields fall back to their defaults.
        minimal = Chunk(text="Test text", reference="test_ref")
        self.assertEqual(minimal.text, "Test text")
        self.assertEqual(minimal.reference, "test_ref")
        self.assertEqual(minimal.metadata, {})
        self.assertIsNone(minimal.embedding)

        # Full construction: explicit metadata and embedding are kept as given.
        meta = {"key": "value"}
        vector = [0.1, 0.2, 0.3]
        full = Chunk(text="Test text", reference="test_ref", metadata=meta, embedding=vector)
        self.assertEqual(full.text, "Test text")
        self.assertEqual(full.reference, "test_ref")
        self.assertEqual(full.metadata, meta)
        self.assertEqual(full.embedding, vector)

    def test_sentence_window_split(self):
        """Test _sentence_window_split function."""
        # A short three-sentence source document.
        source_text = (
            "This is a test document. It has multiple sentences. "
            "This is for testing the splitter."
        )
        source_doc = Document(page_content=source_text, metadata={"reference": "test_doc"})

        # One pre-split Document per sentence, sharing the same reference.
        sentences = [
            "This is a test document.",
            "It has multiple sentences.",
            "This is for testing the splitter.",
        ]
        pieces = [
            Document(page_content=sentence, metadata={"reference": "test_doc"})
            for sentence in sentences
        ]

        # Default offset: the source is short, so every window spans all of it.
        chunks = _sentence_window_split(pieces, source_doc)
        self.assertEqual(len(chunks), 3)
        for piece, chunk in zip(pieces, chunks):
            self.assertEqual(chunk.text, piece.page_content)
            self.assertEqual(chunk.reference, "test_doc")
            self.assertIn("wider_text", chunk.metadata)
            self.assertEqual(chunk.metadata["wider_text"], source_text)

        # Small offset: windows are clipped, so none can exceed the source text.
        chunks = _sentence_window_split(pieces, source_doc, offset=10)
        self.assertEqual(len(chunks), 3)
        for chunk in chunks:
            self.assertLessEqual(len(chunk.metadata["wider_text"]), len(source_text))

    def test_split_docs_to_chunks(self):
        """Test split_docs_to_chunks function."""
        docs = [
            Document(
                page_content="This is document one. It has some content for testing.",
                metadata={"reference": "doc1"},
            ),
            Document(
                page_content="This is document two. It also has content for testing purposes.",
                metadata={"reference": "doc2"},
            ),
        ]

        def check_invariants(chunks):
            # Every chunk is a Chunk, traces back to one of the input
            # references, and carries a sentence-window context.
            for chunk in chunks:
                self.assertIsInstance(chunk, Chunk)
                self.assertIn(chunk.reference, ["doc1", "doc2"])
                self.assertIn("wider_text", chunk.metadata)

        # Default parameters: splitting must produce at least one chunk.
        default_chunks = split_docs_to_chunks(docs)
        self.assertGreater(len(default_chunks), 0)
        check_invariants(default_chunks)

        # A tiny chunk size forces each document into several chunks.
        small_chunks = split_docs_to_chunks(docs, chunk_size=10, chunk_overlap=2)
        self.assertGreater(len(small_chunks), 2)
        check_invariants(small_chunks)
|
|
|
if __name__ == "__main__":
    # Run the test suite when this file is executed directly.
    unittest.main()
|