## Sentence Window splitting strategy, ref:
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb
from typing import List, Optional

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


class Chunk:
    """
    Represents a chunk of text with associated metadata and embedding.

    A chunk is a segment of text extracted from a document, along with its
    reference information, metadata, and optional embedding vector.

    Attributes:
        text: The text content of the chunk.
        reference: A reference to the source of the chunk (e.g., file path, URL).
        metadata: Additional metadata associated with the chunk.
        embedding: The vector embedding of the chunk, if available.
    """
    def __init__(
        self,
        text: str,
        reference: str,
        metadata: Optional[dict] = None,
        embedding: Optional[List[float]] = None,
    ):
        """
        Initialize a Chunk object.

        Args:
            text: The text content of the chunk.
            reference: A reference to the source of the chunk.
            metadata: Additional metadata associated with the chunk. Defaults to an empty dict.
            embedding: The vector embedding of the chunk. Defaults to None.
        """
        self.text = text
        self.reference = reference
        self.metadata = metadata or {}
        # Assign directly: `embedding or None` would also coerce an empty
        # list to None, which is not the intent.
        self.embedding = embedding
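
# Example (illustrative): constructing a Chunk by hand; the text and
# reference values here are made up.
# chunk = Chunk(text="Milvus is a vector database.", reference="docs/intro.md")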


def _sentence_window_split(
    split_docs: List[Document], original_document: Document, offset: int = 200
) -> List[Chunk]:
    """
    Create chunks with context windows from split documents.

    This function takes documents that have been split into smaller pieces and
    adds context from the original document by including text before and after
    each split piece, up to the specified offset.

    Args:
        split_docs: List of documents that have been split.
        original_document: The original document before splitting.
        offset: Number of characters to include before and after each split piece.

    Returns:
        A list of Chunk objects with context windows.
    """
    chunks = []
    original_text = original_document.page_content
    for doc in split_docs:
        doc_text = doc.page_content
        # .index() finds the first occurrence of the piece; if the same text
        # repeats earlier in the document, the window is computed around that
        # first match.
        start_index = original_text.index(doc_text)
        end_index = start_index + len(doc_text)
        # Widen the piece by `offset` characters on each side, clipped to the
        # document boundaries.
        wider_text = original_text[
            max(0, start_index - offset) : min(len(original_text), end_index + offset)
        ]
        reference = doc.metadata.pop("reference", "")
        doc.metadata["wider_text"] = wider_text
        chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata)
        chunks.append(chunk)
    return chunks
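

# Worked example (illustrative): for original text "abcdefghij", a split
# piece "def", and offset=2, the window is "bcdefgh": two characters of
# context on each side, clipped at the document boundaries.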


def split_docs_to_chunks(
    documents: List[Document], chunk_size: int = 1500, chunk_overlap: int = 100
) -> List[Chunk]:
    """
    Split documents into chunks with context windows.

    This function splits a list of documents into smaller chunks with
    overlapping text, and adds context windows to each chunk by including
    text before and after the chunk.

    Args:
        documents: List of documents to split.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.

    Returns:
        A list of Chunk objects with context windows.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_chunks = []
    for doc in documents:
        split_docs = text_splitter.split_documents([doc])
        split_chunks = _sentence_window_split(split_docs, doc, offset=300)
        all_chunks.extend(split_chunks)
    return all_chunks
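

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal run of the API above, assuming langchain-core and
# langchain-text-splitters are installed. The sample text and the
# "docs/example.md" path are made up; "reference" is the metadata key
# that _sentence_window_split pops into Chunk.reference.
if __name__ == "__main__":
    sample = Document(
        page_content=(
            "Milvus is a vector database built for scalable similarity search. "
            "Sentence-window retrieval matches queries against small chunks "
            "but hands the generator a wider span of surrounding text."
        ),
        metadata={"reference": "docs/example.md"},
    )
    chunks = split_docs_to_chunks([sample], chunk_size=100, chunk_overlap=20)
    for c in chunks:
        print(repr(c.text))
        print("window:", repr(c.metadata["wider_text"]))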