## Sentence Window splitting strategy, ref:
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb
from typing import List, Optional

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
class Chunk:
    """
    Represents a chunk of text with associated metadata and embedding.

    A chunk is a segment of text extracted from a document, along with its reference
    information, metadata, and optional embedding vector.

    Attributes:
        text: The text content of the chunk.
        reference: A reference to the source of the chunk (e.g., file path, URL).
        metadata: Additional metadata associated with the chunk.
        embedding: The vector embedding of the chunk, if available.
    """

    def __init__(
        self,
        text: str,
        reference: str,
        metadata: Optional[dict] = None,
        embedding: Optional[List[float]] = None,
    ):
        """
        Initialize a Chunk object.

        Args:
            text: The text content of the chunk.
            reference: A reference to the source of the chunk.
            metadata: Additional metadata associated with the chunk. Defaults to an empty dict.
            embedding: The vector embedding of the chunk. Defaults to None.
        """
        self.text = text
        self.reference = reference
        # A missing metadata dict is normalized to a fresh empty dict per instance.
        self.metadata = metadata or {}
        # Falsy embeddings (None or an empty list) are normalized to None.
        self.embedding = embedding or None
def _sentence_window_split(
    split_docs: List[Document], original_document: Document, offset: int = 200
) -> List[Chunk]:
    """
    Create chunks with context windows from split documents.

    This function takes documents that have been split into smaller pieces and
    adds context from the original document by including text before and after
    each split piece, up to the specified offset.

    Args:
        split_docs: List of documents that have been split.
        original_document: The original document before splitting.
        offset: Number of characters to include before and after each split piece.

    Returns:
        A list of Chunk objects with context windows.
    """
    chunks = []
    original_text = original_document.page_content
    for doc in split_docs:
        doc_text = doc.page_content
        start_index = original_text.find(doc_text)
        if start_index == -1:
            # The splitter may normalize whitespace, so the split piece might
            # not occur verbatim in the original text. Fall back to the piece
            # itself instead of crashing with ValueError (as `.index` would).
            wider_text = doc_text
        else:
            # Exclusive end of the piece; using `start + len - 1` here (as the
            # original code did) shortened the right context by one character.
            end_index = start_index + len(doc_text)
            wider_text = original_text[
                max(0, start_index - offset) : min(len(original_text), end_index + offset)
            ]
        # Move the source reference out of metadata and attach the window text.
        # NOTE: this mutates doc.metadata in place, as the original code did.
        reference = doc.metadata.pop("reference", "")
        doc.metadata["wider_text"] = wider_text
        chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata)
        chunks.append(chunk)
    return chunks
def split_docs_to_chunks(
    documents: List[Document],
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    offset: int = 300,
) -> List[Chunk]:
    """
    Split documents into chunks with context windows.

    This function splits a list of documents into smaller chunks with overlapping text,
    and adds context windows to each chunk by including text before and after the chunk.

    Args:
        documents: List of documents to split.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.
        offset: Number of context characters to include on each side of a chunk.
            Defaults to 300, the value previously hard-coded here.

    Returns:
        A list of Chunk objects with context windows.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_chunks = []
    for doc in documents:
        # Split each document independently so the context window is always
        # taken from the chunk's own source document.
        split_docs = text_splitter.split_documents([doc])
        split_chunks = _sentence_window_split(split_docs, doc, offset=offset)
        all_chunks.extend(split_chunks)
    return all_chunks