## Sentence Window splitting strategy, ref:
#  https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb

from typing import List

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


class Chunk:
    """
    Represents a chunk of text with associated metadata and embedding.

    A chunk is a segment of text extracted from a document, along with its reference
    information, metadata, and optional embedding vector.

    Attributes:
        text: The text content of the chunk.
        reference: A reference to the source of the chunk (e.g., file path, URL).
        metadata: Additional metadata associated with the chunk.
        embedding: The vector embedding of the chunk, if available.
    """

    def __init__(
        self,
        text: str,
        reference: str,
        metadata: dict = None,
        embedding: List[float] = None,
    ):
        """
        Initialize a Chunk object.

        Args:
            text: The text content of the chunk.
            reference: A reference to the source of the chunk.
            metadata: Additional metadata associated with the chunk. Defaults to an empty dict.
            embedding: The vector embedding of the chunk. Defaults to None.
        """
        self.text = text
        self.reference = reference
        self.metadata = metadata or {}
        self.embedding = embedding or None


def _sentence_window_split(
    split_docs: List[Document], original_document: Document, offset: int = 200
) -> List[Chunk]:
    """
    Create chunks with context windows from split documents.

    This function takes documents that have been split into smaller pieces and
    adds context from the original document by including text before and after
    each split piece, up to the specified offset.

    Args:
        split_docs: List of documents that have been split.
        original_document: The original document before splitting.
        offset: Number of characters to include before and after each split piece.

    Returns:
        A list of Chunk objects with context windows.
    """
    chunks = []
    original_text = original_document.page_content
    for doc in split_docs:
        doc_text = doc.page_content
        start_index = original_text.index(doc_text)
        end_index = start_index + len(doc_text) - 1
        wider_text = original_text[
            max(0, start_index - offset) : min(len(original_text), end_index + offset)
        ]
        reference = doc.metadata.pop("reference", "")
        doc.metadata["wider_text"] = wider_text
        chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata)
        chunks.append(chunk)
    return chunks


def split_docs_to_chunks(
    documents: List[Document], chunk_size: int = 1500, chunk_overlap=100
) -> List[Chunk]:
    """
    Split documents into chunks with context windows.

    This function splits a list of documents into smaller chunks with overlapping text,
    and adds context windows to each chunk by including text before and after the chunk.

    Args:
        documents: List of documents to split.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.

    Returns:
        A list of Chunk objects with context windows.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_chunks = []
    for doc in documents:
        split_docs = text_splitter.split_documents([doc])
        split_chunks = _sentence_window_split(split_docs, doc, offset=300)
        all_chunks.extend(split_chunks)
    return all_chunks