import hashlib
import os

from tqdm import tqdm

# from deepsearcher.configuration import embedding_model, vector_db, file_loader
from deepsearcher import configuration
from deepsearcher.loader.splitter import split_docs_to_chunks

def load_from_local_files(
    paths_or_directory: str | list[str],
    collection_name: str | None = None,
    collection_description: str | None = None,
    force_rebuild: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
):
    """
    Load knowledge from local files or directories into the vector database.

    This function processes files from the specified paths or directories,
    splits them into chunks, embeds the chunks, and stores them in the vector database.

    Args:
        paths_or_directory: A single path or a list of paths to files or directories to load.
        collection_name: Name of the collection to store the data in. If None, uses the default collection.
        collection_description: Description of the collection. If None, no description is set.
        force_rebuild: If True, drops the existing collection and recreates it, so no stale or duplicate data remains.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.
        batch_size: Number of chunks to process at once during embedding.

    Raises:
        FileNotFoundError: If any of the specified paths do not exist.
    """
    vector_db = configuration.vector_db
    if collection_name is None:
        collection_name = vector_db.default_collection
    collection_name = collection_name.replace(" ", "_").replace("-", "_")
    embedding_model = configuration.embedding_model
    file_loader = configuration.file_loader

    # Initialize the collection (created if it does not exist, rebuilt if force_rebuild is True).
    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_rebuild=force_rebuild,
    )

    if isinstance(paths_or_directory, str):
        paths_or_directory = [paths_or_directory]
    all_docs = []
    for path in tqdm(paths_or_directory, desc="Loading files"):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.")
        if os.path.isdir(path):
            docs = file_loader.load_directory(path)
        else:
            docs = file_loader.load_file(path)
        all_docs.extend(docs)
    # print("Splitting docs to chunks...")
    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Compute a SHA256 hash of each chunk's text to serve as its primary key, and skip
    # chunks whose hash has already been seen in this load so duplicates are not inserted.
    seen_hashes = set()
    unique_chunks = []
    for chunk in chunks:
        # Compute the SHA256 hash of the chunk text.
        sha256_hash = hashlib.sha256(chunk.text.encode('utf-8')).hexdigest()
        if sha256_hash in seen_hashes:
            continue
        seen_hashes.add(sha256_hash)
        # Store the hash in the chunk's metadata as its id.
        chunk.metadata['id'] = sha256_hash
        unique_chunks.append(chunk)

    unique_chunks = embedding_model.embed_chunks(unique_chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=unique_chunks)
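
# Illustrative usage (not part of this module's API): the paths and collection name
# below are hypothetical, and the call assumes the global configuration (vector DB,
# embedding model, file loader) has already been initialized via
# `deepsearcher.configuration` before the loader is invoked.
#
#     load_from_local_files(
#         ["./docs", "./notes/readme.md"],
#         collection_name="my_knowledge_base",
#         force_rebuild=False,
#     )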


def load_from_website(
    urls: str | list[str],
    collection_name: str | None = None,
    collection_description: str | None = None,
    force_rebuild: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
    **crawl_kwargs,
):
    """
    Load knowledge from websites into the vector database.

    This function crawls the specified URLs, processes the content,
    splits it into chunks, embeds the chunks, and stores them in the vector database.

    Args:
        urls: A single URL or a list of URLs to crawl.
        collection_name: Name of the collection to store the data in. If None, uses the default collection.
        collection_description: Description of the collection. If None, no description is set.
        force_rebuild: If True, drops the existing collection and creates a new one.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.
        batch_size: Number of chunks to process at once during embedding.
        **crawl_kwargs: Additional keyword arguments to pass to the web crawler.
    """
    if isinstance(urls, str):
        urls = [urls]
    vector_db = configuration.vector_db
    embedding_model = configuration.embedding_model
    web_crawler = configuration.web_crawler

    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_rebuild=force_rebuild,
    )

    all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs)

    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=chunks)
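

if __name__ == "__main__":
    # Minimal usage sketch, assuming the configuration module (vector DB, embedding
    # model, web crawler) has been initialized elsewhere before these loaders are
    # called. The URL and collection name are hypothetical placeholders.
    load_from_website(
        "https://example.com/docs",
        collection_name="example_site_docs",
        force_rebuild=True,
    )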