import hashlib
import os
from typing import List, Union

from tqdm import tqdm

# from deepsearcher.configuration import embedding_model, vector_db, file_loader
from deepsearcher import configuration
from deepsearcher.loader.splitter import split_docs_to_chunks


def load_from_local_files(
    paths_or_directory: Union[str, List[str]],
    collection_name: str = None,
    collection_description: str = None,
    force_new_collection: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
    force_rebuild: bool = False,
):
    """
    Load knowledge from local files or directories into the vector database.

    This function processes files from the specified paths or directories, splits them
    into chunks, embeds the chunks, and stores them in the vector database.

    Args:
        paths_or_directory: A single path or a list of paths to files or directories to load.
        collection_name: Name of the collection to store the data in. If None, uses the default collection.
        collection_description: Description of the collection. If None, no description is set.
        force_new_collection: If True, drops the existing collection and creates a new one.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.
        batch_size: Number of chunks to process at once during embedding.
        force_rebuild: If True, clears the existing collection and ensures no duplicates are inserted.

    Raises:
        FileNotFoundError: If any of the specified paths do not exist.
    """
    vector_db = configuration.vector_db
    if collection_name is None:
        collection_name = vector_db.default_collection
    collection_name = collection_name.replace(" ", "_").replace("-", "_")
    embedding_model = configuration.embedding_model
    file_loader = configuration.file_loader

    # Rebuild the collection from scratch when either flag requests it.
    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_new_collection=force_new_collection or force_rebuild,
    )

    if isinstance(paths_or_directory, str):
        paths_or_directory = [paths_or_directory]

    all_docs = []
    for path in tqdm(paths_or_directory, desc="Loading files"):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.")
        if os.path.isdir(path):
            docs = file_loader.load_directory(path)
        else:
            docs = file_loader.load_file(path)
        all_docs.extend(docs)

    # print("Splitting docs to chunks...")
    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Compute a SHA-256 hash of each chunk's text to use as its primary key, and skip
    # chunks whose text has already been seen in this load so no duplicates are inserted.
    unique_chunks = []
    seen_hashes = set()
    for chunk in chunks:
        sha256_hash = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
        if sha256_hash in seen_hashes:
            continue
        seen_hashes.add(sha256_hash)
        chunk.metadata["id"] = sha256_hash
        unique_chunks.append(chunk)

    unique_chunks = embedding_model.embed_chunks(unique_chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=unique_chunks)
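
# --- Usage sketch (illustrative, not executed on import) ---------------------------
# A minimal example of loading a local directory plus a single file into a collection.
# It assumes the global configuration has already been initialized elsewhere (so that
# configuration.vector_db, configuration.embedding_model, and configuration.file_loader
# are set); the paths and collection name below are hypothetical.
#
#     load_from_local_files(
#         paths_or_directory=["./docs", "./notes/report.pdf"],
#         collection_name="my knowledge-base",  # normalized to "my_knowledge_base"
#         collection_description="Local project documents",
#         chunk_size=1500,
#         chunk_overlap=100,
#     )
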
def load_from_website(
    urls: Union[str, List[str]],
    collection_name: str = None,
    collection_description: str = None,
    force_new_collection: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
    **crawl_kwargs,
):
    """
    Load knowledge from websites into the vector database.

    This function crawls the specified URLs, processes the content, splits it into
    chunks, embeds the chunks, and stores them in the vector database.

    Args:
        urls: A single URL or a list of URLs to crawl.
        collection_name: Name of the collection to store the data in. If None, uses the default collection.
        collection_description: Description of the collection. If None, no description is set.
        force_new_collection: If True, drops the existing collection and creates a new one.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.
        batch_size: Number of chunks to process at once during embedding.
        **crawl_kwargs: Additional keyword arguments to pass to the web crawler.
    """
    if isinstance(urls, str):
        urls = [urls]
    vector_db = configuration.vector_db
    embedding_model = configuration.embedding_model
    web_crawler = configuration.web_crawler

    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_new_collection=force_new_collection,
    )

    all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs)

    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=chunks)
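
# --- Usage sketch (illustrative, not executed on import) ---------------------------
# A minimal example of crawling a URL into a collection. It assumes the global
# configuration has already been initialized (so that configuration.web_crawler is set)
# and that the configured crawler accepts any extra keyword arguments forwarded via
# **crawl_kwargs; the URL and collection name below are hypothetical.
#
#     load_from_website(
#         urls="https://example.com/docs",
#         collection_name="web_docs",
#         collection_description="Crawled documentation pages",
#         batch_size=256,
#     )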