import hashlib
import os
from tqdm import tqdm
# from deepsearcher.configuration import embedding_model, vector_db, file_loader
from deepsearcher import configuration
from deepsearcher.loader.splitter import split_docs_to_chunks
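
# Note (assumption): configuration.vector_db, configuration.embedding_model,
# configuration.file_loader and configuration.web_crawler are expected to be
# populated by deepsearcher's configuration setup before these loaders are called.
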
def load_from_local_files(
    paths_or_directory: str | list[str],
    collection_name: str | None = None,
    collection_description: str | None = None,
    force_rebuild: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
):
"""
Load knowledge from local files or directories into the vector database.
This function processes files from the specified paths or directories,
splits them into chunks, embeds the chunks, and stores them in the vector database.
Args:
paths_or_directory: A single path or a list of paths to files or directories to load.
collection_name: Name of the collection to store the data in. If None, uses the default collection.
collection_description: Description of the collection. If None, no description is set.
force_rebuild: If True, drops the existing collection and creates a new one.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
batch_size: Number of chunks to process at once during embedding.
force_rebuild: If True, clears the existing collection and ensures no duplicates are inserted.
Raises:
FileNotFoundError: If any of the specified paths do not exist.
"""
    vector_db = configuration.vector_db
    if collection_name is None:
        collection_name = vector_db.default_collection
    collection_name = collection_name.replace(" ", "_").replace("-", "_")
    embedding_model = configuration.embedding_model
    file_loader = configuration.file_loader
    # Initialize the collection (create it if it does not exist; rebuild it if force_rebuild is True)
    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_rebuild=force_rebuild,
    )
    if isinstance(paths_or_directory, str):
        paths_or_directory = [paths_or_directory]
    all_docs = []
    for path in tqdm(paths_or_directory, desc="Loading files"):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.")
        if os.path.isdir(path):
            docs = file_loader.load_directory(path)
        else:
            docs = file_loader.load_file(path)
        all_docs.extend(docs)
    # print("Splitting docs to chunks...")
    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Compute a SHA256 hash for each chunk as its primary key and check for duplicates
    unique_chunks = []
    for chunk in chunks:
        # Compute the SHA256 hash of the chunk text
        sha256_hash = hashlib.sha256(chunk.text.encode('utf-8')).hexdigest()
        # Store the hash in the chunk's metadata as its id
        chunk.metadata['id'] = sha256_hash
        unique_chunks.append(chunk)
    unique_chunks = embedding_model.embed_chunks(unique_chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=unique_chunks)
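
# Usage sketch (hedged example): assumes deepsearcher's configuration has already
# populated configuration.vector_db, configuration.embedding_model and
# configuration.file_loader; the "./docs" path and collection name are hypothetical.
#
#   load_from_local_files(
#       paths_or_directory="./docs",
#       collection_name="my_docs",
#       force_rebuild=True,
#   )
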
def load_from_website(
    urls: str | list[str],
    collection_name: str | None = None,
    collection_description: str | None = None,
    force_rebuild: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
    **crawl_kwargs,
):
"""
Load knowledge from websites into the vector database.
This function crawls the specified URLs, processes the content,
splits it into chunks, embeds the chunks, and stores them in the vector database.
Args:
urls: A single URL or a list of URLs to crawl.
collection_name: Name of the collection to store the data in. If None, uses the default collection.
collection_description: Description of the collection. If None, no description is set.
force_rebuild: If True, drops the existing collection and creates a new one.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
batch_size: Number of chunks to process at once during embedding.
**crawl_kwargs: Additional keyword arguments to pass to the web crawler.
"""
    if isinstance(urls, str):
        urls = [urls]
    vector_db = configuration.vector_db
    embedding_model = configuration.embedding_model
    web_crawler = configuration.web_crawler
    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_rebuild=force_rebuild,
    )
    all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs)
    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=chunks)
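
# Usage sketch (hedged example): assumes configuration.web_crawler has been set up
# and accepts plain URLs; the URL and collection name below are hypothetical.
#
#   load_from_website(
#       urls="https://example.com/docs",
#       collection_name="web_docs",
#   )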