import hashlib
import os

from tqdm import tqdm

# from deepsearcher.configuration import embedding_model, vector_db, file_loader
from deepsearcher import configuration
from deepsearcher.loader.splitter import split_docs_to_chunks

def load_from_local_files(
    paths_or_directory: str | list[str],
    collection_name: str | None = None,
    collection_description: str | None = None,
    force_rebuild: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
):
    """
    Load knowledge from local files or directories into the vector database.

    This function processes files from the specified paths or directories,
    splits them into chunks, embeds the chunks, and stores them in the vector database.

    Args:
        paths_or_directory: A single path or a list of paths to files or directories to load.
        collection_name: Name of the collection to store the data in. If None, uses the default collection.
        collection_description: Description of the collection. If None, no description is set.
        force_rebuild: If True, drops the existing collection and recreates it, so no stale or duplicate data remains.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.
        batch_size: Number of chunks to process at once during embedding.

    Raises:
        FileNotFoundError: If any of the specified paths do not exist.
    """
    vector_db = configuration.vector_db
    if collection_name is None:
        collection_name = vector_db.default_collection
    collection_name = collection_name.replace(" ", "_").replace("-", "_")
    embedding_model = configuration.embedding_model
    file_loader = configuration.file_loader

    # Initialize the collection (created if it does not exist, rebuilt if force_rebuild is True).
    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_rebuild=force_rebuild,
    )

    if isinstance(paths_or_directory, str):
        paths_or_directory = [paths_or_directory]
    all_docs = []
    for path in tqdm(paths_or_directory, desc="Loading files"):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.")
        if os.path.isdir(path):
            docs = file_loader.load_directory(path)
        else:
            docs = file_loader.load_file(path)
        all_docs.extend(docs)
    # print("Splitting docs to chunks...")
    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Compute a SHA256 hash of each chunk's text to serve as its primary key, and skip
    # chunks whose hash has already been seen in this load so duplicates are not inserted.
    seen_hashes = set()
    unique_chunks = []
    for chunk in chunks:
        # Compute the SHA256 hash of the chunk text.
        sha256_hash = hashlib.sha256(chunk.text.encode('utf-8')).hexdigest()
        if sha256_hash in seen_hashes:
            continue
        seen_hashes.add(sha256_hash)
        # Store the hash in the chunk's metadata as its id.
        chunk.metadata['id'] = sha256_hash
        unique_chunks.append(chunk)

    unique_chunks = embedding_model.embed_chunks(unique_chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=unique_chunks)
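
# Illustrative usage (not part of this module's API): the paths and collection name
# below are hypothetical, and the call assumes the global configuration (vector DB,
# embedding model, file loader) has already been initialized via
# `deepsearcher.configuration` before the loader is invoked.
#
#     load_from_local_files(
#         ["./docs", "./notes/readme.md"],
#         collection_name="my_knowledge_base",
#         force_rebuild=False,
#     )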


def load_from_website(
    urls: str | list[str],
    collection_name: str | None = None,
    collection_description: str | None = None,
    force_rebuild: bool = False,
    chunk_size: int = 1500,
    chunk_overlap: int = 100,
    batch_size: int = 256,
    **crawl_kwargs,
):
    """
    Load knowledge from websites into the vector database.

    This function crawls the specified URLs, processes the content,
    splits it into chunks, embeds the chunks, and stores them in the vector database.

    Args:
        urls: A single URL or a list of URLs to crawl.
        collection_name: Name of the collection to store the data in. If None, uses the default collection.
        collection_description: Description of the collection. If None, no description is set.
        force_rebuild: If True, drops the existing collection and creates a new one.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters to overlap between chunks.
        batch_size: Number of chunks to process at once during embedding.
        **crawl_kwargs: Additional keyword arguments to pass to the web crawler.
    """
    if isinstance(urls, str):
        urls = [urls]
    vector_db = configuration.vector_db
    embedding_model = configuration.embedding_model
    web_crawler = configuration.web_crawler

    vector_db.init_collection(
        dim=embedding_model.dimension,
        collection=collection_name,
        description=collection_description,
        force_rebuild=force_rebuild,
    )

    all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs)

    chunks = split_docs_to_chunks(
        all_docs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
    vector_db.insert_data(collection=collection_name, chunks=chunks)
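

if __name__ == "__main__":
    # Minimal usage sketch, assuming the configuration module (vector DB, embedding
    # model, web crawler) has been initialized elsewhere before these loaders are
    # called. The URL and collection name are hypothetical placeholders.
    load_from_website(
        "https://example.com/docs",
        collection_name="example_site_docs",
        force_rebuild=True,
    )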