You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
119 lines
4.5 KiB
119 lines
4.5 KiB
import os
|
|
from typing import List, Union
|
|
|
|
from tqdm import tqdm
|
|
|
|
# from deepsearcher.configuration import embedding_model, vector_db, file_loader
|
|
from deepsearcher import configuration
|
|
from deepsearcher.loader.splitter import split_docs_to_chunks
|
|
|
|
|
|
def load_from_local_files(
|
|
paths_or_directory: Union[str, List[str]],
|
|
collection_name: str = None,
|
|
collection_description: str = None,
|
|
force_new_collection: bool = False,
|
|
chunk_size: int = 1500,
|
|
chunk_overlap: int = 100,
|
|
batch_size: int = 256,
|
|
):
|
|
"""
|
|
Load knowledge from local files or directories into the vector database.
|
|
|
|
This function processes files from the specified paths or directories,
|
|
splits them into chunks, embeds the chunks, and stores them in the vector database.
|
|
|
|
Args:
|
|
paths_or_directory: A single path or a list of paths to files or directories to load.
|
|
collection_name: Name of the collection to store the data in. If None, uses the default collection.
|
|
collection_description: Description of the collection. If None, no description is set.
|
|
force_new_collection: If True, drops the existing collection and creates a new one.
|
|
chunk_size: Size of each chunk in characters.
|
|
chunk_overlap: Number of characters to overlap between chunks.
|
|
batch_size: Number of chunks to process at once during embedding.
|
|
|
|
Raises:
|
|
FileNotFoundError: If any of the specified paths do not exist.
|
|
"""
|
|
vector_db = configuration.vector_db
|
|
if collection_name is None:
|
|
collection_name = vector_db.default_collection
|
|
collection_name = collection_name.replace(" ", "_").replace("-", "_")
|
|
embedding_model = configuration.embedding_model
|
|
file_loader = configuration.file_loader
|
|
vector_db.init_collection(
|
|
dim=embedding_model.dimension,
|
|
collection=collection_name,
|
|
description=collection_description,
|
|
force_new_collection=force_new_collection,
|
|
)
|
|
if isinstance(paths_or_directory, str):
|
|
paths_or_directory = [paths_or_directory]
|
|
all_docs = []
|
|
for path in tqdm(paths_or_directory, desc="Loading files"):
|
|
if not os.path.exists(path):
|
|
raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.")
|
|
if os.path.isdir(path):
|
|
docs = file_loader.load_directory(path)
|
|
else:
|
|
docs = file_loader.load_file(path)
|
|
all_docs.extend(docs)
|
|
# print("Splitting docs to chunks...")
|
|
chunks = split_docs_to_chunks(
|
|
all_docs,
|
|
chunk_size=chunk_size,
|
|
chunk_overlap=chunk_overlap,
|
|
)
|
|
|
|
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
|
|
vector_db.insert_data(collection=collection_name, chunks=chunks)
|
|
|
|
|
|
def load_from_website(
|
|
urls: Union[str, List[str]],
|
|
collection_name: str = None,
|
|
collection_description: str = None,
|
|
force_new_collection: bool = False,
|
|
chunk_size: int = 1500,
|
|
chunk_overlap: int = 100,
|
|
batch_size: int = 256,
|
|
**crawl_kwargs,
|
|
):
|
|
"""
|
|
Load knowledge from websites into the vector database.
|
|
|
|
This function crawls the specified URLs, processes the content,
|
|
splits it into chunks, embeds the chunks, and stores them in the vector database.
|
|
|
|
Args:
|
|
urls: A single URL or a list of URLs to crawl.
|
|
collection_name: Name of the collection to store the data in. If None, uses the default collection.
|
|
collection_description: Description of the collection. If None, no description is set.
|
|
force_new_collection: If True, drops the existing collection and creates a new one.
|
|
chunk_size: Size of each chunk in characters.
|
|
chunk_overlap: Number of characters to overlap between chunks.
|
|
batch_size: Number of chunks to process at once during embedding.
|
|
**crawl_kwargs: Additional keyword arguments to pass to the web crawler.
|
|
"""
|
|
if isinstance(urls, str):
|
|
urls = [urls]
|
|
vector_db = configuration.vector_db
|
|
embedding_model = configuration.embedding_model
|
|
web_crawler = configuration.web_crawler
|
|
|
|
vector_db.init_collection(
|
|
dim=embedding_model.dimension,
|
|
collection=collection_name,
|
|
description=collection_description,
|
|
force_new_collection=force_new_collection,
|
|
)
|
|
|
|
all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs)
|
|
|
|
chunks = split_docs_to_chunks(
|
|
all_docs,
|
|
chunk_size=chunk_size,
|
|
chunk_overlap=chunk_overlap,
|
|
)
|
|
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
|
|
vector_db.insert_data(collection=collection_name, chunks=chunks)
|
|
|