36 changed files with 12 additions and 2627 deletions
@ -1,536 +0,0 @@ |
|||
import array |
|||
import json |
|||
from typing import List, Optional, Union |
|||
|
|||
import numpy as np |
|||
|
|||
from deepsearcher.loader.splitter import Chunk |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult |
|||
|
|||
|
|||
class OracleDB(BaseVectorDB): |
|||
"""OracleDB class is a subclass of DB class.""" |
|||
|
|||
client = None |
|||
|
|||
def __init__( |
|||
self, |
|||
user: str, |
|||
password: str, |
|||
dsn: str, |
|||
config_dir: str, |
|||
wallet_location: str, |
|||
wallet_password: str, |
|||
min: int = 1, |
|||
max: int = 10, |
|||
increment: int = 1, |
|||
default_collection: str = "deepsearcher", |
|||
): |
|||
""" |
|||
Initialize the Oracle database connection. |
|||
|
|||
Args: |
|||
user (str): Oracle database username. |
|||
password (str): Oracle database password. |
|||
dsn (str): Oracle database connection string. |
|||
config_dir (str): Directory containing Oracle configuration files. |
|||
wallet_location (str): Location of the Oracle wallet. |
|||
wallet_password (str): Password for the Oracle wallet. |
|||
min (int, optional): Minimum number of connections in the pool. Defaults to 1. |
|||
max (int, optional): Maximum number of connections in the pool. Defaults to 10. |
|||
increment (int, optional): Increment for adding new connections. Defaults to 1. |
|||
default_collection (str, optional): Default collection name. Defaults to "deepsearcher". |
|||
""" |
|||
super().__init__(default_collection) |
|||
self.default_collection = default_collection |
|||
|
|||
import oracledb |
|||
|
|||
oracledb.defaults.fetch_lobs = False |
|||
self.DB_TYPE_VECTOR = oracledb.DB_TYPE_VECTOR |
|||
|
|||
try: |
|||
self.client = oracledb.create_pool( |
|||
user=user, |
|||
password=password, |
|||
dsn=dsn, |
|||
config_dir=config_dir, |
|||
wallet_location=wallet_location, |
|||
wallet_password=wallet_password, |
|||
min=min, |
|||
max=max, |
|||
increment=increment, |
|||
) |
|||
log.color_print(f"Connected to Oracle database at {dsn}") |
|||
self.check_table() |
|||
except Exception as e: |
|||
log.critical(f"Failed to connect to Oracle database at {dsn}") |
|||
log.critical(f"Oracle database error in init: {e}") |
|||
raise |
|||
|
|||
def numpy_converter_in(self, value): |
|||
"""Convert numpy array to array.array""" |
|||
if value.dtype == np.float64: |
|||
dtype = "d" |
|||
elif value.dtype == np.float32: |
|||
dtype = "f" |
|||
else: |
|||
dtype = "b" |
|||
return array.array(dtype, value) |
|||
|
|||
def input_type_handler(self, cursor, value, arraysize): |
|||
"""Set the type handler for the input data""" |
|||
if isinstance(value, np.ndarray): |
|||
return cursor.var( |
|||
self.DB_TYPE_VECTOR, |
|||
arraysize=arraysize, |
|||
inconverter=self.numpy_converter_in, |
|||
) |
|||
|
|||
def numpy_converter_out(self, value): |
|||
"""Convert array.array to numpy array""" |
|||
if value.typecode == "b": |
|||
dtype = np.int8 |
|||
elif value.typecode == "f": |
|||
dtype = np.float32 |
|||
else: |
|||
dtype = np.float64 |
|||
return np.array(value, copy=False, dtype=dtype) |
|||
|
|||
def output_type_handler(self, cursor, metadata): |
|||
"""Set the type handler for the output data""" |
|||
if metadata.type_code is self.DB_TYPE_VECTOR: |
|||
return cursor.var( |
|||
metadata.type_code, |
|||
arraysize=cursor.arraysize, |
|||
outconverter=self.numpy_converter_out, |
|||
) |
|||
|
|||
def query(self, sql: str, params: dict = None) -> Union[dict, None]: |
|||
""" |
|||
Execute a SQL query and return the results. |
|||
|
|||
Args: |
|||
sql (str): SQL query to execute. |
|||
params (dict, optional): Parameters for the SQL query. Defaults to None. |
|||
|
|||
Returns: |
|||
Union[dict, None]: Query results as a dictionary or None if no results. |
|||
|
|||
Raises: |
|||
Exception: If there's an error executing the query. |
|||
""" |
|||
with self.client.acquire() as connection: |
|||
connection.inputtypehandler = self.input_type_handler |
|||
connection.outputtypehandler = self.output_type_handler |
|||
with connection.cursor() as cursor: |
|||
try: |
|||
if log.dev_mode: |
|||
print("sql:\n", sql) |
|||
# log.debug("def query:"+params) |
|||
# print("sql:\n",sql) |
|||
# print("params:\n",params) |
|||
cursor.execute(sql, params) |
|||
except Exception as e: |
|||
log.critical(f"Oracle database error in query: {e}") |
|||
raise |
|||
columns = [column[0].lower() for column in cursor.description] |
|||
rows = cursor.fetchall() |
|||
if rows: |
|||
data = [dict(zip(columns, row)) for row in rows] |
|||
else: |
|||
data = [] |
|||
if log.dev_mode: |
|||
print("data:\n", data) |
|||
return data |
|||
# self.client.drop(connection) |
|||
|
|||
def execute(self, sql: str, data: Union[list, dict] = None): |
|||
""" |
|||
Execute a SQL statement without returning results. |
|||
|
|||
Args: |
|||
sql (str): SQL statement to execute. |
|||
data (Union[list, dict], optional): Data for the SQL statement. Defaults to None. |
|||
|
|||
Raises: |
|||
Exception: If there's an error executing the statement. |
|||
""" |
|||
try: |
|||
with self.client.acquire() as connection: |
|||
connection.inputtypehandler = self.input_type_handler |
|||
connection.outputtypehandler = self.output_type_handler |
|||
with connection.cursor() as cursor: |
|||
# print("sql:\n",sql) |
|||
# print("data:\n",data) |
|||
if data is None: |
|||
cursor.execute(sql) |
|||
else: |
|||
cursor.execute(sql, data) |
|||
connection.commit() |
|||
except Exception as e: |
|||
log.critical(f"Oracle database error in execute: {e}") |
|||
log.error("ERROR sql:\n" + sql) |
|||
log.error("ERROR data:\n" + data) |
|||
raise |
|||
|
|||
def has_collection(self, collection: str = "deepsearcher"): |
|||
""" |
|||
Check if a collection exists in the database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to check. Defaults to "deepsearcher". |
|||
|
|||
Returns: |
|||
bool: True if the collection exists, False otherwise. |
|||
""" |
|||
SQL = SQL_TEMPLATES["has_collection"] |
|||
params = {"collection": collection} |
|||
res = self.query(SQL, params) |
|||
if res: |
|||
if res[0]["rowcnt"] > 0: |
|||
return True |
|||
else: |
|||
return False |
|||
else: |
|||
return False |
|||
|
|||
def check_table(self): |
|||
""" |
|||
Check if required tables exist and create them if they don't. |
|||
|
|||
Raises: |
|||
Exception: If there's an error checking or creating tables. |
|||
""" |
|||
SQL = SQL_TEMPLATES["has_table"] |
|||
try: |
|||
res = self.query(SQL) |
|||
if len(res) < 2: |
|||
missing_table = TABLES.keys() - set([i["table_name"] for i in res]) |
|||
for table in missing_table: |
|||
self.create_tables(table) |
|||
except Exception as e: |
|||
log.critical(f"Failed to check table in Oracle database, error info: {e}") |
|||
raise |
|||
|
|||
def create_tables(self, table_name): |
|||
""" |
|||
Create a table in the database. |
|||
|
|||
Args: |
|||
table_name: Name of the table to create. |
|||
|
|||
Raises: |
|||
Exception: If there's an error creating the table. |
|||
""" |
|||
SQL = TABLES[table_name] |
|||
try: |
|||
self.execute(SQL) |
|||
log.color_print(f"Created table {table_name} in Oracle database") |
|||
except Exception as e: |
|||
log.critical(f"Failed to create table {table_name} in Oracle database, error info: {e}") |
|||
raise |
|||
|
|||
def drop_collection(self, collection: str = "deepsearcher"): |
|||
""" |
|||
Drop a collection from the database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to drop. Defaults to "deepsearcher". |
|||
|
|||
Raises: |
|||
Exception: If there's an error dropping the collection. |
|||
""" |
|||
try: |
|||
params = {"collection": collection} |
|||
SQL = SQL_TEMPLATES["drop_collection"] |
|||
self.execute(SQL, params) |
|||
|
|||
SQL = SQL_TEMPLATES["drop_collection_item"] |
|||
self.execute(SQL, params) |
|||
log.color_print(f"Collection {collection} dropped") |
|||
except Exception as e: |
|||
log.critical(f"fail to drop collection, error info: {e}") |
|||
raise |
|||
|
|||
def insertone(self, data): |
|||
""" |
|||
Insert a single record into the database. |
|||
|
|||
Args: |
|||
data: Data to insert. |
|||
""" |
|||
SQL = SQL_TEMPLATES["insert"] |
|||
self.execute(SQL, data) |
|||
log.debug("insert done!") |
|||
|
|||
def searchone( |
|||
self, |
|||
collection: Optional[str], |
|||
vector: Union[np.array, List[float]], |
|||
top_k: int = 5, |
|||
): |
|||
""" |
|||
Search for similar vectors in a collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name to search in. |
|||
vector (Union[np.array, List[float]]): Query vector for similarity search. |
|||
top_k (int, optional): Number of results to return. Defaults to 5. |
|||
|
|||
Returns: |
|||
list: List of search results. |
|||
|
|||
Raises: |
|||
Exception: If there's an error during search. |
|||
""" |
|||
log.debug("def searchone:" + collection) |
|||
try: |
|||
if isinstance(vector, List): |
|||
vector = np.array(vector) |
|||
embedding_string = "[" + ", ".join(map(str, vector.tolist())) + "]" |
|||
dimension = vector.shape[0] |
|||
dtype = str(vector.dtype).upper() |
|||
|
|||
SQL = SQL_TEMPLATES["search"].format(dimension=dimension, dtype=dtype) |
|||
max_distance = 0.8 |
|||
params = { |
|||
"collection": collection, |
|||
"embedding_string": embedding_string, |
|||
"top_k": top_k, |
|||
"max_distance": max_distance, |
|||
} |
|||
res = self.query(SQL, params) |
|||
if res: |
|||
return res |
|||
else: |
|||
return [] |
|||
except Exception as e: |
|||
log.critical(f"fail to search data, error info: {e}") |
|||
raise |
|||
|
|||
def init_collection( |
|||
self, |
|||
dim: int, |
|||
collection: Optional[str] = "deepsearcher", |
|||
description: Optional[str] = "", |
|||
force_new_collection: bool = False, |
|||
text_max_length: int = 65_535, |
|||
reference_max_length: int = 2048, |
|||
metric_type: str = "L2", |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize a collection in the database. |
|||
|
|||
Args: |
|||
dim (int): Dimension of the vector embeddings. |
|||
collection (Optional[str], optional): Collection name. Defaults to "deepsearcher". |
|||
description (Optional[str], optional): Collection description. Defaults to "". |
|||
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False. |
|||
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535. |
|||
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048. |
|||
metric_type (str, optional): Metric type for vector similarity search. Defaults to "L2". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Raises: |
|||
Exception: If there's an error initializing the collection. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
if description is None: |
|||
description = "" |
|||
try: |
|||
has_collection = self.has_collection(collection) |
|||
if force_new_collection and has_collection: |
|||
self.drop_collection(collection) |
|||
elif has_collection: |
|||
return |
|||
# insert collection info |
|||
SQL = SQL_TEMPLATES["insert_collection"] |
|||
params = {"collection": collection, "description": description} |
|||
self.execute(SQL, params) |
|||
except Exception as e: |
|||
log.critical(f"fail to init_collection for oracle, error info: {e}") |
|||
|
|||
def insert_data( |
|||
self, |
|||
collection: Optional[str], |
|||
chunks: List[Chunk], |
|||
batch_size: int = 256, |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Insert data into a collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. If None, uses default_collection. |
|||
chunks (List[Chunk]): List of Chunk objects to insert. |
|||
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Raises: |
|||
Exception: If there's an error inserting data. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
|
|||
datas = [] |
|||
for chunk in chunks: |
|||
_data = { |
|||
"embedding": self.numpy_converter_in(np.array(chunk.embedding)), |
|||
"text": chunk.text, |
|||
"reference": chunk.reference, |
|||
"metadata": json.dumps(chunk.metadata), |
|||
"collection": collection, |
|||
} |
|||
datas.append(_data) |
|||
|
|||
batch_datas = [datas[i : i + batch_size] for i in range(0, len(datas), batch_size)] |
|||
try: |
|||
for batch_data in batch_datas: |
|||
for _data in batch_data: |
|||
self.insertone(data=_data) |
|||
log.color_print(f"Successfully insert {len(datas)} data") |
|||
except Exception as e: |
|||
log.critical(f"fail to insert data, error info: {e}") |
|||
raise |
|||
|
|||
def search_data( |
|||
self, |
|||
collection: Optional[str], |
|||
vector: Union[np.array, List[float]], |
|||
top_k: int = 5, |
|||
*args, |
|||
**kwargs, |
|||
) -> List[RetrievalResult]: |
|||
""" |
|||
Search for similar vectors in a collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. If None, uses default_collection. |
|||
vector (Union[np.array, List[float]]): Query vector for similarity search. |
|||
top_k (int, optional): Number of results to return. Defaults to 5. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[RetrievalResult]: List of retrieval results containing similar vectors. |
|||
|
|||
Raises: |
|||
Exception: If there's an error during search. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
try: |
|||
# print("def search_data:",collection) |
|||
# print("def search_data:",type(vector)) |
|||
search_results = self.searchone(collection=collection, vector=vector, top_k=top_k) |
|||
# print("def search_data: search_results",search_results) |
|||
|
|||
return [ |
|||
RetrievalResult( |
|||
embedding=b["embedding"], |
|||
text=b["text"], |
|||
reference=b["reference"], |
|||
score=b["distance"], |
|||
metadata=json.loads(b["metadata"]), |
|||
) |
|||
for b in search_results |
|||
] |
|||
except Exception as e: |
|||
log.critical(f"fail to search data, error info: {e}") |
|||
raise |
|||
# return [] |
|||
|
|||
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]: |
|||
""" |
|||
List all collections in the database. |
|||
|
|||
Args: |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[CollectionInfo]: List of collection information objects. |
|||
""" |
|||
collection_infos = [] |
|||
try: |
|||
SQL = SQL_TEMPLATES["list_collections"] |
|||
log.debug("def list_collections:" + SQL) |
|||
collections = self.query(SQL) |
|||
if collections: |
|||
for collection in collections: |
|||
collection_infos.append( |
|||
CollectionInfo( |
|||
collection_name=collection["collection"], |
|||
description=collection["description"], |
|||
) |
|||
) |
|||
return collection_infos |
|||
except Exception as e: |
|||
log.critical(f"fail to list collections, error info: {e}") |
|||
raise |
|||
|
|||
def clear_db(self, collection: str = "deepsearcher", *args, **kwargs): |
|||
""" |
|||
Clear (drop) a collection from the database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to drop. Defaults to "deepsearcher". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
try: |
|||
self.client.drop_collection(collection) |
|||
except Exception as e: |
|||
log.warning(f"fail to clear db, error info: {e}") |
|||
raise |
|||
|
|||
|
|||
TABLES = { |
|||
"DEEPSEARCHER_COLLECTION_INFO": """CREATE TABLE DEEPSEARCHER_COLLECTION_INFO ( |
|||
id INT generated by default as identity primary key, |
|||
collection varchar(256), |
|||
description CLOB, |
|||
status NUMBER DEFAULT 1, |
|||
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, |
|||
updatetime TIMESTAMP DEFAULT NULL)""", |
|||
"DEEPSEARCHER_COLLECTION_ITEM": """CREATE TABLE DEEPSEARCHER_COLLECTION_ITEM ( |
|||
id INT generated by default as identity primary key, |
|||
collection varchar(256), |
|||
embedding VECTOR, |
|||
text CLOB, |
|||
reference varchar(4000), |
|||
metadata CLOB, |
|||
status NUMBER DEFAULT 1, |
|||
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, |
|||
updatetime TIMESTAMP DEFAULT NULL)""", |
|||
} |
|||
|
|||
SQL_TEMPLATES = { |
|||
"has_table": f"""SELECT table_name FROM all_tables |
|||
WHERE table_name in ({",".join([f"'{k}'" for k in TABLES.keys()])})""", |
|||
"has_collection": "select count(*) as rowcnt from DEEPSEARCHER_COLLECTION_INFO where collection=:collection and status=1", |
|||
"list_collections": "select collection,description from DEEPSEARCHER_COLLECTION_INFO where status=1", |
|||
"drop_collection": "update DEEPSEARCHER_COLLECTION_INFO set status=0 where collection=:collection and status=1", |
|||
"drop_collection_item": "update DEEPSEARCHER_COLLECTION_ITEM set status=0 where collection=:collection and status=1", |
|||
"insert_collection": """INSERT INTO DEEPSEARCHER_COLLECTION_INFO (collection,description) |
|||
values (:collection,:description)""", |
|||
"insert": """INSERT INTO DEEPSEARCHER_COLLECTION_ITEM (collection,embedding,text,reference,metadata) |
|||
values (:collection,:embedding,:text,:reference,:metadata)""", |
|||
"search": """SELECT * FROM |
|||
(SELECT t.*, |
|||
VECTOR_DISTANCE(t.embedding,vector(:embedding_string,{dimension},{dtype}),COSINE) as distance |
|||
FROM DEEPSEARCHER_COLLECTION_ITEM t |
|||
JOIN DEEPSEARCHER_COLLECTION_INFO c ON t.collection=c.collection |
|||
WHERE t.collection=:collection AND t.status=1 AND c.status=1) |
|||
WHERE distance<:max_distance ORDER BY distance ASC FETCH FIRST :top_k ROWS ONLY""", |
|||
} |
@ -1,290 +0,0 @@ |
|||
import uuid |
|||
from typing import List, Optional, Union |
|||
|
|||
import numpy as np |
|||
|
|||
from deepsearcher.loader.splitter import Chunk |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult |
|||
|
|||
DEFAULT_COLLECTION_NAME = "deepsearcher" |
|||
|
|||
TEXT_PAYLOAD_KEY = "text" |
|||
REFERENCE_PAYLOAD_KEY = "reference" |
|||
METADATA_PAYLOAD_KEY = "metadata" |
|||
|
|||
|
|||
class Qdrant(BaseVectorDB): |
|||
"""Vector DB implementation powered by [Qdrant](https://qdrant.tech/)""" |
|||
|
|||
def __init__( |
|||
self, |
|||
location: Optional[str] = None, |
|||
url: Optional[str] = None, |
|||
port: Optional[int] = 6333, |
|||
grpc_port: int = 6334, |
|||
prefer_grpc: bool = False, |
|||
https: Optional[bool] = None, |
|||
api_key: Optional[str] = None, |
|||
prefix: Optional[str] = None, |
|||
timeout: Optional[int] = None, |
|||
host: Optional[str] = None, |
|||
path: Optional[str] = None, |
|||
default_collection: str = DEFAULT_COLLECTION_NAME, |
|||
): |
|||
""" |
|||
Initialize the Qdrant client with flexible connection options. |
|||
|
|||
Args: |
|||
location (Optional[str], optional): |
|||
- If ":memory:" - use in-memory Qdrant instance. |
|||
- If str - use it as a URL parameter. |
|||
- If None - use default values for host and port. |
|||
Defaults to None. |
|||
|
|||
url (Optional[str], optional): |
|||
URL for Qdrant service, can include scheme, host, port, and prefix. |
|||
Allows flexible connection string specification. |
|||
Defaults to None. |
|||
|
|||
port (Optional[int], optional): |
|||
Port of the REST API interface. |
|||
Defaults to 6333. |
|||
|
|||
grpc_port (int, optional): |
|||
Port of the gRPC interface. |
|||
Defaults to 6334. |
|||
|
|||
prefer_grpc (bool, optional): |
|||
If True, use gRPC interface whenever possible in custom methods. |
|||
Defaults to False. |
|||
|
|||
https (Optional[bool], optional): |
|||
If True, use HTTPS (SSL) protocol. |
|||
Defaults to None. |
|||
|
|||
api_key (Optional[str], optional): |
|||
API key for authentication in Qdrant Cloud. |
|||
Defaults to None. |
|||
|
|||
prefix (Optional[str], optional): |
|||
If not None, add prefix to the REST URL path. |
|||
Example: 'service/v1' results in 'http://localhost:6333/service/v1/{qdrant-endpoint}' |
|||
Defaults to None. |
|||
|
|||
timeout (Optional[int], optional): |
|||
Timeout for REST and gRPC API requests. |
|||
Default is 5 seconds for REST and unlimited for gRPC. |
|||
Defaults to None. |
|||
|
|||
host (Optional[str], optional): |
|||
Host name of Qdrant service. |
|||
If url and host are None, defaults to 'localhost'. |
|||
Defaults to None. |
|||
|
|||
path (Optional[str], optional): |
|||
Persistence path for QdrantLocal. |
|||
Defaults to None. |
|||
|
|||
default_collection (str, optional): |
|||
Default collection name to be used. |
|||
""" |
|||
try: |
|||
from qdrant_client import QdrantClient |
|||
except ImportError as original_error: |
|||
raise ImportError( |
|||
"Qdrant client is not installed. Install it using: pip install qdrant-client\n" |
|||
) from original_error |
|||
|
|||
super().__init__(default_collection) |
|||
self.client = QdrantClient( |
|||
location=location, |
|||
url=url, |
|||
port=port, |
|||
grpc_port=grpc_port, |
|||
prefer_grpc=prefer_grpc, |
|||
https=https, |
|||
api_key=api_key, |
|||
prefix=prefix, |
|||
timeout=timeout, |
|||
host=host, |
|||
path=path, |
|||
) |
|||
|
|||
def init_collection( |
|||
self, |
|||
dim: int, |
|||
collection: Optional[str] = None, |
|||
description: Optional[str] = "", |
|||
force_new_collection: bool = False, |
|||
text_max_length: int = 65_535, |
|||
reference_max_length: int = 2048, |
|||
distance_metric: str = "Cosine", |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize a collection in Qdrant. |
|||
|
|||
Args: |
|||
dim (int): Dimension of the vector embeddings. |
|||
collection (Optional[str], optional): Collection name. |
|||
description (Optional[str], optional): Collection description. Defaults to "". |
|||
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False. |
|||
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535. |
|||
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048. |
|||
distance_metric (str, optional): Metric type for vector similarity search. Defaults to "Cosine". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
from qdrant_client import models |
|||
|
|||
collection = collection or self.default_collection |
|||
|
|||
try: |
|||
collection_exists = self.client.collection_exists(collection_name=collection) |
|||
|
|||
if force_new_collection and collection_exists: |
|||
self.client.delete_collection(collection_name=collection) |
|||
collection_exists = False |
|||
|
|||
if not collection_exists: |
|||
self.client.create_collection( |
|||
collection_name=collection, |
|||
vectors_config=models.VectorParams(size=dim, distance=distance_metric), |
|||
*args, |
|||
**kwargs, |
|||
) |
|||
|
|||
log.color_print(f"Created collection [{collection}] successfully") |
|||
except Exception as e: |
|||
log.critical(f"Failed to init Qdrant collection, error info: {e}") |
|||
|
|||
def insert_data( |
|||
self, |
|||
collection: Optional[str], |
|||
chunks: List[Chunk], |
|||
batch_size: int = 256, |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Insert data into a Qdrant collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. |
|||
chunks (List[Chunk]): List of Chunk objects to insert. |
|||
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
from qdrant_client import models |
|||
|
|||
try: |
|||
for i in range(0, len(chunks), batch_size): |
|||
batch_chunks = chunks[i : i + batch_size] |
|||
|
|||
points = [ |
|||
models.PointStruct( |
|||
id=uuid.uuid4().hex, |
|||
vector=chunk.embedding, |
|||
payload={ |
|||
TEXT_PAYLOAD_KEY: chunk.text, |
|||
REFERENCE_PAYLOAD_KEY: chunk.reference, |
|||
METADATA_PAYLOAD_KEY: chunk.metadata, |
|||
}, |
|||
) |
|||
for chunk in batch_chunks |
|||
] |
|||
|
|||
self.client.upsert( |
|||
collection_name=collection or self.default_collection, points=points |
|||
) |
|||
except Exception as e: |
|||
log.critical(f"Failed to insert data, error info: {e}") |
|||
|
|||
def search_data( |
|||
self, |
|||
collection: Optional[str], |
|||
vector: Union[np.array, List[float]], |
|||
top_k: int = 5, |
|||
*args, |
|||
**kwargs, |
|||
) -> List[RetrievalResult]: |
|||
""" |
|||
Search for similar vectors in a Qdrant collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name.. |
|||
vector (Union[np.array, List[float]]): Query vector for similarity search. |
|||
top_k (int, optional): Number of results to return. Defaults to 5. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[RetrievalResult]: List of retrieval results containing similar vectors. |
|||
""" |
|||
try: |
|||
results = self.client.query_points( |
|||
collection_name=collection or self.default_collection, |
|||
query=vector, |
|||
limit=top_k, |
|||
with_payload=True, |
|||
with_vectors=True, |
|||
).points |
|||
|
|||
return [ |
|||
RetrievalResult( |
|||
embedding=result.vector, |
|||
text=result.payload.get(TEXT_PAYLOAD_KEY, ""), |
|||
reference=result.payload.get(REFERENCE_PAYLOAD_KEY, ""), |
|||
score=result.score, |
|||
metadata=result.payload.get(METADATA_PAYLOAD_KEY, {}), |
|||
) |
|||
for result in results |
|||
] |
|||
except Exception as e: |
|||
log.critical(f"Failed to search data, error info: {e}") |
|||
return [] |
|||
|
|||
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]: |
|||
""" |
|||
List all collections in the Qdrant database. |
|||
|
|||
Args: |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[CollectionInfo]: List of collection information objects. |
|||
""" |
|||
collection_infos = [] |
|||
|
|||
try: |
|||
collections = self.client.get_collections().collections |
|||
for collection in collections: |
|||
collection_infos.append( |
|||
CollectionInfo( |
|||
collection_name=collection.name, |
|||
# Qdrant doesn't have a native description field |
|||
description=collection.name, |
|||
) |
|||
) |
|||
except Exception as e: |
|||
log.critical(f"Failed to list collections, error info: {e}") |
|||
|
|||
return collection_infos |
|||
|
|||
def clear_db(self, collection: Optional[str] = None, *args, **kwargs): |
|||
""" |
|||
Clear (drop) a collection from the Qdrant database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to drop. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
try: |
|||
self.client.delete_collection(collection_name=collection or self.default_collection) |
|||
except Exception as e: |
|||
log.warning(f"Failed to drop collection, error info: {e}") |
@ -1,42 +0,0 @@ |
|||
# DeepSearcher Documentation |
|||
|
|||
This directory contains the documentation for DeepSearcher, powered by MkDocs. |
|||
|
|||
## Setup |
|||
|
|||
1. Install MkDocs and required plugins: |
|||
|
|||
```bash |
|||
pip install mkdocs mkdocs-material mkdocs-jupyter pymdown-extensions |
|||
``` |
|||
|
|||
2. Clone the repository: |
|||
|
|||
```bash |
|||
git clone https://github.com/zilliztech/deep-searcher.git |
|||
cd deep-searcher |
|||
``` |
|||
|
|||
## Development |
|||
|
|||
To serve the documentation locally: |
|||
|
|||
```bash |
|||
mkdocs serve |
|||
``` |
|||
|
|||
This will start a local server at http://127.0.0.1:8000/ where you can preview the documentation. |
|||
|
|||
## Building |
|||
|
|||
To build the static site: |
|||
|
|||
```bash |
|||
mkdocs build |
|||
``` |
|||
|
|||
This will generate the static site in the `site` directory. |
|||
|
|||
## Deployment |
|||
|
|||
The documentation is automatically deployed when changes are pushed to the main branch using GitHub Actions. |
Before Width: | Height: | Size: 307 KiB |
Before Width: | Height: | Size: 3.4 MiB |
Before Width: | Height: | Size: 53 KiB |
Before Width: | Height: | Size: 54 KiB |
@ -1,126 +0,0 @@ |
|||
# Embedding Model Configuration |
|||
|
|||
DeepSearcher supports various embedding models to convert text into vector representations for semantic search. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "(EmbeddingModelName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available Embedding Providers |
|||
|
|||
| Provider | Description | Key Features | |
|||
|----------|-------------|--------------| |
|||
| **OpenAIEmbedding** | OpenAI's text embedding models | High quality, production-ready | |
|||
| **MilvusEmbedding** | Built-in embedding models via Pymilvus | Multiple model options | |
|||
| **VoyageEmbedding** | VoyageAI embedding models | Specialized for search | |
|||
| **BedrockEmbedding** | Amazon Bedrock embedding | AWS integration | |
|||
| **GeminiEmbedding** | Google's Gemini embedding | High performance | |
|||
| **GLMEmbedding** | ChatGLM embeddings | Chinese language support | |
|||
| **OllamaEmbedding** | Local embedding with Ollama | Self-hosted option | |
|||
| **PPIOEmbedding** | PPIO cloud embedding | Scalable solution | |
|||
| **SiliconflowEmbedding** | Siliconflow's models | Enterprise support | |
|||
| **VolcengineEmbedding** | Volcengine embedding | High throughput | |
|||
| **NovitaEmbedding** | Novita AI embedding | Cost-effective | |
|||
| **SentenceTransformerEmbedding** | Sentence Transfomer Embedding | Self-hosted option | |
|||
| **IBM watsonx.ai** | Various options | IBM's Enterprise AI platform | |
|||
|
|||
## 🔍 Provider Examples |
|||
|
|||
### OpenAI Embedding |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"}) |
|||
``` |
|||
*Requires `OPENAI_API_KEY` environment variable* |
|||
|
|||
### Milvus Built-in Embedding |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "BAAI/bge-base-en-v1.5"}) |
|||
``` |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "jina-embeddings-v3"}) |
|||
``` |
|||
*For Jina's embedding model, requires `JINAAI_API_KEY` environment variable* |
|||
|
|||
### VoyageAI Embedding |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "VoyageEmbedding", {"model": "voyage-3"}) |
|||
``` |
|||
*Requires `VOYAGE_API_KEY` environment variable and `pip install voyageai`* |
|||
|
|||
## 📚 Additional Providers |
|||
|
|||
??? example "Amazon Bedrock" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "BedrockEmbedding", {"model": "amazon.titan-embed-text-v2:0"}) |
|||
``` |
|||
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`* |
|||
|
|||
??? example "Novita AI" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "NovitaEmbedding", {"model": "baai/bge-m3"}) |
|||
``` |
|||
*Requires `NOVITA_API_KEY` environment variable* |
|||
|
|||
??? example "Siliconflow" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "SiliconflowEmbedding", {"model": "BAAI/bge-m3"}) |
|||
``` |
|||
*Requires `SILICONFLOW_API_KEY` environment variable* |
|||
|
|||
??? example "Volcengine" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "VolcengineEmbedding", {"model": "doubao-embedding-text-240515"}) |
|||
``` |
|||
*Requires `VOLCENGINE_API_KEY` environment variable* |
|||
|
|||
??? example "GLM" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "GLMEmbedding", {"model": "embedding-3"}) |
|||
``` |
|||
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`* |
|||
|
|||
??? example "Google Gemini" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "GeminiEmbedding", {"model": "text-embedding-004"}) |
|||
``` |
|||
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`* |
|||
|
|||
??? example "Ollama" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "OllamaEmbedding", {"model": "bge-m3"}) |
|||
``` |
|||
*Requires local Ollama installation and `pip install ollama`* |
|||
|
|||
??? example "PPIO" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "PPIOEmbedding", {"model": "baai/bge-m3"}) |
|||
``` |
|||
*Requires `PPIO_API_KEY` environment variable* |
|||
|
|||
??? example "SentenceTransformer" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "SentenceTransformerEmbedding", {"model": "BAAI/bge-large-zh-v1.5"}) |
|||
``` |
|||
*Requires `pip install sentence-transformers`* |
|||
|
|||
??? example "IBM WatsonX" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "ibm/slate-125m-english-rtrvr-v2"}) |
|||
``` |
|||
*Requires `pip install ibm-watsonx-ai`* |
@ -1,70 +0,0 @@ |
|||
# File Loader Configuration |
|||
|
|||
DeepSearcher supports various file loaders to extract and process content from different file formats. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("file_loader", "(FileLoaderName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available File Loaders |
|||
|
|||
| Loader | Description | Supported Formats | |
|||
|--------|-------------|-------------------| |
|||
| **UnstructuredLoader** | General purpose document loader with broad format support | PDF, DOCX, PPT, HTML, etc. | |
|||
| **DoclingLoader** | Document processing library with extraction capabilities | See [documentation](https://docling-project.github.io/docling/usage/supported_formats/) | |
|||
|
|||
## 🔍 File Loader Options |
|||
|
|||
### Unstructured |
|||
|
|||
[Unstructured](https://unstructured.io/) is a powerful library for extracting content from various document formats. |
|||
|
|||
```python |
|||
config.set_provider_config("file_loader", "UnstructuredLoader", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
You can use Unstructured in two ways: |
|||
|
|||
1. **With API** (recommended for production) |
|||
- Set environment variables: |
|||
- `UNSTRUCTURED_API_KEY` |
|||
- `UNSTRUCTURED_API_URL` |
|||
|
|||
2. **Local Processing** |
|||
- Simply don't set the API environment variables |
|||
- Install required dependencies: |
|||
```bash |
|||
# Install core dependencies |
|||
pip install unstructured-ingest |
|||
|
|||
# For all document formats |
|||
pip install "unstructured[all-docs]" |
|||
|
|||
# For specific formats (e.g., PDF only) |
|||
pip install "unstructured[pdf]" |
|||
``` |
|||
|
|||
For more information: |
|||
- [Unstructured Documentation](https://docs.unstructured.io/ingestion/overview) |
|||
- [Installation Guide](https://docs.unstructured.io/open-source/installation/full-installation) |
|||
|
|||
### Docling |
|||
|
|||
[Docling](https://docling-project.github.io/docling/) provides document processing capabilities with support for multiple formats. |
|||
|
|||
```python |
|||
config.set_provider_config("file_loader", "DoclingLoader", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Install Docling: |
|||
```bash |
|||
pip install docling |
|||
``` |
|||
|
|||
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats). |
@ -1,33 +0,0 @@ |
|||
# Configuration Overview |
|||
|
|||
DeepSearcher provides flexible configuration options for all its components. You can customize the following aspects of the system: |
|||
|
|||
## 📋 Components |
|||
|
|||
| Component | Purpose | Documentation | |
|||
|-----------|---------|---------------| |
|||
| **LLM** | Large Language Models for query processing | [LLM Configuration](llm.md) | |
|||
| **Embedding Models** | Text embedding for vector retrieval | [Embedding Models](embedding.md) | |
|||
| **Vector Database** | Storage and retrieval of vector embeddings | [Vector Database](vector_db.md) | |
|||
| **File Loader** | Loading and processing various file formats | [File Loader](file_loader.md) | |
|||
| **Web Crawler** | Gathering information from web sources | [Web Crawler](web_crawler.md) | |
|||
|
|||
## 🔄 Configuration Method |
|||
|
|||
DeepSearcher uses a consistent configuration approach for all components: |
|||
|
|||
```python |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Create configuration |
|||
config = Configuration() |
|||
|
|||
# Set provider configurations |
|||
config.set_provider_config("[component]", "[provider]", {"option": "value"}) |
|||
|
|||
# Initialize with configuration |
|||
init_config(config=config) |
|||
``` |
|||
|
|||
For detailed configuration options for each component, please visit the corresponding documentation pages linked in the table above. |
|||
|
@ -1,192 +0,0 @@ |
|||
# LLM Configuration |
|||
|
|||
DeepSearcher supports various Large Language Models (LLMs) for processing queries and generating responses. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "(LLMName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available LLM Providers |
|||
|
|||
| Provider | Description | Key Models | |
|||
|----------|-------------|------------| |
|||
| **OpenAI** | OpenAI's API for GPT models | o1-mini, GPT-4 | |
|||
| **DeepSeek** | DeepSeek AI offering | deepseek-reasoner, coder | |
|||
| **Anthropic** | Anthropic's Claude models | claude-sonnet-4-0 | |
|||
| **Gemini** | Google's Gemini models | gemini-1.5-pro, gemini-2.0-flash | |
|||
| **XAI** | X.AI's Grok models | grok-2-latest | |
|||
| **Ollama** | Local LLM deployment | llama3, qwq, etc. | |
|||
| **SiliconFlow** | Enterprise AI platform | deepseek-r1 | |
|||
| **TogetherAI** | Multiple model options | llama-4, deepseek | |
|||
| **PPIO** | Cloud AI infrastructure | deepseek, llama | |
|||
| **Volcengine** | ByteDance LLM platform | deepseek-r1 | |
|||
| **GLM** | ChatGLM models | glm-4-plus | |
|||
| **Bedrock** | Amazon Bedrock LLMs | anthropic.claude, ai21.j2 | |
|||
| **Novita** | Novita AI models | Various options | |
|||
| **IBM watsonx.ai** | IBM Enterprise AI platform | Various options | |
|||
|
|||
## 🔍 Provider Examples |
|||
|
|||
### OpenAI |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"}) |
|||
``` |
|||
*Requires `OPENAI_API_KEY` environment variable* |
|||
|
|||
### DeepSeek |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"}) |
|||
``` |
|||
*Requires `DEEPSEEK_API_KEY` environment variable* |
|||
|
|||
### IBM WatsonX |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"}) |
|||
``` |
|||
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables* |
|||
|
|||
## 📚 Additional Providers |
|||
|
|||
??? example "DeepSeek from SiliconFlow" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "SiliconFlow", {"model": "deepseek-ai/DeepSeek-R1"}) |
|||
``` |
|||
*Requires `SILICONFLOW_API_KEY` environment variable* |
|||
|
|||
More details about SiliconFlow: [https://docs.siliconflow.cn/quickstart](https://docs.siliconflow.cn/quickstart) |
|||
|
|||
??? example "DeepSeek from TogetherAI" |
|||
|
|||
*Requires `TOGETHER_API_KEY` environment variable and `pip install together`* |
|||
|
|||
For DeepSeek R1: |
|||
```python |
|||
config.set_provider_config("llm", "TogetherAI", {"model": "deepseek-ai/DeepSeek-R1"}) |
|||
``` |
|||
|
|||
For Llama 4: |
|||
```python |
|||
config.set_provider_config("llm", "TogetherAI", {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"}) |
|||
``` |
|||
|
|||
More details about TogetherAI: [https://www.together.ai/](https://www.together.ai/) |
|||
|
|||
??? example "XAI Grok" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "XAI", {"model": "grok-2-latest"}) |
|||
``` |
|||
*Requires `XAI_API_KEY` environment variable* |
|||
|
|||
More details about XAI Grok: [https://docs.x.ai/docs/overview#featured-models](https://docs.x.ai/docs/overview#featured-models) |
|||
|
|||
??? example "Claude" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Anthropic", {"model": "claude-sonnet-4-0"}) |
|||
``` |
|||
*Requires `ANTHROPIC_API_KEY` environment variable* |
|||
|
|||
More details about Anthropic Claude: [https://docs.anthropic.com/en/home](https://docs.anthropic.com/en/home) |
|||
|
|||
??? example "Google Gemini" |
|||
|
|||
```python |
|||
config.set_provider_config('llm', 'Gemini', { 'model': 'gemini-2.0-flash' }) |
|||
``` |
|||
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`* |
|||
|
|||
More details about Gemini: [https://ai.google.dev/gemini-api/docs](https://ai.google.dev/gemini-api/docs) |
|||
|
|||
??? example "DeepSeek from PPIO" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "PPIO", {"model": "deepseek/deepseek-r1-turbo"}) |
|||
``` |
|||
*Requires `PPIO_API_KEY` environment variable* |
|||
|
|||
More details about PPIO: [https://ppinfra.com/docs/get-started/quickstart.html](https://ppinfra.com/docs/get-started/quickstart.html) |
|||
|
|||
??? example "Ollama" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Ollama", {"model": "qwq"}) |
|||
``` |
|||
|
|||
Follow [these instructions](https://github.com/jmorganca/ollama) to set up and run a local Ollama instance: |
|||
|
|||
1. [Download](https://ollama.ai/download) and install Ollama |
|||
2. View available models via the [model library](https://ollama.ai/library) |
|||
3. Pull models with `ollama pull <name-of-model>` |
|||
4. By default, Ollama has a REST API on [http://localhost:11434](http://localhost:11434) |
|||
|
|||
??? example "Volcengine" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Volcengine", {"model": "deepseek-r1-250120"}) |
|||
``` |
|||
*Requires `VOLCENGINE_API_KEY` environment variable* |
|||
|
|||
More details about Volcengine: [https://www.volcengine.com/docs/82379/1099455](https://www.volcengine.com/docs/82379/1099455) |
|||
|
|||
??? example "GLM" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "GLM", {"model": "glm-4-plus"}) |
|||
``` |
|||
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`* |
|||
|
|||
More details about GLM: [https://bigmodel.cn/dev/welcome](https://bigmodel.cn/dev/welcome) |
|||
|
|||
??? example "Amazon Bedrock" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Bedrock", {"model": "us.deepseek.r1-v1:0"}) |
|||
``` |
|||
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`* |
|||
|
|||
More details about Amazon Bedrock: [https://docs.aws.amazon.com/bedrock/](https://docs.aws.amazon.com/bedrock/) |
|||
|
|||
??? example "Aliyun Bailian" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "OpenAI", {"model": "deepseek-r1", "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"}) |
|||
``` |
|||
*Requires `OPENAI_API_KEY` environment variable* |
|||
|
|||
More details about Aliyun Bailian models: [https://bailian.console.aliyun.com](https://bailian.console.aliyun.com) |
|||
|
|||
??? example "IBM watsonx.ai LLM" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"}) |
|||
``` |
|||
|
|||
With custom parameters: |
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", { |
|||
"model": "ibm/granite-3-3-8b-instruct", |
|||
"max_new_tokens": 1000, |
|||
"temperature": 0.7, |
|||
"top_p": 0.9, |
|||
"top_k": 50 |
|||
}) |
|||
``` |
|||
|
|||
With space_id instead of project_id: |
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", { |
|||
    "model": "ibm/granite-3-3-8b-instruct",
    "space_id": "your_space_id"
|||
}) |
|||
``` |
|||
|
|||
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables and `pip install ibm-watsonx-ai`* |
|||
|
|||
More details about WatsonX: [https://www.ibm.com/products/watsonx-ai/foundation-models](https://www.ibm.com/products/watsonx-ai/foundation-models) |
|||
``` |
@ -1,52 +0,0 @@ |
|||
# Vector Database Configuration |
|||
|
|||
DeepSearcher uses vector databases to store and retrieve document embeddings for efficient semantic search. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "(VectorDBName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
Currently supported vector databases: |
|||
- Milvus (including Milvus Lite and Zilliz Cloud) |
|||
|
|||
## 🔍 Milvus Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""}) |
|||
``` |
|||
|
|||
### Deployment Options |
|||
|
|||
??? example "Local Storage with Milvus Lite" |
|||
|
|||
Setting the `uri` as a local file (e.g., `./milvus.db`) automatically utilizes [Milvus Lite](https://milvus.io/docs/milvus_lite.md) to store all data in this file. This is the most convenient method for development and smaller datasets. |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""}) |
|||
``` |
|||
|
|||
??? example "Standalone Milvus Server" |
|||
|
|||
For larger datasets, you can set up a more performant Milvus server using [Docker or Kubernetes](https://milvus.io/docs/quickstart.md). In this setup, use the server URI as your `uri` parameter: |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "token": ""}) |
|||
``` |
|||
|
|||
Also, you could specify other connection parameters supported by Milvus such as `user`, `password`, `secure` or others. |
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "user": "<username>", "password":"<password>", "secure": True, "token": ""}) |
|||
``` |
|||
|
|||
??? example "Zilliz Cloud (Managed Service)" |
|||
|
|||
[Zilliz Cloud](https://zilliz.com/cloud) provides a fully managed cloud service for Milvus. To use Zilliz Cloud, adjust the `uri` and `token` according to the [Public Endpoint and API Key](https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details): |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", { |
|||
"uri": "https://your-instance-id.api.gcp-us-west1.zillizcloud.com", |
|||
"token": "your_api_key" |
|||
}) |
|||
``` |
@ -1,97 +0,0 @@ |
|||
# Web Crawler Configuration |
|||
|
|||
DeepSearcher supports various web crawlers to collect data from websites for processing and indexing. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "(WebCrawlerName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available Web Crawlers |
|||
|
|||
| Crawler | Description | Key Feature | |
|||
|---------|-------------|-------------| |
|||
| **FireCrawlCrawler** | Cloud-based web crawling service | Simple API, managed service | |
|||
| **Crawl4AICrawler** | Browser automation crawler | Full JavaScript support | |
|||
| **JinaCrawler** | Content extraction service | High accuracy parsing | |
|||
| **DoclingCrawler** | Doc processing with crawling | Multiple format support | |
|||
|
|||
## 🔍 Web Crawler Options |
|||
|
|||
### FireCrawl |
|||
|
|||
[FireCrawl](https://docs.firecrawl.dev/introduction) is a cloud-based web crawling service designed for AI applications. |
|||
|
|||
**Key features:** |
|||
- Simple API |
|||
- Managed Service |
|||
- Advanced Parsing |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "FireCrawlCrawler", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Sign up for FireCrawl and get an API key |
|||
2. Set the API key as an environment variable: |
|||
```bash |
|||
export FIRECRAWL_API_KEY="your_api_key" |
|||
``` |
|||
3. For more information, see the [FireCrawl documentation](https://docs.firecrawl.dev/introduction) |
|||
|
|||
### Crawl4AI |
|||
|
|||
[Crawl4AI](https://docs.crawl4ai.com/) is a Python package for web crawling with browser automation capabilities. |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "Crawl4AICrawler", {"browser_config": {"headless": True, "verbose": True}}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Install Crawl4AI: |
|||
```bash |
|||
pip install crawl4ai |
|||
``` |
|||
2. Run the setup command: |
|||
```bash |
|||
crawl4ai-setup |
|||
``` |
|||
3. For more information, see the [Crawl4AI documentation](https://docs.crawl4ai.com/) |
|||
|
|||
### Jina Reader |
|||
|
|||
[Jina Reader](https://jina.ai/reader/) is a service for extracting content from web pages with high accuracy. |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "JinaCrawler", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Get a Jina API key |
|||
2. Set the API key as an environment variable: |
|||
```bash |
|||
export JINA_API_TOKEN="your_api_key" |
|||
# or |
|||
export JINAAI_API_KEY="your_api_key" |
|||
``` |
|||
3. For more information, see the [Jina Reader documentation](https://jina.ai/reader/) |
|||
|
|||
### Docling Crawler |
|||
|
|||
[Docling](https://docling-project.github.io/docling/) provides web crawling capabilities alongside its document processing features. |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "DoclingCrawler", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Install Docling: |
|||
```bash |
|||
pip install docling |
|||
``` |
|||
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats) |
@ -1,159 +0,0 @@ |
|||
# Contributing to DeepSearcher |
|||
|
|||
We welcome contributions from everyone. This document provides guidelines to make the contribution process straightforward. |
|||
|
|||
|
|||
## Pull Request Process |
|||
|
|||
1. Fork the repository and create your branch from `master`. |
|||
2. Make your changes. |
|||
3. Run tests and linting to ensure your code meets the project's standards. |
|||
4. Update documentation if necessary. |
|||
5. Submit a pull request. |
|||
|
|||
|
|||
## Linting and Formatting |
|||
|
|||
Keeping a consistent style for code, code comments, commit messages, and PR descriptions will greatly accelerate your PR review process. |
|||
We require you to run code linter and formatter before submitting your pull requests: |
|||
|
|||
To check the coding styles: |
|||
|
|||
```shell |
|||
make lint |
|||
``` |
|||
|
|||
To fix the coding styles: |
|||
|
|||
```shell |
|||
make format |
|||
``` |
|||
Our CI pipeline also runs these checks automatically on all pull requests to ensure code quality and consistency. |
|||
|
|||
|
|||
## Development Environment Setup with uv |
|||
|
|||
DeepSearcher uses [uv](https://github.com/astral-sh/uv) as the recommended package manager. uv is a fast, reliable Python package manager and installer. The project's `pyproject.toml` is configured to work with uv, which will provide faster dependency resolution and package installation compared to traditional tools. |
|||
|
|||
### Install Project in Development Mode (aka Editable Installation)
|||
|
|||
1. Install uv if you haven't already: |
|||
   Follow the [official installation instructions](https://docs.astral.sh/uv/getting-started/installation/).
|||
|
|||
2. Clone the repository and navigate to the project directory: |
|||
```shell |
|||
git clone https://github.com/zilliztech/deep-searcher.git && cd deep-searcher |
|||
``` |
|||
3. Synchronize and install dependencies: |
|||
```shell |
|||
uv sync |
|||
source .venv/bin/activate |
|||
``` |
|||
`uv sync` will install all dependencies specified in `uv.lock` file. And the `source .venv/bin/activate` command will activate the virtual environment. |
|||
|
|||
- (Optional) To install all optional dependencies: |
|||
```shell |
|||
uv sync --all-extras --dev |
|||
``` |
|||
|
|||
- (Optional) To install specific optional dependencies: |
|||
```shell |
|||
# Take optional `ollama` dependency for example |
|||
uv sync --extra ollama |
|||
``` |
|||
For more optional dependencies, refer to the `[project.optional-dependencies]` part of `pyproject.toml` file. |
|||
|
|||
|
|||
|
|||
### Adding Dependencies |
|||
|
|||
When you need to add new dependencies to the `pyproject.toml` file, you can use the following commands: |
|||
|
|||
```shell |
|||
uv add <package_name> |
|||
``` |
|||
DeepSearcher uses optional dependencies to keep the default installation lightweight. Optional features can be installed using the syntax `deepsearcher[<extra>]`. To add a dependency to an optional extra, use the following command: |
|||
|
|||
```shell |
|||
uv add <package_name> --optional <extra> |
|||
``` |
|||
For more details, refer to the [official Managing dependencies documentation](https://docs.astral.sh/uv/concepts/projects/dependencies/).
|||
|
|||
### Dependencies Locking |
|||
|
|||
For development, we use lockfiles to ensure consistent dependencies. You can use |
|||
```shell |
|||
uv lock --check |
|||
``` |
|||
to verify if your lockfile is up-to-date with your project dependencies. |
|||
|
|||
When you modify or add dependencies in the project, the lockfile will be automatically updated the next time you run a uv command. You can also explicitly update the lockfile using: |
|||
```shell |
|||
uv lock |
|||
``` |
|||
|
|||
While the environment is synced automatically, it may also be explicitly synced using uv sync: |
|||
```shell |
|||
uv sync |
|||
``` |
|||
Syncing the environment manually is especially useful for ensuring your editor has the correct versions of dependencies. |
|||
|
|||
|
|||
For more detailed information about dependency locking and syncing, refer to the [official Locking and syncing documentation](https://docs.astral.sh/uv/concepts/projects/sync/).
|||
|
|||
|
|||
## Running Tests |
|||
|
|||
Before submitting your pull request, make sure to run the test suite to ensure your changes haven't introduced any regressions. |
|||
|
|||
### Installing Test Dependencies |
|||
|
|||
First, ensure you have pytest installed. If you haven't installed the development dependencies yet, you can do so with: |
|||
|
|||
```shell |
|||
uv sync --all-extras --dev |
|||
``` |
|||
|
|||
This will install all development dependencies and optional dependencies including pytest and other testing tools. |
|||
|
|||
### Running the Tests |
|||
|
|||
To run all tests in the `tests` directory: |
|||
|
|||
```shell |
|||
uv run pytest tests |
|||
``` |
|||
|
|||
For more verbose output that shows individual test results: |
|||
|
|||
```shell |
|||
uv run pytest tests -v |
|||
``` |
|||
|
|||
You can also run tests for specific directories or files. For example: |
|||
|
|||
```shell |
|||
# Run tests in a specific directory |
|||
uv run pytest tests/embedding |
|||
|
|||
# Run tests in a specific file |
|||
uv run pytest tests/embedding/test_bedrock_embedding.py |
|||
|
|||
# Run a specific test class |
|||
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding |
|||
|
|||
# Run a specific test method |
|||
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding::test_init_default |
|||
``` |
|||
|
|||
The `-v` flag (verbose mode) provides more detailed output, showing each test case and its result individually. This is particularly useful when you want to see which specific tests are passing or failing. |
|||
|
|||
|
|||
## Developer Certificate of Origin (DCO) |
|||
|
|||
All contributions require a sign-off, acknowledging the [Developer Certificate of Origin](https://developercertificate.org/). |
|||
Add a `Signed-off-by` line to your commit message: |
|||
|
|||
```text |
|||
Signed-off-by: Your Name <your.email@example.com> |
|||
``` |
@ -1,65 +0,0 @@ |
|||
# Basic Example |
|||
|
|||
This example demonstrates the core functionality of DeepSearcher - loading documents and performing semantic search. |
|||
|
|||
## Overview |
|||
|
|||
The script performs these steps: |
|||
|
|||
1. Configures DeepSearcher with default settings |
|||
2. Loads a PDF document about Milvus |
|||
3. Asks a question about Milvus and vector databases |
|||
4. Displays token usage information |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
|
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
|
|||
config = Configuration() # Customize your config here |
|||
init_config(config=config) |
|||
|
|||
|
|||
# You should clone the milvus docs repo to your local machine first, execute: |
|||
# git clone https://github.com/milvus-io/milvus-docs.git |
|||
# Then replace the path below with the path to the milvus-docs repo on your local machine |
|||
# import glob |
|||
# all_md_files = glob.glob('xxx/milvus-docs/site/en/**/*.md', recursive=True) |
|||
# load_from_local_files(paths_or_directory=all_md_files, collection_name="milvus_docs", collection_description="All Milvus Documents") |
|||
|
|||
# Hint: You can also load a single file, please execute it in the root directory of the deep searcher project |
|||
load_from_local_files( |
|||
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"), |
|||
collection_name="milvus_docs", |
|||
collection_description="All Milvus Documents", |
|||
    # force_new_collection=True, # If you want to drop the original collection and create a new collection every time, set force_new_collection to True
|||
) |
|||
|
|||
question = "Write a report comparing Milvus with other vector databases." |
|||
|
|||
_, _, consumed_token = query(question, max_iter=1) |
|||
print(f"Consumed tokens: {consumed_token}") |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Make sure you have installed DeepSearcher: `pip install deepsearcher` |
|||
2. Create a data directory and add a PDF about Milvus (or use your own data) |
|||
3. Run the script: `python basic_example.py` |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Configuration**: Using the default configuration |
|||
- **Document Loading**: Loading a single PDF file |
|||
- **Querying**: Asking a complex question requiring synthesis of information |
|||
- **Token Tracking**: Monitoring token usage from the LLM |
@ -1,101 +0,0 @@ |
|||
# Docling Integration Example |
|||
|
|||
This example shows how to use Docling for loading local files and crawling web content. |
|||
|
|||
## Overview |
|||
|
|||
The script demonstrates: |
|||
|
|||
1. Configuring DeepSearcher to use Docling for both file loading and web crawling |
|||
2. Loading data from local files using Docling's document parser |
|||
3. Crawling web content from multiple sources including Markdown and PDF files |
|||
4. Querying the loaded data |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
from deepsearcher.offline_loading import load_from_local_files, load_from_website |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Suppress unnecessary logging from third-party libraries |
|||
logging.getLogger("httpx").setLevel(logging.WARNING) |
|||
|
|||
def main(): |
|||
# Step 1: Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Configure Vector Database and Docling providers |
|||
config.set_provider_config("vector_db", "Milvus", {}) |
|||
config.set_provider_config("file_loader", "DoclingLoader", {}) |
|||
config.set_provider_config("web_crawler", "DoclingCrawler", {}) |
|||
|
|||
# Apply the configuration |
|||
init_config(config) |
|||
|
|||
# Step 2a: Load data from a local file using DoclingLoader |
|||
local_file = "your_local_file_or_directory" |
|||
local_collection_name = "DoclingLocalFiles" |
|||
local_collection_description = "Milvus Documents loaded using DoclingLoader" |
|||
|
|||
print("\n=== Loading local files using DoclingLoader ===") |
|||
|
|||
try: |
|||
load_from_local_files( |
|||
paths_or_directory=local_file, |
|||
collection_name=local_collection_name, |
|||
collection_description=local_collection_description, |
|||
force_new_collection=True |
|||
) |
|||
print(f"Successfully loaded: {local_file}") |
|||
except ValueError as e: |
|||
print(f"Validation error: {str(e)}") |
|||
except Exception as e: |
|||
print(f"Error: {str(e)}") |
|||
|
|||
print("Successfully loaded all local files") |
|||
|
|||
# Step 2b: Crawl URLs using DoclingCrawler |
|||
urls = [ |
|||
# Markdown documentation files |
|||
"https://milvus.io/docs/quickstart.md", |
|||
"https://milvus.io/docs/overview.md", |
|||
# PDF example - can handle various URL formats |
|||
"https://arxiv.org/pdf/2408.09869", |
|||
] |
|||
web_collection_name = "DoclingWebCrawl" |
|||
web_collection_description = "Milvus Documentation crawled using DoclingCrawler" |
|||
|
|||
print("\n=== Crawling web pages using DoclingCrawler ===") |
|||
|
|||
load_from_website( |
|||
urls=urls, |
|||
collection_name=web_collection_name, |
|||
collection_description=web_collection_description, |
|||
force_new_collection=True |
|||
) |
|||
print("Successfully crawled all URLs") |
|||
|
|||
# Step 3: Query the loaded data |
|||
question = "What is Milvus?" |
|||
result = query(question) |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main() |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher and Docling: `pip install deepsearcher docling` |
|||
2. Replace `your_local_file_or_directory` with your actual file/directory path |
|||
3. Run the script: `python load_and_crawl_using_docling.py` |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Multiple Providers**: Configuring both file loader and web crawler to use Docling |
|||
- **Local Files**: Loading documents from your local filesystem |
|||
- **Web Crawling**: Retrieving content from multiple web URLs with different formats |
|||
- **Error Handling**: Graceful error handling for loading operations |
@ -1,82 +0,0 @@ |
|||
# FireCrawl Integration Example |
|||
|
|||
This example demonstrates how to use FireCrawl with DeepSearcher to crawl and extract content from websites. |
|||
|
|||
## Overview |
|||
|
|||
FireCrawl is a specialized web crawling service designed for AI applications. This example shows: |
|||
|
|||
1. Setting up FireCrawl with DeepSearcher |
|||
2. Configuring API keys for the service |
|||
3. Crawling a website and extracting content |
|||
4. Querying the extracted content |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
from deepsearcher.offline_loading import load_from_website |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Suppress unnecessary logging from third-party libraries |
|||
logging.getLogger("httpx").setLevel(logging.WARNING) |
|||
|
|||
# Set API keys (ensure these are set securely in real applications) |
|||
os.environ['OPENAI_API_KEY'] = 'sk-***************' |
|||
os.environ['FIRECRAWL_API_KEY'] = 'fc-***************' |
|||
|
|||
|
|||
def main(): |
|||
# Step 1: Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Set up Vector Database (Milvus) and Web Crawler (FireCrawlCrawler) |
|||
config.set_provider_config("vector_db", "Milvus", {}) |
|||
config.set_provider_config("web_crawler", "FireCrawlCrawler", {}) |
|||
|
|||
# Apply the configuration |
|||
init_config(config) |
|||
|
|||
# Step 2: Load data from a website into Milvus |
|||
website_url = "https://example.com" # Replace with your target website |
|||
collection_name = "FireCrawl" |
|||
collection_description = "All Milvus Documents" |
|||
|
|||
# crawl a single webpage |
|||
load_from_website(urls=website_url, collection_name=collection_name, collection_description=collection_description) |
|||
# only applicable if using Firecrawl: deepsearcher can crawl multiple webpages, by setting max_depth, limit, allow_backward_links |
|||
# load_from_website(urls=website_url, max_depth=2, limit=20, allow_backward_links=True, collection_name=collection_name, collection_description=collection_description) |
|||
|
|||
# Step 3: Query the loaded data |
|||
question = "What is Milvus?" # Replace with your actual question |
|||
result = query(question) |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main() |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher: `pip install deepsearcher` |
|||
2. Sign up for a FireCrawl API key at [firecrawl.dev](https://docs.firecrawl.dev/introduction) |
|||
3. Replace the placeholder API keys with your actual keys |
|||
4. Change the `website_url` to the website you want to crawl |
|||
5. Run the script: `python load_website_using_firecrawl.py` |
|||
|
|||
## Advanced Crawling Options |
|||
|
|||
FireCrawl provides several advanced options for crawling: |
|||
|
|||
- `max_depth`: Control how many links deep the crawler should go |
|||
- `limit`: Set a maximum number of pages to crawl |
|||
- `allow_backward_links`: Allow the crawler to navigate to parent/sibling pages |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Web Crawling**: Extracting content from websites |
|||
- **Depth Control**: Managing how deep the crawler navigates |
|||
- **URL Processing**: Handling multiple pages from a single starting point |
|||
- **Vector Storage**: Storing the crawled content in a vector database for search |
@ -1,15 +0,0 @@ |
|||
# Usage Examples |
|||
|
|||
DeepSearcher provides several example scripts to help you get started quickly. These examples demonstrate different ways to use DeepSearcher for various use cases. |
|||
|
|||
## 📋 Available Examples |
|||
|
|||
| Example | Description | Key Features | |
|||
|---------|-------------|--------------| |
|||
| [Basic Example](basic_example.md) | Simple example showing core functionality | Loading PDFs, querying | |
|||
| [Docling Integration](docling.md) | Using Docling for file loading and web crawling | Multiple sources, local and web | |
|||
| [Unstructured Integration](unstructured.md) | Using Unstructured for parsing documents | API and local processing | |
|||
| [FireCrawl Integration](firecrawl.md) | Web crawling with FireCrawl | Website data extraction | |
|||
| [Oracle Setup](oracle.md) | Advanced configuration with Oracle | Path setup, token tracking | |
|||
|
|||
Click on any example to see detailed code and explanations. |
@ -1,70 +0,0 @@ |
|||
# Oracle Example |
|||
|
|||
This example demonstrates an advanced setup using path manipulation and detailed token tracking. |
|||
|
|||
## Overview |
|||
|
|||
This example shows: |
|||
|
|||
1. Setting up Python path for importing from the parent directory |
|||
2. Initializing DeepSearcher with default configuration |
|||
3. Loading a PDF document and creating a vector database |
|||
4. Performing a complex query with full result and token tracking |
|||
5. Optional token consumption monitoring |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import sys, os |
|||
from pathlib import Path |
|||
script_directory = Path(__file__).resolve().parent.parent |
|||
sys.path.append(os.path.abspath(script_directory)) |
|||
|
|||
import logging |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
|
|||
# Customize your config here |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
config = Configuration() |
|||
init_config(config=config) |
|||
|
|||
# Load your local data |
|||
# Hint: You can load from a directory or a single file, please execute it in the root directory of the deep searcher project |
|||
|
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
|
|||
load_from_local_files( |
|||
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"), |
|||
collection_name="milvus_docs", |
|||
collection_description="All Milvus Documents", |
|||
# force_new_collection=True, # If you want to drop origin collection and create a new collection every time, set force_new_collection to True |
|||
) |
|||
|
|||
# Query |
|||
from deepsearcher.online_query import query |
|||
|
|||
question = 'Write a report comparing Milvus with other vector databases.' |
|||
answer, retrieved_results, consumed_token = query(question) |
|||
print(answer) |
|||
|
|||
# get consumed tokens: roughly 25,000–30,000 tokens when using the OpenAI gpt-4o model
|||
# print(f"Consumed tokens: {consumed_token}") |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher: `pip install deepsearcher` |
|||
2. Make sure you have the data directory with "WhatisMilvus.pdf" (or change the path) |
|||
3. Run the script: `python basic_example_oracle.py` |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Path Management**: Setting up Python path to import from parent directory |
|||
- **Query Unpacking**: Getting full result details (answer, retrieved context, and tokens) |
|||
- **Complex Querying**: Asking for a comparative analysis that requires synthesis |
|||
- **Token Economy**: Monitoring token usage for cost optimization |
@ -1,76 +0,0 @@ |
|||
# Unstructured Integration Example |
|||
|
|||
This example demonstrates how to use the Unstructured library with DeepSearcher for advanced document parsing. |
|||
|
|||
## Overview |
|||
|
|||
Unstructured is a powerful document processing library that can extract content from various document formats. This example shows: |
|||
|
|||
1. Setting up Unstructured with DeepSearcher |
|||
2. Configuring the Unstructured API keys (optional) |
|||
3. Loading documents with Unstructured's parser |
|||
4. Querying the extracted content |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Suppress unnecessary logging from third-party libraries |
|||
logging.getLogger("httpx").setLevel(logging.WARNING) |
|||
|
|||
# (Optional) Set API keys (ensure these are set securely in real applications) |
|||
os.environ['UNSTRUCTURED_API_KEY'] = '***************' |
|||
os.environ['UNSTRUCTURED_API_URL'] = '***************' |
|||
|
|||
|
|||
def main(): |
|||
# Step 1: Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Configure Vector Database (Milvus) and File Loader (UnstructuredLoader) |
|||
config.set_provider_config("vector_db", "Milvus", {}) |
|||
config.set_provider_config("file_loader", "UnstructuredLoader", {}) |
|||
|
|||
# Apply the configuration |
|||
init_config(config) |
|||
|
|||
# Step 2: Load data from a local file or directory into Milvus |
|||
input_file = "your_local_file_or_directory" # Replace with your actual file path |
|||
collection_name = "Unstructured" |
|||
collection_description = "All Milvus Documents" |
|||
|
|||
load_from_local_files(paths_or_directory=input_file, collection_name=collection_name, collection_description=collection_description) |
|||
|
|||
# Step 3: Query the loaded data |
|||
question = "What is Milvus?" # Replace with your actual question |
|||
result = query(question) |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main() |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher with Unstructured support: `pip install deepsearcher "unstructured[all-docs]"` |
|||
2. (Optional) Sign up for the Unstructured API at [unstructured.io](https://unstructured.io) if you want to use their cloud service |
|||
3. Replace `your_local_file_or_directory` with your own document file path or directory |
|||
4. Run the script: `python load_local_file_using_unstructured.py` |
|||
|
|||
## Unstructured Options |
|||
|
|||
You can use Unstructured in two modes: |
|||
|
|||
1. **API Mode**: Set the environment variables `UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_API_URL` to use their cloud service |
|||
2. **Local Mode**: Don't set the environment variables, and Unstructured will process documents locally on your machine |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Document Processing**: Advanced document parsing for various formats |
|||
- **API/Local Options**: Flexibility in deployment based on your needs |
|||
- **Integration**: Seamless integration with DeepSearcher's vector database and query capabilities |
@ -1,73 +0,0 @@ |
|||
# Frequently Asked Questions |
|||
|
|||
## 🔍 Common Issues and Solutions |
|||
|
|||
--- |
|||
|
|||
### 💬 Q1: Why am I failing to parse LLM output format / How to select the right LLM? |
|||
|
|||
<div class="faq-answer"> |
|||
<p><strong>Solution:</strong> Small language models often struggle to follow prompts and generate responses in the expected format. For better results, we recommend using large reasoning models such as:</p> |
|||
|
|||
<ul> |
|||
<li>DeepSeek-R1 671B</li> |
|||
<li>OpenAI o-series models</li> |
|||
<li>Claude 3.7 Sonnet</li> |
|||
</ul> |
|||
|
|||
<p>These models provide superior reasoning capabilities and are more likely to produce correctly formatted outputs.</p> |
|||
</div> |
|||
|
|||
--- |
|||
|
|||
### 🌐 Q2: "We couldn't connect to 'https://huggingface.co'" error |
|||
|
|||
<div class="faq-answer"> |
|||
<p><strong>Error Message:</strong></p> |
|||
<div class="error-message"> |
|||
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like GPTCache/paraphrase-albert-small-v2 is not the path to a directory containing a file named config.json. |
|||
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. |
|||
</div> |
|||
|
|||
<p><strong>Solution:</strong> This issue is typically caused by network access problems to Hugging Face. Try these solutions:</p> |
|||
|
|||
<details> |
|||
<summary><strong>Network Issue? Try Using a Mirror</strong></summary> |
|||
|
|||
```bash |
|||
export HF_ENDPOINT=https://hf-mirror.com |
|||
``` |
|||
</details> |
|||
|
|||
<details> |
|||
<summary><strong>Permission Issue? Set Up a Personal Token</strong></summary> |
|||
|
|||
```bash |
|||
export HUGGING_FACE_HUB_TOKEN=xxxx |
|||
``` |
|||
</details> |
|||
</div> |
|||
|
|||
--- |
|||
|
|||
### 📓 Q3: DeepSearcher doesn't run in Jupyter notebook |
|||
|
|||
<div class="faq-answer"> |
|||
<p><strong>Solution:</strong> This is a common issue with asyncio in Jupyter notebooks. Install <code>nest_asyncio</code> and add the following code to the top of your notebook:</p> |
|||
|
|||
<div class="code-steps"> |
|||
<p><strong>Step 1:</strong> Install the required package</p> |
|||
|
|||
```bash |
|||
pip install nest_asyncio |
|||
``` |
|||
|
|||
<p><strong>Step 2:</strong> Add these lines to the beginning of your notebook</p> |
|||
|
|||
```python |
|||
import nest_asyncio |
|||
nest_asyncio.apply() |
|||
``` |
|||
</div> |
|||
</div> |
|||
</div> |
@ -1,8 +0,0 @@ |
|||
# Future Plans |
|||
|
|||
- Enhance web crawling functionality |
|||
- Support more vector databases (e.g., FAISS...) |
|||
- Add support for additional large models |
|||
- Provide RESTful API interface (**DONE**) |
|||
|
|||
We welcome contributions! Star & Fork the project and help us build a more powerful DeepSearcher! 🎯 |
@ -1,45 +0,0 @@ |
|||
# 🔍 DeepSearcher |
|||
|
|||
 |
|||
|
|||
<div align="center"> |
|||
|
|||
<a href="https://opensource.org/licenses/Apache-2.0"> |
|||
<img height="28" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg?style=flat" alt="License"> |
|||
</a> |
|||
<a href="https://twitter.com/zilliz_universe"> |
|||
<img height="28" src="https://img.shields.io/badge/Follow-%40Zilliz-1DA1F2?style=flat&logo=twitter" alt="Twitter"> |
|||
</a> |
|||
<a href="https://discord.gg/mKc3R95yE5"> |
|||
<img height="28" src="https://img.shields.io/badge/Discord-Join%20Chat-5865F2?style=flat&logo=discord&logoColor=white" alt="Discord"> |
|||
</a> |
|||
|
|||
</div> |
|||
|
|||
|
|||
--- |
|||
|
|||
## ✨ Overview |
|||
|
|||
DeepSearcher combines cutting-edge LLMs (OpenAI o1, o3-mini, DeepSeek, Grok 3, Claude 4 Sonnet, Llama 4, QwQ, etc.) and Vector Databases (Milvus, Zilliz Cloud etc.) to perform search, evaluation, and reasoning based on private data, providing highly accurate answers and comprehensive reports. |
|||
|
|||
> **Perfect for:** Enterprise knowledge management, intelligent Q&A systems, and information retrieval scenarios. |
|||
|
|||
|
|||
 |
|||
|
|||
|
|||
## 🚀 Key Features |
|||
|
|||
| Feature | Description | |
|||
|---------|-------------| |
|||
| 🔒 **Private Data Search** | Maximizes utilization of enterprise internal data while ensuring data security. When necessary, integrates online content for more accurate answers. | |
|||
| 🗄️ **Vector Database Management** | Supports Milvus and other vector databases, allowing data partitioning for efficient retrieval. | |
|||
| 🧩 **Flexible Embedding Options** | Compatible with multiple embedding models for optimal selection based on your needs. | |
|||
| 🤖 **Multiple LLM Support** | Supports DeepSeek, OpenAI, and other large models for intelligent Q&A and content generation. | |
|||
| 📄 **Document Loader** | Supports local file loading, with web crawling capabilities under development. | |
|||
|
|||
## 🎬 Demo |
|||
|
|||
 |
|||
|
@ -1,64 +0,0 @@ |
|||
# 🛠️ Development Mode Installation |
|||
|
|||
This guide is for contributors who want to modify DeepSearcher's code or develop new features. |
|||
|
|||
## 📋 Prerequisites |
|||
|
|||
- Python 3.10 or higher |
|||
- git |
|||
- [uv](https://github.com/astral-sh/uv) package manager (recommended for faster installation) |
|||
|
|||
## 🔄 Installation Steps |
|||
|
|||
### Step 1: Install uv (Recommended) |
|||
|
|||
[uv](https://github.com/astral-sh/uv) is a faster alternative to pip for Python package management. |
|||
|
|||
=== "Using pip" |
|||
```bash |
|||
pip install uv |
|||
``` |
|||
|
|||
=== "Using curl (Unix/macOS)" |
|||
```bash |
|||
curl -LsSf https://astral.sh/uv/install.sh | sh |
|||
``` |
|||
|
|||
=== "Using PowerShell (Windows)" |
|||
```powershell |
|||
irm https://astral.sh/uv/install.ps1 | iex |
|||
``` |
|||
|
|||
For more options, see the [official uv installation guide](https://docs.astral.sh/uv/getting-started/installation/). |
|||
|
|||
### Step 2: Clone the repository |
|||
|
|||
```bash |
|||
git clone https://github.com/zilliztech/deep-searcher.git |
|||
cd deep-searcher |
|||
``` |
|||
|
|||
### Step 3: Set up the development environment |
|||
|
|||
=== "Using uv (Recommended)" |
|||
```bash |
|||
uv sync |
|||
source .venv/bin/activate |
|||
``` |
|||
|
|||
=== "Using pip" |
|||
```bash |
|||
python -m venv .venv |
|||
source .venv/bin/activate # On Windows: .venv\Scripts\activate |
|||
pip install -e ".[dev,all]" |
|||
``` |
|||
|
|||
## 🧪 Running Tests |
|||
|
|||
```bash |
|||
pytest tests/ |
|||
``` |
|||
|
|||
## 📚 Additional Resources |
|||
|
|||
For more detailed development setup instructions, including contribution guidelines, code style, and testing procedures, please refer to the [CONTRIBUTING.md](https://github.com/zilliztech/deep-searcher/blob/main/CONTRIBUTING.md) file in the repository. |
@ -1,29 +0,0 @@ |
|||
# 🔧 Installation |
|||
|
|||
DeepSearcher offers multiple installation methods to suit different user needs. |
|||
|
|||
## 📋 Installation Options |
|||
|
|||
| Method | Best For | Description | |
|||
|--------|----------|-------------| |
|||
| [📦 Installation via pip](pip.md) | Most users | Quick and easy installation using pip package manager | |
|||
| [🛠️ Development mode](development.md) | Contributors | Setup for those who want to modify the code or contribute | |
|||
|
|||
## 🚀 Quick Start |
|||
|
|||
Once installed, you can verify your installation: |
|||
|
|||
```python |
|||
from deepsearcher.configuration import Configuration |
|||
from deepsearcher.online_query import query |
|||
|
|||
# Initialize with default configuration |
|||
config = Configuration() |
|||
print("DeepSearcher installed successfully!") |
|||
``` |
|||
|
|||
## 💻 System Requirements |
|||
|
|||
- Python 3.10 or higher |
|||
- 4GB RAM minimum (8GB+ recommended) |
|||
- Internet connection for downloading models and dependencies |
@ -1,52 +0,0 @@ |
|||
# 📦 Installation via pip |
|||
|
|||
This method is recommended for most users who want to use DeepSearcher without modifying its source code. |
|||
|
|||
## 📋 Prerequisites |
|||
|
|||
- Python 3.10 or higher |
|||
- pip package manager (included with Python) |
|||
- Virtual environment tool (recommended) |
|||
|
|||
## 🔄 Step-by-Step Installation |
|||
|
|||
### Step 1: Create a virtual environment |
|||
|
|||
```bash |
|||
python -m venv .venv |
|||
``` |
|||
|
|||
### Step 2: Activate the virtual environment |
|||
|
|||
=== "Linux/macOS" |
|||
```bash |
|||
source .venv/bin/activate |
|||
``` |
|||
|
|||
=== "Windows" |
|||
```bash |
|||
.venv\Scripts\activate |
|||
``` |
|||
|
|||
### Step 3: Install DeepSearcher |
|||
|
|||
```bash |
|||
pip install deepsearcher |
|||
``` |
|||
|
|||
## 🧩 Optional Dependencies |
|||
|
|||
DeepSearcher supports various integrations through optional dependencies. |
|||
|
|||
| Integration | Command | Description | |
|||
|-------------|---------|-------------| |
|||
| Ollama | `pip install "deepsearcher[ollama]"` | For local LLM deployment | |
|||
| All extras | `pip install "deepsearcher[all]"` | Installs all optional dependencies | |
|||
|
|||
## ✅ Verify Installation |
|||
|
|||
```python |
|||
# Simple verification |
|||
from deepsearcher import __version__ |
|||
print(f"DeepSearcher version: {__version__}") |
|||
``` |
@ -1,75 +0,0 @@ |
|||
# Module Support |
|||
|
|||
DeepSearcher supports various integration modules including embedding models, large language models, document loaders and vector databases. |
|||
|
|||
## 📊 Overview |
|||
|
|||
| Module Type | Count | Description | |
|||
|-------------|-------|-------------| |
|||
| [Embedding Models](#embedding-models) | 7+ | Text vectorization tools | |
|||
| [Large Language Models](#llm-support) | 11+ | Query processing and text generation | |
|||
| [Document Loaders](#document-loader) | 5+ | Parse and process documents in various formats | |
|||
| [Vector Databases](#vector-database-support) | 2+ | Store and retrieve vector data | |
|||
|
|||
## 🔢 Embedding Models {#embedding-models} |
|||
|
|||
Support for various embedding models to convert text into vector representations for semantic search. |
|||
|
|||
| Provider | Required Environment Variables | Features | |
|||
|----------|--------------------------------|---------| |
|||
| **[Open-source models](https://milvus.io/docs/embeddings.md)** | None | Locally runnable open-source models | |
|||
| **[OpenAI](https://platform.openai.com/docs/guides/embeddings/use-cases)** | `OPENAI_API_KEY` | High-quality embeddings, easy to use | |
|||
| **[VoyageAI](https://docs.voyageai.com/embeddings/)** | `VOYAGE_API_KEY` | Embeddings optimized for retrieval | |
|||
| **[Amazon Bedrock](https://docs.aws.amazon.com/bedrock/)** | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` | AWS integration, enterprise-grade | |
|||
| **[FastEmbed](https://qdrant.github.io/fastembed/)** | None | Fast lightweight embeddings | |
|||
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Flexible cloud embeddings | |
|||
| **[Novita AI](https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings)** | `NOVITA_API_KEY` | Rich model selection | |
|||
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmembedding)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform | |
|||
|
|||
## 🧠 Large Language Models {#llm-support} |
|||
|
|||
Support for various large language models (LLMs) to process queries and generate responses. |
|||
|
|||
| Provider | Required Environment Variables | Features | |
|||
|----------|--------------------------------|---------| |
|||
| **[OpenAI](https://platform.openai.com/docs/models)** | `OPENAI_API_KEY` | GPT model family | |
|||
| **[DeepSeek](https://api-docs.deepseek.com/)** | `DEEPSEEK_API_KEY` | Powerful reasoning capabilities | |
|||
| **[XAI Grok](https://x.ai/blog/grok-3)** | `XAI_API_KEY` | Real-time knowledge and humor | |
|||
| **[Anthropic Claude](https://docs.anthropic.com/en/home)** | `ANTHROPIC_API_KEY` | Excellent long-context understanding | |
|||
| **[SiliconFlow](https://docs.siliconflow.cn/en/userguide/introduction)** | `SILICONFLOW_API_KEY` | Enterprise inference service | |
|||
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Diverse model support | |
|||
| **[TogetherAI](https://docs.together.ai/docs/introduction)** | `TOGETHER_API_KEY` | Wide range of open-source models | |
|||
| **[Google Gemini](https://ai.google.dev/gemini-api/docs)** | `GEMINI_API_KEY` | Google's multimodal models | |
|||
| **[SambaNova](https://docs.sambanova.ai/)** | `SAMBANOVA_API_KEY` | High-performance AI platform |
|||
| **[Ollama](https://ollama.com/)** | None | Local LLM deployment | |
|||
| **[Novita AI](https://novita.ai/docs/guides/introduction)** | `NOVITA_API_KEY` | Diverse AI services | |
|||
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmfm)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform | |
|||
|
|||
## 📄 Document Loader {#document-loader} |
|||
|
|||
Support for loading and processing documents from various sources. |
|||
|
|||
### Local File Loaders |
|||
|
|||
| Loader | Supported Formats | Required Environment Variables | |
|||
|--------|-------------------|--------------------------------| |
|||
| **Built-in Loader** | PDF, TXT, MD | None | |
|||
| **[Unstructured](https://unstructured.io/)** | Multiple document formats | `UNSTRUCTURED_API_KEY`, `UNSTRUCTURED_URL` (optional) | |
|||
|
|||
### Web Crawlers |
|||
|
|||
| Crawler | Description | Required Environment Variables/Setup | |
|||
|---------|-------------|--------------------------------------| |
|||
| **[FireCrawl](https://docs.firecrawl.dev/introduction)** | Crawler designed for AI applications | `FIRECRAWL_API_KEY` | |
|||
| **[Jina Reader](https://jina.ai/reader/)** | High-accuracy web content extraction | `JINA_API_TOKEN` | |
|||
| **[Crawl4AI](https://docs.crawl4ai.com/)** | Browser automation crawler | Run `crawl4ai-setup` for first-time use | |
|||
|
|||
## 💾 Vector Database Support {#vector-database-support} |
|||
|
|||
Support for various vector databases for efficient storage and retrieval of embeddings. |
|||
|
|||
| Database | Description | Features | |
|||
|----------|-------------|----------| |
|||
| **[Milvus](https://milvus.io/)** | Open-source vector database | High-performance, scalable | |
|||
| **[Zilliz Cloud](https://www.zilliz.com/)** | Managed Milvus service | Fully managed, maintenance-free | |
|||
| **[Qdrant](https://qdrant.tech/)** | Vector similarity search engine | Simple, efficient | |
@ -1,78 +0,0 @@ |
|||
/* Add your custom CSS here */ |
|||
|
|||
/* FAQ Styling */ |
|||
.faq-answer { |
|||
background-color: #f8f9fa; |
|||
border-left: 4px solid #5c6bc0; |
|||
padding: 15px 20px; |
|||
margin-bottom: 20px; |
|||
border-radius: 4px; |
|||
} |
|||
|
|||
.error-message { |
|||
background-color: #ffebee; |
|||
border-left: 4px solid #f44336; |
|||
padding: 10px 15px; |
|||
margin: 10px 0; |
|||
font-family: monospace; |
|||
white-space: pre-wrap; |
|||
font-size: 0.9em; |
|||
border-radius: 4px; |
|||
} |
|||
|
|||
.code-steps { |
|||
margin: 15px 0; |
|||
} |
|||
|
|||
.code-steps p { |
|||
margin-bottom: 5px; |
|||
} |
|||
|
|||
details { |
|||
margin-bottom: 10px; |
|||
padding: 10px; |
|||
background-color: #e3f2fd; |
|||
border-radius: 4px; |
|||
} |
|||
|
|||
summary { |
|||
cursor: pointer; |
|||
padding: 8px 0; |
|||
} |
|||
|
|||
details[open] summary { |
|||
margin-bottom: 10px; |
|||
} |
|||
|
|||
h3 { |
|||
margin-top: 30px; |
|||
margin-bottom: 15px; |
|||
} |
|||
|
|||
/* Add smooth transition for collapsible sections */ |
|||
details summary { |
|||
transition: margin 0.3s ease; |
|||
} |
|||
|
|||
/* Styling for code blocks within FAQ */ |
|||
.faq-answer pre { |
|||
border-radius: 4px; |
|||
margin: 10px 0; |
|||
} |
|||
|
|||
/* Add styling for list items */ |
|||
.faq-answer ul { |
|||
padding-left: 25px; |
|||
} |
|||
|
|||
.faq-answer ul li { |
|||
margin: 5px 0; |
|||
} |
|||
|
|||
/* Add horizontal rule styling */ |
|||
hr { |
|||
border: 0; |
|||
height: 1px; |
|||
background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.1), rgba(0, 0, 0, 0)); |
|||
margin: 25px 0; |
|||
} |
@ -1,63 +0,0 @@ |
|||
# 💻 Command Line Interface |
|||
|
|||
DeepSearcher provides a convenient command line interface for loading data and querying. |
|||
|
|||
## 📥 Loading Data |
|||
|
|||
Load data from files or URLs: |
|||
|
|||
```shell |
|||
deepsearcher load "your_local_path_or_url" |
|||
``` |
|||
|
|||
Load into a specific collection: |
|||
|
|||
```shell |
|||
deepsearcher load "your_local_path_or_url" --collection_name "your_collection_name" --collection_desc "your_collection_description" |
|||
``` |
|||
|
|||
### Examples |
|||
|
|||
#### Loading from local files: |
|||
|
|||
```shell |
|||
# Load a single file |
|||
deepsearcher load "/path/to/your/local/file.pdf" |
|||
|
|||
# Load multiple files at once |
|||
deepsearcher load "/path/to/your/local/file1.pdf" "/path/to/your/local/file2.md" |
|||
``` |
|||
|
|||
#### Loading from URL: |
|||
|
|||
> **Note:** Set `FIRECRAWL_API_KEY` in your environment variables. See [FireCrawl documentation](https://docs.firecrawl.dev/introduction) for more details. |
|||
|
|||
```shell |
|||
deepsearcher load "https://www.wikiwand.com/en/articles/DeepSeek" |
|||
``` |
|||
|
|||
## 🔍 Querying Data |
|||
|
|||
Query your loaded data: |
|||
|
|||
```shell |
|||
deepsearcher query "Write a report about xxx." |
|||
``` |
|||
|
|||
## ❓ Help Commands |
|||
|
|||
Get general help information: |
|||
|
|||
```shell |
|||
deepsearcher --help |
|||
``` |
|||
|
|||
Get help for specific subcommands: |
|||
|
|||
```shell |
|||
# Help for load command |
|||
deepsearcher load --help |
|||
|
|||
# Help for query command |
|||
deepsearcher query --help |
|||
``` |
@ -1,73 +0,0 @@ |
|||
# 🌐 Deployment |
|||
|
|||
This guide explains how to deploy DeepSearcher as a web service. |
|||
|
|||
## ⚙️ Configure Modules |
|||
|
|||
You can configure all arguments by modifying the configuration file: |
|||
|
|||
```yaml |
|||
# config.yaml - https://github.com/zilliztech/deep-searcher/blob/main/config.yaml |
|||
llm: |
|||
provider: "OpenAI" |
|||
api_key: "your_openai_api_key_here" |
|||
# Additional configuration options... |
|||
``` |
|||
|
|||
> **Important:** Set your `OPENAI_API_KEY` in the `llm` section of the YAML file. |
|||
|
|||
## 🚀 Start Service |
|||
|
|||
The main script will run a FastAPI service with default address `localhost:8000`: |
|||
|
|||
```shell |
|||
$ python main.py |
|||
``` |
|||
|
|||
Once started, you should see output indicating the service is running successfully. |
|||
|
|||
## 🔍 Access via Browser |
|||
|
|||
You can access the web service through your browser: |
|||
|
|||
1. Open your browser and navigate to [http://localhost:8000/docs](http://localhost:8000/docs) |
|||
2. The Swagger UI will display all available API endpoints |
|||
3. Click the "Try it out" button on any endpoint to interact with it |
|||
4. Fill in the required parameters and execute the request |
|||
|
|||
This interactive documentation makes it easy to test and use all DeepSearcher API functionality. |
|||
|
|||
## 🐳 Docker Deployment |
|||
|
|||
You can also deploy DeepSearcher using Docker for easier environment setup and management. |
|||
|
|||
### Build Docker Image |
|||
|
|||
To build the Docker image, run the following command from the project root directory: |
|||
|
|||
```shell |
|||
docker build -t deepsearcher:latest . |
|||
``` |
|||
|
|||
This command builds a Docker image using the Dockerfile in the current directory and tags it as `deepsearcher:latest`. |
|||
|
|||
### Run Docker Container |
|||
|
|||
Once the image is built, you can run it as a container: |
|||
|
|||
```shell |
|||
docker run -p 8000:8000 \ |
|||
-e OPENAI_API_KEY=your_openai_api_key \ |
|||
-v $(pwd)/data:/app/data \ |
|||
-v $(pwd)/logs:/app/logs \ |
|||
-v $(pwd)/deepsearcher/config.yaml:/app/deepsearcher/config.yaml \ |
|||
deepsearcher:latest |
|||
``` |
|||
|
|||
This command: |
|||
- Maps port 8000 from the container to port 8000 on your host |
|||
- Sets the `OPENAI_API_KEY` environment variable |
|||
- Mounts the local `data`, `logs`, and configuration file to the container |
|||
- Runs the previously built `deepsearcher:latest` image |
|||
|
|||
> **Note:** Replace `your_openai_api_key` with your actual OpenAI API key, or set any other environment variables required for your configuration. |
@ -1,13 +0,0 @@ |
|||
# 📚 Usage Guide |
|||
|
|||
DeepSearcher provides multiple ways to use the system, including Python API, command line interface, and web service deployment. |
|||
|
|||
## 🔍 Usage Overview |
|||
|
|||
| Guide | Description | |
|||
|-------|-------------| |
|||
| [🚀 Quick Start](quick_start.md) | Quick start guide for Python API integration | |
|||
| [💻 Command Line Interface](cli.md) | Instructions for using the command line interface | |
|||
| [🌐 Deployment](deployment.md) | Guide for deploying as a web service | |
|||
|
|||
Choose the method that best suits your needs and follow the instructions on the corresponding page. |
@ -1,42 +0,0 @@ |
|||
# 🚀 Quick Start |
|||
|
|||
## Prerequisites |
|||
|
|||
✅ Before you begin, prepare your `OPENAI_API_KEY` in your environment variables. If you change the LLM in the configuration, make sure to prepare the corresponding API key. |
|||
|
|||
## Basic Usage |
|||
|
|||
```python |
|||
# Import configuration modules |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
from deepsearcher.online_query import query |
|||
|
|||
# Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Customize your config here |
|||
# (See the Configuration Details section below for more options) |
|||
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"}) |
|||
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"}) |
|||
init_config(config=config) |
|||
|
|||
# Load data from local files |
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
load_from_local_files(paths_or_directory=your_local_path) |
|||
|
|||
# (Optional) Load data from websites |
|||
# Requires FIRECRAWL_API_KEY environment variable |
|||
from deepsearcher.offline_loading import load_from_website |
|||
load_from_website(urls=website_url) |
|||
|
|||
# Query your data |
|||
result = query("Write a report about xxx.") # Replace with your question |
|||
print(result) |
|||
``` |
|||
|
|||
## Next Steps |
|||
|
|||
After completing this quick start, you might want to explore: |
|||
|
|||
- [Command Line Interface](cli.md) for non-programmatic usage |
|||
- [Deployment](deployment.md) for setting up a web service |
Loading…
Reference in new issue