initial commit

Branch: main
Author: tanxing, 2 weeks ago
Commit: 9497784957
  1. .github/ISSUE_TEMPLATE/bug_report.md (32 lines)
  2. .github/ISSUE_TEMPLATE/feature_request.md (22 lines)
  3. .github/mergify.yml (34 lines)
  4. .github/workflows/cd-docs.yml (20 lines)
  5. .github/workflows/ci-docs.yml (24 lines)
  6. .github/workflows/docs.yml (27 lines)
  7. .github/workflows/release.yml (37 lines)
  8. .github/workflows/ruff.yml (25 lines)
  9. .gitignore (199 lines)
  10. .python-version (1 line)
  11. .vscode/settings.json (11 lines)
  12. Dockerfile (19 lines)
  13. LICENSE (201 lines)
  14. Makefile (7 lines)
  15. README.md (590 lines)
  16. assets/pic/deep-searcher-arch.png (binary)
  17. assets/pic/demo.gif (binary)
  18. assets/pic/logo.png (binary)
  19. deepsearcher/__init__.py (5 lines)
  20. deepsearcher/agent/__init__.py (12 lines)
  21. deepsearcher/agent/base.py (103 lines)
  22. deepsearcher/agent/chain_of_rag.py (326 lines)
  23. deepsearcher/agent/collection_router.py (98 lines)
  24. deepsearcher/agent/deep_search.py (319 lines)
  25. deepsearcher/agent/naive_rag.py (128 lines)
  26. deepsearcher/agent/rag_router.py (93 lines)
  27. deepsearcher/cli.py (118 lines)
  28. deepsearcher/config.yaml (87 lines)
  29. deepsearcher/configuration.py (240 lines)
  30. deepsearcher/embedding/__init__.py (5 lines)
  31. deepsearcher/embedding/base.py (76 lines)
  32. deepsearcher/embedding/openai_embedding.py (103 lines)
  33. deepsearcher/llm/__init__.py (5 lines)
  34. deepsearcher/llm/base.py (120 lines)
  35. deepsearcher/llm/openai_llm.py (61 lines)
  36. deepsearcher/loader/__init__.py (0 lines)
  37. deepsearcher/loader/file_loader/__init__.py (7 lines)
  38. deepsearcher/loader/file_loader/base.py (70 lines)
  39. deepsearcher/loader/file_loader/docling_loader.py (117 lines)
  40. deepsearcher/loader/file_loader/json_loader.py (94 lines)
  41. deepsearcher/loader/file_loader/pdf_loader.py (54 lines)
  42. deepsearcher/loader/file_loader/text_loader.py (43 lines)
  43. deepsearcher/loader/file_loader/unstructured_loader.py (201 lines)
  44. deepsearcher/loader/splitter.py (105 lines)
  45. deepsearcher/loader/web_crawler/__init__.py (11 lines)
  46. deepsearcher/loader/web_crawler/base.py (55 lines)
  47. deepsearcher/loader/web_crawler/crawl4ai_crawler.py (140 lines)
  48. deepsearcher/loader/web_crawler/docling_crawler.py (98 lines)
  49. deepsearcher/loader/web_crawler/firecrawl_crawler.py (88 lines)
  50. deepsearcher/loader/web_crawler/jina_crawler.py (62 lines)
  51. deepsearcher/offline_loading.py (119 lines)
  52. deepsearcher/online_query.py (96 lines)
  53. deepsearcher/utils/__init__.py (0 lines)
  54. deepsearcher/utils/log.py (160 lines)
  55. deepsearcher/vector_db/__init__.py (6 lines)
  56. deepsearcher/vector_db/azure_search.py (279 lines)
  57. deepsearcher/vector_db/base.py (207 lines)
  58. deepsearcher/vector_db/milvus.py (305 lines)
  59. deepsearcher/vector_db/oracle.py (536 lines)
  60. deepsearcher/vector_db/qdrant.py (290 lines)
  61. docs/README.md (42 lines)
  62. docs/assets/pic/deep-searcher-arch.png (binary)
  63. docs/assets/pic/demo.gif (binary)
  64. docs/assets/pic/logo-badge.png (binary)
  65. docs/assets/pic/logo.png (binary)
  66. docs/configuration/embedding.md (126 lines)
  67. docs/configuration/file_loader.md (70 lines)
  68. docs/configuration/index.md (33 lines)
  69. docs/configuration/llm.md (192 lines)
  70. docs/configuration/vector_db.md (52 lines)
  71. docs/configuration/web_crawler.md (97 lines)
  72. docs/contributing/index.md (159 lines)
  73. docs/examples/basic_example.md (65 lines)
  74. docs/examples/docling.md (101 lines)
  75. docs/examples/firecrawl.md (82 lines)
  76. docs/examples/index.md (15 lines)
  77. docs/examples/oracle.md (70 lines)
  78. docs/examples/unstructured.md (76 lines)
  79. docs/faq/index.md (73 lines)
  80. docs/future_plans.md (8 lines)
  81. docs/index.md (45 lines)
  82. docs/installation/development.md (64 lines)
  83. docs/installation/index.md (29 lines)
  84. docs/installation/pip.md (52 lines)
  85. docs/integrations/index.md (75 lines)
  86. docs/overrides/.gitkeep (0 lines)
  87. docs/stylesheets/extra.css (78 lines)
  88. docs/usage/cli.md (63 lines)
  89. docs/usage/deployment.md (73 lines)
  90. docs/usage/index.md (13 lines)
  91. docs/usage/quick_start.md (42 lines)
  92. evaluation/README.md (53 lines)
  93. evaluation/eval_config.yaml (119 lines)
  94. evaluation/evaluate.py (329 lines)
  95. evaluation/plot_results/max_iter_vs_avg_token_usage.png (binary)
  96. evaluation/plot_results/max_iter_vs_error_num.png (binary)
  97. evaluation/plot_results/max_iter_vs_recall.png (binary)
  98. examples/basic_example.py (35 lines)
  99. examples/basic_example_azuresearch.py (68 lines)
  100. examples/basic_example_oracle.py (40 lines)

32
.github/ISSUE_TEMPLATE/bug_report.md

@@ -0,0 +1,32 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
Please describe your issue **in English**.
*Note: Small LLMs cannot follow prompts well and are prone to hallucinations. Please make sure your LLM is cutting-edge, preferably a reasoning model, e.g. the OpenAI o-series, DeepSeek R1, Claude 3.7 Sonnet, etc.*
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Environment (please complete the following information):**
- OS: [e.g. MacOS]
- pip dependencies
- Version [e.g. 0.0.1]
**Additional context**
Add any other context about the problem here.

22
.github/ISSUE_TEMPLATE/feature_request.md

@@ -0,0 +1,22 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
Please describe your suggestion **in English**.
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

34
.github/mergify.yml

@@ -0,0 +1,34 @@
misc:
- branch: &BRANCHES
# In this pull request, the changes are based on the main branch
- &MASTER_BRANCH base=main
- name: Label bug fix PRs
conditions:
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES
- or: *BRANCHES
- 'title~=^fix:'
actions:
label:
add:
- kind/bug
- name: Label feature PRs
conditions:
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES
- or: *BRANCHES
- 'title~=^feat:'
actions:
label:
add:
- kind/feature
- name: Label enhancement PRs
conditions:
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES
- or: *BRANCHES
- 'title~=^enhance:'
actions:
label:
add:
- kind/enhancement

20
.github/workflows/cd-docs.yml

@@ -0,0 +1,20 @@
name: "Run Docs CD with UV"
on:
push:
branches:
- "main"
- "master"
paths:
- 'docs/**'
- 'mkdocs.yml'
- '.github/workflows/docs.yml'
jobs:
build-deploy-docs:
if: github.repository == 'zilliztech/deep-searcher'
uses: ./.github/workflows/docs.yml
with:
deploy: true
permissions:
contents: write

24
.github/workflows/ci-docs.yml

@@ -0,0 +1,24 @@
name: "Run Docs CI with UV"
on:
pull_request:
types: [opened, reopened, synchronize]
paths:
- 'docs/**'
- 'mkdocs.yml'
- '.github/workflows/docs.yml'
push:
branches:
- "**"
- "!gh-pages"
paths:
- 'docs/**'
- 'mkdocs.yml'
- '.github/workflows/docs.yml'
jobs:
build-docs:
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'zilliztech/deep-searcher') }}
uses: ./.github/workflows/docs.yml
with:
deploy: false

27
.github/workflows/docs.yml

@@ -0,0 +1,27 @@
on:
  workflow_call:
    inputs:
      deploy:
        type: boolean
        description: "If true, the docs will be deployed."
        default: false
jobs:
  run-docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install dependencies
        run: |
          uv sync --all-extras --dev
          source .venv/bin/activate
      - name: Build docs
        run: uv run mkdocs build --verbose --clean
      - name: Build and push docs
        if: inputs.deploy
        run: uv run mkdocs gh-deploy --force

37
.github/workflows/release.yml

@@ -0,0 +1,37 @@
# git tag v0.x.x  # Must be same as the version in pyproject.toml
# git push --tags
name: Publish Python Package to PyPI
on:
  push:
    tags:
      - "v*"
jobs:
  publish:
    name: Publish to PyPI
    runs-on: ubuntu-latest
    environment: pypi
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Install build tools
        run: python -m pip install build
      - name: Build package
        run: python -m build
      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

25
.github/workflows/ruff.yml

@@ -0,0 +1,25 @@
name: Ruff
on:
  push:
    branches: [ main, master ]
  pull_request:
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install the project
        run: |
          uv sync --all-extras --dev
          source .venv/bin/activate
      - name: Run Ruff
        run: |
          uv run ruff format --diff
          uv run ruff check
      # - name: Run tests
      #   run: uv run pytest tests

199
.gitignore

@@ -0,0 +1,199 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
.DS_Store
*.db

1
.python-version

@@ -0,0 +1 @@
3.10

11
.vscode/settings.json

@@ -0,0 +1,11 @@
{
    "python.testing.unittestArgs": [
        "-v",
        "-s",
        "./tests",
        "-p",
        "test_*.py"
    ],
    "python.testing.pytestEnabled": false,
    "python.testing.unittestEnabled": true
}
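
The settings above enable unittest discovery over `./tests` with the `test_*.py` file pattern. For illustration, a minimal test module that this configuration would pick up might look like the following; the file name and its trivial test are hypothetical and not part of this commit:

```python
# tests/test_smoke.py -- hypothetical example matching the "test_*.py" discovery pattern
import unittest


class TestSmoke(unittest.TestCase):
    def test_import(self):
        # The package should import cleanly once installed.
        import deepsearcher  # noqa: F401

        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
```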

19
Dockerfile

@@ -0,0 +1,19 @@
FROM ghcr.io/astral-sh/uv:python3.10-bookworm-slim
WORKDIR /app
RUN mkdir -p /tmp/uv-cache /app/data /app/logs
COPY pyproject.toml uv.lock LICENSE README.md ./
COPY deepsearcher/ ./deepsearcher/
RUN uv sync
COPY . .
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/docs || exit 1
CMD ["uv", "run", "python", "main.py", "--enable-cors", "true"]

201
LICENSE

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Zilliz
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

7
Makefile

@@ -0,0 +1,7 @@
lint:
	uv run ruff format --diff
	uv run ruff check

format:
	uv run ruff format
	uv run ruff check --fix

590
README.md

@@ -0,0 +1,590 @@
![DeepSearcher](./assets/pic/logo.png)
<div align="center">
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![DeepWiki](https://img.shields.io/badge/DeepWiki-AI%20Docs-orange.svg)](https://deepwiki.com/zilliztech/deep-searcher)
[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/zilliz_universe.svg?style=social&label=Follow%20%40Zilliz)](https://twitter.com/zilliz_universe)
<a href="https://discord.gg/mKc3R95yE5"><img height="20" src="https://img.shields.io/badge/Discord-%235865F2.svg?style=for-the-badge&logo=discord&logoColor=white" alt="discord"/></a>
</div>
---
DeepSearcher combines cutting-edge LLMs (OpenAI o3, Qwen3, DeepSeek, Grok 4, Claude 4 Sonnet, Llama 4, QwQ, etc.) and vector databases (Milvus, Zilliz Cloud, etc.) to perform search, evaluation, and reasoning based on private data, providing highly accurate answers and comprehensive reports. This project is suitable for enterprise knowledge management, intelligent Q&A systems, and information retrieval scenarios.
![Architecture](./assets/pic/deep-searcher-arch.png)
## 🚀 Features
- **Private Data Search**: Maximizes the utilization of enterprise internal data while ensuring data security. When necessary, it can integrate online content for more accurate answers.
- **Vector Database Management**: Supports Milvus and other vector databases, allowing data partitioning for efficient retrieval.
- **Flexible Embedding Options**: Compatible with multiple embedding models for optimal selection.
- **Multiple LLM Support**: Supports DeepSeek, OpenAI, and other large models for intelligent Q&A and content generation.
- **Document Loader**: Supports local file loading, with web crawling capabilities under development.
---
## 🎉 Demo
![demo](./assets/pic/demo.gif)
## 📖 Quick Start
### Installation
Install DeepSearcher using one of the following methods:
#### Option 1: Using pip
Create and activate a virtual environment (Python 3.10 is recommended).
```bash
python -m venv .venv
source .venv/bin/activate
```
Install DeepSearcher
```bash
pip install deepsearcher
```
For optional dependencies, e.g., ollama:
```bash
pip install "deepsearcher[ollama]"
```
#### Option 2: Install in Development Mode
We recommend using [uv](https://github.com/astral-sh/uv) for faster and more reliable installation. Follow the [official installation instructions](https://docs.astral.sh/uv/getting-started/installation/) to install it.
Clone the repository and navigate to the project directory:
```shell
git clone https://github.com/zilliztech/deep-searcher.git && cd deep-searcher
```
Synchronize and install dependencies:
```shell
uv sync
source .venv/bin/activate
```
For more detailed development setup and optional dependency installation options, see [CONTRIBUTING.md](CONTRIBUTING.md#development-environment-setup-with-uv).
### Quick start demo
To run this quick start demo, please prepare your `OPENAI_API_KEY` in your environment variables. If you change the LLM in the configuration, make sure to prepare the corresponding API key.
```python
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query
config = Configuration()
# Customize your config here,
# for more configuration options, see the Configuration Details section below.
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"})
init_config(config = config)
# Load your local data
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(paths_or_directory=your_local_path)
# (Optional) Load from web crawling (`FIRECRAWL_API_KEY` env variable required)
from deepsearcher.offline_loading import load_from_website
load_from_website(urls=website_url)
# Query
result = query("Write a report about xxx.") # Your question here
```
### Configuration Details:
#### LLM Configuration
<pre><code>config.set_provider_config("llm", "(LLMName)", "(Arguments dict)")</code></pre>
<p>The "LLMName" can be one of the following: ["DeepSeek", "OpenAI", "XAI", "SiliconFlow", "Aliyun", "PPIO", "TogetherAI", "Gemini", "Ollama", "Novita"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the LLM class.</p>
<details>
<summary>Example (OpenAI)</summary>
<p> Make sure you have prepared your OPENAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})</code></pre>
<p> More details about OpenAI models: https://platform.openai.com/docs/models </p>
</details>
<details>
<summary>Example (Qwen3 from Aliyun Bailian)</summary>
<p> Make sure you have prepared your Bailian API KEY as an env variable <code>DASHSCOPE_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "Aliyun", {"model": "qwen-plus-latest"})</code></pre>
<p> More details about Aliyun Bailian models: https://bailian.console.aliyun.com </p>
</details>
<details>
<summary>Example (Qwen3 from OpenRouter)</summary>
<pre><code>config.set_provider_config("llm", "OpenAI", {"model": "qwen/qwen3-235b-a22b:free", "base_url": "https://openrouter.ai/api/v1", "api_key": "OPENROUTER_API_KEY"})</code></pre>
<p> More details about OpenRouter models: https://openrouter.ai/qwen/qwen3-235b-a22b:free </p>
</details>
<details>
<summary>Example (DeepSeek from official)</summary>
<p> Make sure you have prepared your DEEPSEEK API KEY as an env variable <code>DEEPSEEK_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})</code></pre>
<p> More details about DeepSeek: https://api-docs.deepseek.com/ </p>
</details>
<details>
<summary>Example (DeepSeek from SiliconFlow)</summary>
<p> Make sure you have prepared your SILICONFLOW API KEY as an env variable <code>SILICONFLOW_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "SiliconFlow", {"model": "deepseek-ai/DeepSeek-R1"})</code></pre>
<p> More details about SiliconFlow: https://docs.siliconflow.cn/quickstart </p>
</details>
<details>
<summary>Example (DeepSeek from TogetherAI)</summary>
<p> Make sure you have prepared your TOGETHER API KEY as an env variable <code>TOGETHER_API_KEY</code>.</p>
For deepseek R1:
<pre><code>config.set_provider_config("llm", "TogetherAI", {"model": "deepseek-ai/DeepSeek-R1"})</code></pre>
For Llama 4:
<pre><code>config.set_provider_config("llm", "TogetherAI", {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"})</code></pre>
<p> You need to install together before running, execute: <code>pip install together</code>. More details about TogetherAI: https://www.together.ai/ </p>
</details>
<details>
<summary>Example (XAI Grok)</summary>
<p> Make sure you have prepared your XAI API KEY as an env variable <code>XAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "XAI", {"model": "grok-4-0709"})</code></pre>
<p> More details about XAI Grok: https://docs.x.ai/docs/overview#featured-models </p>
</details>
<details>
<summary>Example (Claude)</summary>
<p> Make sure you have prepared your ANTHROPIC API KEY as an env variable <code>ANTHROPIC_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "Anthropic", {"model": "claude-sonnet-4-0"})</code></pre>
<p> More details about Anthropic Claude: https://docs.anthropic.com/en/home </p>
</details>
<details>
<summary>Example (Google Gemini)</summary>
<p> Make sure you have prepared your GEMINI API KEY as an env variable <code>GEMINI_API_KEY</code>.</p>
<pre><code>config.set_provider_config('llm', 'Gemini', { 'model': 'gemini-2.0-flash' })</code></pre>
<p> You need to install gemini before running, execute: <code>pip install google-genai</code>. More details about Gemini: https://ai.google.dev/gemini-api/docs </p>
</details>
<details>
<summary>Example (DeepSeek from PPIO)</summary>
<p> Make sure you have prepared your PPIO API KEY as an env variable <code>PPIO_API_KEY</code>. You can create an API Key <a href="https://ppinfra.com/settings/key-management?utm_source=github_deep-searcher">here</a>. </p>
<pre><code>config.set_provider_config("llm", "PPIO", {"model": "deepseek/deepseek-r1-turbo"})</code></pre>
<p> More details about PPIO: https://ppinfra.com/docs/get-started/quickstart.html?utm_source=github_deep-searcher </p>
</details>
<details>
<summary>Example (Ollama)</summary>
<p> Follow <a href="https://github.com/jmorganca/ollama">these instructions</a> to set up and run a local Ollama instance:</p>
<p> <a href="https://ollama.ai/download">Download</a> and install Ollama onto the available supported platforms (including Windows Subsystem for Linux).</p>
<p> View a list of available models via the <a href="https://ollama.ai/library">model library</a>.</p>
<p> Fetch available LLM models via <code>ollama pull &lt;name-of-model&gt;</code></p>
<p> Example: <code>ollama pull qwen3</code></p>
<p> To chat directly with a model from the command line, use <code>ollama run &lt;name-of-model&gt;</code>.</p>
<p> By default, Ollama has a REST API for running and managing models on <a href="http://localhost:11434">http://localhost:11434</a>.</p>
<pre><code>config.set_provider_config("llm", "Ollama", {"model": "qwen3"})</code></pre>
</details>
<details>
<summary>Example (Volcengine)</summary>
<p> Make sure you have prepared your Volcengine API KEY as an env variable <code>VOLCENGINE_API_KEY</code>. You can create an API Key <a href="https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey">here</a>. </p>
<pre><code>config.set_provider_config("llm", "Volcengine", {"model": "deepseek-r1-250120"})</code></pre>
<p> More details about Volcengine: https://www.volcengine.com/docs/82379/1099455?utm_source=github_deep-searcher </p>
</details>
<details>
<summary>Example (GLM)</summary>
<p> Make sure you have prepared your GLM API KEY as an env variable <code>GLM_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "GLM", {"model": "glm-4-plus"})</code></pre>
<p> You need to install zhipuai before running, execute: <code>pip install zhipuai</code>. More details about GLM: https://bigmodel.cn/dev/welcome </p>
</details>
<details>
<summary>Example (Amazon Bedrock)</summary>
<p> Make sure you have prepared your Amazon Bedrock API KEY as an env variable <code>AWS_ACCESS_KEY_ID</code> and <code>AWS_SECRET_ACCESS_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "Bedrock", {"model": "us.deepseek.r1-v1:0"})</code></pre>
<p> You need to install boto3 before running, execute: <code>pip install boto3</code>. More details about Amazon Bedrock: https://docs.aws.amazon.com/bedrock/ </p>
</details>
<details>
<summary>Example (IBM watsonx.ai)</summary>
<p> Make sure you have prepared your watsonx.ai credentials as env variables <code>WATSONX_APIKEY</code>, <code>WATSONX_URL</code>, and <code>WATSONX_PROJECT_ID</code>.</p>
<pre><code>config.set_provider_config("llm", "watsonx", {"model": "us.deepseek.r1-v1:0"})</code></pre>
<p> You need to install ibm-watsonx-ai before running, execute: <code>pip install ibm-watsonx-ai</code>. More details about IBM watsonx.ai: https://www.ibm.com/products/watsonx-ai/foundation-models </p>
</details>
#### Embedding Model Configuration
<pre><code>config.set_provider_config("embedding", "(EmbeddingModelName)", "(Arguments dict)")</code></pre>
<p>The "EmbeddingModelName" can be one of the following: ["MilvusEmbedding", "OpenAIEmbedding", "VoyageEmbedding", "SiliconflowEmbedding", "PPIOEmbedding", "NovitaEmbedding"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the embedding model class.</p>
<details>
<summary>Example (OpenAI embedding)</summary>
<p> Make sure you have prepared your OpenAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})</code></pre>
<p> More details about OpenAI models: https://platform.openai.com/docs/guides/embeddings/use-cases </p>
</details>
<details>
<summary>Example (OpenAI embedding Azure)</summary>
<p> Make sure you have prepared your OpenAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "OpenAIEmbedding", {
"model": "text-embedding-ada-002",
"azure_endpoint": "https://<youraifoundry>.openai.azure.com/",
"api_version": "2023-05-15"
})</code></pre>
</details>
<details>
<summary>Example (Pymilvus built-in embedding model)</summary>
<p> To use the built-in embedding models in Pymilvus, you can set the model name to <code>"default"</code>, <code>"BAAI/bge-base-en-v1.5"</code>, <code>"BAAI/bge-large-en-v1.5"</code>, <code>"jina-embeddings-v3"</code>, etc. <br/>
See [milvus_embedding.py](deepsearcher/embedding/milvus_embedding.py) for more details. </p>
<pre><code>config.set_provider_config("embedding", "MilvusEmbedding", {"model": "BAAI/bge-base-en-v1.5"})</code></pre>
<pre><code>config.set_provider_config("embedding", "MilvusEmbedding", {"model": "jina-embeddings-v3"})</code></pre>
<p> For Jina's embedding model, you need <code>JINAAI_API_KEY</code>.</p>
<p> You need to install pymilvus model before running, execute: <code>pip install pymilvus.model</code>. More details about Pymilvus: https://milvus.io/docs/embeddings.md </p>
</details>
<details>
<summary>Example (VoyageAI embedding)</summary>
<p> Make sure you have prepared your VOYAGE API KEY as an env variable <code>VOYAGE_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "VoyageEmbedding", {"model": "voyage-3"})</code></pre>
<p> You need to install voyageai before running, execute: <code>pip install voyageai</code>. More details about VoyageAI: https://docs.voyageai.com/embeddings/ </p>
</details>
<details>
<summary>Example (Amazon Bedrock embedding)</summary>
<pre><code>config.set_provider_config("embedding", "BedrockEmbedding", {"model": "amazon.titan-embed-text-v2:0"})</code></pre>
<p> You need to install boto3 before running, execute: <code>pip install boto3</code>. More details about Amazon Bedrock: https://docs.aws.amazon.com/bedrock/ </p>
</details>
<details>
<summary>Example (Novita AI embedding)</summary>
<p> Make sure you have prepared your Novita AI API KEY as an env variable <code>NOVITA_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "NovitaEmbedding", {"model": "baai/bge-m3"})</code></pre>
<p> More details about Novita AI: https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link </p>
</details>
<details>
<summary>Example (Siliconflow embedding)</summary>
<p> Make sure you have prepared your Siliconflow API KEY as an env variable <code>SILICONFLOW_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "SiliconflowEmbedding", {"model": "BAAI/bge-m3"})</code></pre>
<p> More details about Siliconflow: https://docs.siliconflow.cn/en/api-reference/embeddings/create-embeddings </p>
</details>
<details>
<summary>Example (Volcengine embedding)</summary>
<p> Make sure you have prepared your Volcengine API KEY as an env variable <code>VOLCENGINE_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "VolcengineEmbedding", {"model": "doubao-embedding-text-240515"})</code></pre>
<p> More details about Volcengine: https://www.volcengine.com/docs/82379/1302003 </p>
</details>
<details>
<summary>Example (GLM embedding)</summary>
<p> Make sure you have prepared your GLM API KEY as an env variable <code>GLM_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "GLMEmbedding", {"model": "embedding-3"})</code></pre>
<p> You need to install zhipuai before running, execute: <code>pip install zhipuai</code>. More details about GLM: https://bigmodel.cn/dev/welcome </p>
</details>
<details>
<summary>Example (Google Gemini embedding)</summary>
<p> Make sure you have prepared your Gemini API KEY as an env variable <code>GEMINI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "GeminiEmbedding", {"model": "text-embedding-004"})</code></pre>
<p> You need to install gemini before running, execute: <code>pip install google-genai</code>. More details about Gemini: https://ai.google.dev/gemini-api/docs </p>
</details>
<details>
<summary>Example (Ollama embedding)</summary>
<pre><code>config.set_provider_config("embedding", "OllamaEmbedding", {"model": "bge-m3"})</code></pre>
<p> You need to install ollama before running, execute: <code>pip install ollama</code>. More details about Ollama Python SDK: https://github.com/ollama/ollama-python </p>
</details>
<details>
<summary>Example (PPIO embedding)</summary>
<p> Make sure you have prepared your PPIO API KEY as an env variable <code>PPIO_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "PPIOEmbedding", {"model": "baai/bge-m3"})</code></pre>
<p> More details about PPIO: https://ppinfra.com/docs/get-started/quickstart.html?utm_source=github_deep-searcher </p>
</details>
<details>
<summary>Example (FastEmbed embedding)</summary>
<pre><code>config.set_provider_config("embedding", "FastEmbedEmbedding", {"model": "intfloat/multilingual-e5-large"})</code></pre>
<p> You need to install fastembed before running, execute: <code>pip install fastembed</code>. More details about fastembed: https://github.com/qdrant/fastembed </p>
</details>
<details>
<summary>Example (IBM watsonx.ai embedding)</summary>
<p> Make sure you have prepared your WatsonX credentials as env variables <code>WATSONX_APIKEY</code>, <code>WATSONX_URL</code>, and <code>WATSONX_PROJECT_ID</code>.</p>
<pre><code>config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "ibm/slate-125m-english-rtrvr-v2"})</code></pre>
<pre><code>config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "sentence-transformers/all-minilm-l6-v2"})</code></pre>
<p> You need to install ibm-watsonx-ai before running, execute: <code>pip install ibm-watsonx-ai</code>. More details about IBM watsonx.ai: https://www.ibm.com/products/watsonx-ai/foundation-models </p>
</details>
#### Vector Database Configuration
<pre><code>config.set_provider_config("vector_db", "(VectorDBName)", "(Arguments dict)")</code></pre>
<p>The "VectorDBName" can be one of the following: ["Milvus"] (Under development)</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the Vector Database class.</p>
<details>
<summary>Example (Milvus)</summary>
<pre><code>config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})</code></pre>
<p> More details about Milvus Config:</p>
<ul>
<li>
Setting the <code>uri</code> as a local file, e.g. <code>./milvus.db</code>, is the most convenient method, as it automatically utilizes <a href="https://milvus.io/docs/milvus_lite.md" target="_blank">Milvus Lite</a> to store all data in this file.
</li>
</ul>
<ul>
<li>
If you have a large-scale dataset, you can set up a more performant Milvus server using
<a href="https://milvus.io/docs/quickstart.md" target="_blank">Docker or Kubernetes</a>.
In this setup, use the server URI, e.g., <code>http://localhost:19530</code>, as your <code>uri</code>.
You can also use any other connection parameters supported by Milvus such as <code>host</code>, <code>user</code>, <code>password</code>, or <code>secure</code>.
</li>
</ul>
<ul>
<li>
If you want to use <a href="https://zilliz.com/cloud" target="_blank">Zilliz Cloud</a>,
the fully managed cloud service for Milvus, adjust the <code>uri</code> and <code>token</code>
according to the <a href="https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details"
target="_blank">Public Endpoint and API Key</a> in Zilliz Cloud.
</li>
</ul>
</details>
<details>
<summary>Example (Azure AI Search)</summary>
<pre><code>config.set_provider_config("vector_db", "AzureSearch", {
"endpoint": "https://<yourazureaisearch>.search.windows.net",
"index_name": "<yourindex>",
"api_key": "<yourkey>",
"vector_field": ""
})</code></pre>
<p> For more details, refer to the Azure AI Search documentation.</p>
</details>
#### File Loader Configuration
<pre><code>config.set_provider_config("file_loader", "(FileLoaderName)", "(Arguments dict)")</code></pre>
<p>The "FileLoaderName" can be one of the following: ["PDFLoader", "TextLoader", "UnstructuredLoader"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the File Loader class.</p>
<details>
<summary>Example (Unstructured)</summary>
<p>You can use Unstructured in two ways:</p>
<ul>
<li>With API: Set environment variables <code>UNSTRUCTURED_API_KEY</code> and <code>UNSTRUCTURED_API_URL</code></li>
<li>Without API: Use the local processing mode by simply not setting these environment variables</li>
</ul>
<pre><code>config.set_provider_config("file_loader", "UnstructuredLoader", {})</code></pre>
<ul>
<li>Currently supported file types: ["pdf"] (Under development)</li>
<li>Installation requirements:
<ul>
<li>Install ingest pipeline: <code>pip install unstructured-ingest</code></li>
<li>For all document formats: <code>pip install "unstructured[all-docs]"</code></li>
<li>For specific formats (e.g., PDF only): <code>pip install "unstructured[pdf]"</code></li>
</ul>
</li>
<li>More information:
<ul>
<li>Unstructured documentation: <a href="https://docs.unstructured.io/ingestion/overview">https://docs.unstructured.io/ingestion/overview</a></li>
<li>Installation guide: <a href="https://docs.unstructured.io/open-source/installation/full-installation">https://docs.unstructured.io/open-source/installation/full-installation</a></li>
</ul>
</li>
</ul>
</details>
<details>
<summary>Example (Docling)</summary>
<pre><code>config.set_provider_config("file_loader", "DoclingLoader", {})</code></pre>
<p> Currently supported file types: please refer to the Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats </p>
<p> You need to install docling before running, execute: <code>pip install docling</code>. More details about Docling: https://docling-project.github.io/docling/ </p>
</details>
#### Web Crawler Configuration
<pre><code>config.set_provider_config("web_crawler", "(WebCrawlerName)", "(Arguments dict)")</code></pre>
<p>The "WebCrawlerName" can be one of the following: ["FireCrawlCrawler", "Crawl4AICrawler", "JinaCrawler"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the Web Crawler class.</p>
<details>
<summary>Example (FireCrawl)</summary>
<p> Make sure you have prepared your FireCrawl API KEY as an env variable <code>FIRECRAWL_API_KEY</code>.</p>
<pre><code>config.set_provider_config("web_crawler", "FireCrawlCrawler", {})</code></pre>
<p> More details about FireCrawl: https://docs.firecrawl.dev/introduction </p>
</details>
<details>
<summary>Example (Crawl4AI)</summary>
<p> Make sure you have run <code>crawl4ai-setup</code> in your environment.</p>
<pre><code>config.set_provider_config("web_crawler", "Crawl4AICrawler", {"browser_config": {"headless": True, "verbose": True}})</code></pre>
<p> You need to install crawl4ai before running, execute: <code>pip install crawl4ai</code>. More details about Crawl4AI: https://docs.crawl4ai.com/ </p>
</details>
<details>
<summary>Example (Jina Reader)</summary>
<p> Make sure you have prepared your Jina Reader API KEY as an env variable <code>JINA_API_TOKEN</code> or <code>JINAAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("web_crawler", "JinaCrawler", {})</code></pre>
<p> More details about Jina Reader: https://jina.ai/reader/ </p>
</details>
<details>
<summary>Example (Docling)</summary>
<pre><code>config.set_provider_config("web_crawler", "DoclingCrawler", {})</code></pre>
<p> Currently supported file types: please refer to the Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats </p>
<p> You need to install docling before running, execute: <code>pip install docling</code>. More details about Docling: https://docling-project.github.io/docling/ </p>
</details>
### Python CLI Mode
#### Load
```shell
deepsearcher load "your_local_path_or_url"
# load into a specific collection
deepsearcher load "your_local_path_or_url" --collection_name "your_collection_name" --collection_desc "your_collection_description"
```
Example loading from local file:
```shell
deepsearcher load "/path/to/your/local/file.pdf"
# or more files at once
deepsearcher load "/path/to/your/local/file1.pdf" "/path/to/your/local/file2.md"
```
Example loading from a URL (*set `FIRECRAWL_API_KEY` in your environment variables, see [FireCrawl](https://docs.firecrawl.dev/introduction) for more details*):
```shell
deepsearcher load "https://www.wikiwand.com/en/articles/DeepSeek"
```
#### Query
```shell
deepsearcher query "Write a report about xxx."
```
More help information
```shell
deepsearcher --help
```
For more help information about a specific subcommand, you can use `deepsearcher [subcommand] --help`.
```shell
deepsearcher load --help
deepsearcher query --help
```
### Deployment
#### Configure modules
You can configure all arguments by modifying [config.yaml](./config.yaml) to set up your system with default modules.
For example, set your `OPENAI_API_KEY` in the `llm` section of the YAML file.
#### Start service
The main script will run a FastAPI service with default address `localhost:8000`.
```shell
$ python main.py
```
#### Access via browser
Open the URL http://localhost:8000/docs in your browser to access the web service.
Click the "Try it out" button; it lets you fill in the parameters and interact with the API directly.
---
## ❓ Q&A
**Q1**: Why do I fail to parse the LLM output format? / How should I select the LLM?
**A1**: Small LLMs struggle to follow the prompt and generate the desired response, which usually causes format parsing problems. A better practice is to use a large reasoning model, e.g. DeepSeek-R1 671B, the OpenAI o-series, Claude 4 Sonnet, etc., as your LLM.
---
**Q2**:
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like GPTCache/paraphrase-albert-small-v2 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
**A2**: This is mainly caused by failing to reach Hugging Face, which may be a network or permission problem. You can try the following two methods:
1. If it is a network problem, set up a proxy by adding the following environment variable.
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
2. If it is a permission problem, set up a personal access token by adding the following environment variable.
```bash
export HUGGING_FACE_HUB_TOKEN=xxxx
```
---
**Q3**: DeepSearcher doesn't run in Jupyter notebook.
**A3**: Install `nest_asyncio` and then put this code block at the top of your Jupyter notebook.
```bash
pip install nest_asyncio
```
```python
import nest_asyncio
nest_asyncio.apply()
```
---
## 🔧 Module Support
### 🔹 Embedding Models
- [Open-source embedding models](https://milvus.io/docs/embeddings.md)
- [OpenAI](https://platform.openai.com/docs/guides/embeddings/use-cases) (`OPENAI_API_KEY` env variable required)
- [VoyageAI](https://docs.voyageai.com/embeddings/) (`VOYAGE_API_KEY` env variable required)
- [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/) (`AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` env variable required)
- [FastEmbed](https://qdrant.github.io/fastembed/)
- [PPIO](https://ppinfra.com/model-api/product/llm-api?utm_source=github_deep-searcher) (`PPIO_API_KEY` env variable required)
- [Novita AI](https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link) (`NOVITA_API_KEY` env variable required)
- [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmembedding) (`WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` env variables required)
### 🔹 LLM Support
- [OpenAI](https://platform.openai.com/docs/models) (`OPENAI_API_KEY` env variable required)
- [DeepSeek](https://api-docs.deepseek.com/) (`DEEPSEEK_API_KEY` env variable required)
- [XAI Grok](https://x.ai/api) (`XAI_API_KEY` env variable required)
- [Anthropic Claude](https://docs.anthropic.com/en/home) (`ANTHROPIC_API_KEY` env variable required)
- [SiliconFlow Inference Service](https://docs.siliconflow.cn/en/userguide/introduction) (`SILICONFLOW_API_KEY` env variable required)
- [PPIO](https://ppinfra.com/model-api/product/llm-api?utm_source=github_deep-searcher) (`PPIO_API_KEY` env variable required)
- [TogetherAI Inference Service](https://docs.together.ai/docs/introduction) (`TOGETHER_API_KEY` env variable required)
- [Google Gemini](https://ai.google.dev/gemini-api/docs) (`GEMINI_API_KEY` env variable required)
- [SambaNova Cloud Inference Service](https://docs.sambanova.ai/) (`SAMBANOVA_API_KEY` env variable required)
- [Ollama](https://ollama.com/)
- [Novita AI](https://novita.ai/docs/guides/introduction?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link) (`NOVITA_API_KEY` env variable required)
- [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmfm) (`WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` env variable required)
### 🔹 Document Loader
- Local File
- PDF (with txt/md) loader
- [Unstructured](https://unstructured.io/) (under development) (`UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_URL` env variables required)
- Web Crawler
- [FireCrawl](https://docs.firecrawl.dev/introduction) (`FIRECRAWL_API_KEY` env variable required)
- [Jina Reader](https://jina.ai/reader/) (`JINA_API_TOKEN` env variable required)
- [Crawl4AI](https://docs.crawl4ai.com/) (You should run command `crawl4ai-setup` for the first time)
### 🔹 Vector Database Support
- [Milvus](https://milvus.io/) and [Zilliz Cloud](https://www.zilliz.com/) (fully managed Milvus)
- [Qdrant](https://qdrant.tech/)
---
## 📊 Evaluation
See the [Evaluation](./evaluation) directory for more details.
---
## 📌 Future Plans
- Enhance web crawling functionality
- Support more vector databases (e.g., FAISS...)
- Add support for additional large models
- Provide RESTful API interface (**DONE**)
We welcome contributions! Star & Fork the project and help us build a more powerful DeepSearcher! 🎯

BIN
assets/pic/deep-searcher-arch.png

Binary image, 307 KiB (not shown).

BIN
assets/pic/demo.gif

Binary image, 3.4 MiB (not shown).

BIN
assets/pic/logo.png

Binary image, 54 KiB (not shown).

5
deepsearcher/__init__.py

@@ -0,0 +1,5 @@
import os
# ignore the warnings
# None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

12
deepsearcher/agent/__init__.py

@@ -0,0 +1,12 @@
from .base import BaseAgent, RAGAgent
from .chain_of_rag import ChainOfRAG
from .deep_search import DeepSearch
from .naive_rag import NaiveRAG

__all__ = [
    "ChainOfRAG",
    "DeepSearch",
    "NaiveRAG",
    "BaseAgent",
    "RAGAgent",
]

103
deepsearcher/agent/base.py

@@ -0,0 +1,103 @@
from abc import ABC
from typing import Any, List, Tuple

from deepsearcher.vector_db import RetrievalResult


def describe_class(description):
    """
    Decorator function to add a description to a class.

    This decorator adds a __description__ attribute to the decorated class,
    which can be used for documentation or introspection.

    Args:
        description: The description to add to the class.

    Returns:
        A decorator function that adds the description to the class.
    """

    def decorator(cls):
        cls.__description__ = description
        return cls

    return decorator


class BaseAgent(ABC):
    """
    Abstract base class for all agents in the DeepSearcher system.

    This class defines the basic interface for agents, including initialization
    and invocation methods.
    """

    def __init__(self, **kwargs):
        """
        Initialize a BaseAgent object.

        Args:
            **kwargs: Arbitrary keyword arguments.
        """
        pass

    def invoke(self, query: str, **kwargs) -> Any:
        """
        Invoke the agent and return the result.

        Args:
            query: The query string.
            **kwargs: Additional keyword arguments.

        Returns:
            The result of invoking the agent.
        """


class RAGAgent(BaseAgent):
    """
    Abstract base class for Retrieval-Augmented Generation (RAG) agents.

    This class extends BaseAgent with methods specific to RAG, including
    retrieval and query methods.
    """

    def __init__(self, **kwargs):
        """
        Initialize a RAGAgent object.

        Args:
            **kwargs: Arbitrary keyword arguments.
        """
        pass

    def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
        """
        Retrieve document results from the knowledge base.

        Args:
            query: The query string.
            **kwargs: Additional keyword arguments.

        Returns:
            A tuple containing:
                - the retrieved results
                - the total number of token usages of the LLM
                - any additional metadata, which can be an empty dictionary
        """

    def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
        """
        Query the agent and return the answer.

        Args:
            query: The query string.
            **kwargs: Additional keyword arguments.

        Returns:
            A tuple containing:
                - the result generated from LLM
                - the retrieved document results
                - the total number of token usages of the LLM
        """
326
deepsearcher/agent/chain_of_rag.py

@ -0,0 +1,326 @@
from typing import List, Tuple
from deepsearcher.agent.base import RAGAgent, describe_class
from deepsearcher.agent.collection_router import CollectionRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult
from deepsearcher.vector_db.base import BaseVectorDB, deduplicate_results
FOLLOWUP_QUERY_PROMPT = """You are using a search tool to answer the main query by iteratively searching the database. Given the following intermediate queries and answers, generate a new simple follow-up question that can help answer the main query. You may rephrase or decompose the main query when previous answers are not helpful. Ask simple follow-up questions only as the search tool may not understand complex questions.
## Previous intermediate queries and answers
{intermediate_context}
## Main query to answer
{query}
Respond with a simple follow-up question that will help answer the main query, do not explain yourself or output anything else.
"""
INTERMEDIATE_ANSWER_PROMPT = """Given the following documents, generate an appropriate answer for the query. DO NOT hallucinate any information, only use the provided documents to generate the answer. Respond "No relevant information found" if the documents do not contain useful information.
## Documents
{retrieved_documents}
## Query
{sub_query}
Respond with a concise answer only, do not explain yourself or output anything else.
"""
FINAL_ANSWER_PROMPT = """Given the following intermediate queries and answers, generate a final answer for the main query by combining relevant information. Note that intermediate answers are generated by an LLM and may not always be accurate.
## Documents
{retrieved_documents}
## Intermediate queries and answers
{intermediate_context}
## Main query
{query}
Respond with an appropriate answer only, do not explain yourself or output anything else.
"""
REFLECTION_PROMPT = """Given the following intermediate queries and answers, judge whether you have enough information to answer the main query. If you believe you have enough information, respond with "Yes", otherwise respond with "No".
## Intermediate queries and answers
{intermediate_context}
## Main query
{query}
Respond with "Yes" or "No" only, do not explain yourself or output anything else.
"""
GET_SUPPORTED_DOCS_PROMPT = """Given the following documents, select the ones that support the Q-A pair.
## Documents
{retrieved_documents}
## Q-A Pair
### Question
{query}
### Answer
{answer}
Respond with a python list of indices of the selected documents.
"""
@describe_class(
"This agent can decompose complex queries and gradually find the fact information of sub-queries. "
"It is very suitable for handling concrete factual queries and multi-hop questions."
)
class ChainOfRAG(RAGAgent):
"""
Chain of Retrieval-Augmented Generation (RAG) agent implementation.
This agent implements a multi-step RAG process where each step can refine
the query and retrieval process based on previous results, creating a chain
of increasingly focused and relevant information retrieval and generation.
Inspired by: https://arxiv.org/pdf/2501.14342
"""
def __init__(
self,
llm: BaseLLM,
embedding_model: BaseEmbedding,
vector_db: BaseVectorDB,
max_iter: int = 4,
early_stopping: bool = False,
route_collection: bool = True,
text_window_splitter: bool = True,
**kwargs,
):
"""
Initialize the ChainOfRAG agent with configuration parameters.
Args:
llm (BaseLLM): The language model to use for generating answers.
embedding_model (BaseEmbedding): The embedding model to use for embedding queries.
vector_db (BaseVectorDB): The vector database to search for relevant documents.
max_iter (int, optional): The maximum number of iterations for the RAG process. Defaults to 4.
early_stopping (bool, optional): Whether to use early stopping. Defaults to False.
route_collection (bool, optional): Whether to route the query to specific collections. Defaults to True.
text_window_splitter (bool, optional): Whether to use the text window splitter. Defaults to True.
"""
self.llm = llm
self.embedding_model = embedding_model
self.vector_db = vector_db
self.max_iter = max_iter
self.early_stopping = early_stopping
self.route_collection = route_collection
self.collection_router = CollectionRouter(
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension
)
self.text_window_splitter = text_window_splitter
def _reflect_get_subquery(self, query: str, intermediate_context: List[str]) -> Tuple[str, int]:
chat_response = self.llm.chat(
[
{
"role": "user",
"content": FOLLOWUP_QUERY_PROMPT.format(
query=query,
intermediate_context="\n".join(intermediate_context),
),
}
]
)
return self.llm.remove_think(chat_response.content), chat_response.total_tokens
def _retrieve_and_answer(self, query: str) -> Tuple[str, List[RetrievalResult], int]:
consume_tokens = 0
if self.route_collection:
selected_collections, n_token_route = self.collection_router.invoke(
query=query, dim=self.embedding_model.dimension
)
else:
selected_collections = self.collection_router.all_collections
n_token_route = 0
consume_tokens += n_token_route
all_retrieved_results = []
for collection in selected_collections:
log.color_print(f"<search> Search [{query}] in [{collection}]... </search>\n")
query_vector = self.embedding_model.embed_query(query)
retrieved_results = self.vector_db.search_data(
collection=collection, vector=query_vector, query_text=query
)
all_retrieved_results.extend(retrieved_results)
all_retrieved_results = deduplicate_results(all_retrieved_results)
chat_response = self.llm.chat(
[
{
"role": "user",
"content": INTERMEDIATE_ANSWER_PROMPT.format(
retrieved_documents=self._format_retrieved_results(all_retrieved_results),
sub_query=query,
),
}
]
)
return (
self.llm.remove_think(chat_response.content),
all_retrieved_results,
consume_tokens + chat_response.total_tokens,
)
def _get_supported_docs(
self,
retrieved_results: List[RetrievalResult],
query: str,
intermediate_answer: str,
) -> Tuple[List[RetrievalResult], int]:
supported_retrieved_results = []
token_usage = 0
if "No relevant information found" not in intermediate_answer:
chat_response = self.llm.chat(
[
{
"role": "user",
"content": GET_SUPPORTED_DOCS_PROMPT.format(
retrieved_documents=self._format_retrieved_results(retrieved_results),
query=query,
answer=intermediate_answer,
),
}
]
)
supported_doc_indices = self.llm.literal_eval(chat_response.content)
supported_retrieved_results = [
retrieved_results[int(i)]
for i in supported_doc_indices
if int(i) < len(retrieved_results)
]
token_usage = chat_response.total_tokens
return supported_retrieved_results, token_usage
def _check_has_enough_info(
self, query: str, intermediate_contexts: List[str]
) -> Tuple[bool, int]:
if not intermediate_contexts:
return False, 0
chat_response = self.llm.chat(
[
{
"role": "user",
"content": REFLECTION_PROMPT.format(
query=query,
intermediate_context="\n".join(intermediate_contexts),
),
}
]
)
has_enough_info = self.llm.remove_think(chat_response.content).strip().lower() == "yes"
return has_enough_info, chat_response.total_tokens
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
"""
Retrieves relevant documents based on the input query and iteratively refines the search.
This method iteratively refines the search query based on intermediate results, retrieves documents,
and filters out supported documents. It keeps track of the intermediate contexts and token usage.
Args:
query (str): The initial search query.
**kwargs: Additional keyword arguments.
- max_iter (int, optional): The maximum number of iterations for refinement. Defaults to self.max_iter.
Returns:
Tuple[List[RetrievalResult], int, dict]: A tuple containing:
- List[RetrievalResult]: The list of all retrieved and deduplicated results.
- int: The total token usage across all iterations.
- dict: A dictionary containing additional information, including the intermediate contexts.
"""
max_iter = kwargs.pop("max_iter", self.max_iter)
intermediate_contexts = []
all_retrieved_results = []
token_usage = 0
for iter in range(max_iter):
log.color_print(f">> Iteration: {iter + 1}\n")
followup_query, n_token0 = self._reflect_get_subquery(query, intermediate_contexts)
intermediate_answer, retrieved_results, n_token1 = self._retrieve_and_answer(
followup_query
)
supported_retrieved_results, n_token2 = self._get_supported_docs(
retrieved_results, followup_query, intermediate_answer
)
all_retrieved_results.extend(supported_retrieved_results)
intermediate_idx = len(intermediate_contexts) + 1
intermediate_contexts.append(
f"Intermediate query{intermediate_idx}: {followup_query}\nIntermediate answer{intermediate_idx}: {intermediate_answer}"
)
token_usage += n_token0 + n_token1 + n_token2
if self.early_stopping:
has_enough_info, n_token_check = self._check_has_enough_info(
query, intermediate_contexts
)
token_usage += n_token_check
if has_enough_info:
log.color_print(
f"<think> Early stopping after iteration {iter + 1}: Have enough information to answer the main query. </think>\n"
)
break
all_retrieved_results = deduplicate_results(all_retrieved_results)
additional_info = {"intermediate_context": intermediate_contexts}
return all_retrieved_results, token_usage, additional_info
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
"""
Executes a query and returns the final answer along with all retrieved results and total token usage.
This method initiates a query, retrieves relevant documents, and then summarizes the answer based on the retrieved documents and intermediate contexts. It logs the final answer and returns the answer content, all retrieved results, and the total token usage including the tokens used for the final answer.
Args:
query (str): The initial query to execute.
**kwargs: Additional keyword arguments to pass to the `retrieve` method.
Returns:
Tuple[str, List[RetrievalResult], int]: A tuple containing:
- str: The final answer content.
- List[RetrievalResult]: The list of all retrieved and deduplicated results.
- int: The total token usage across all iterations, including the final answer.
"""
all_retrieved_results, n_token_retrieval, additional_info = self.retrieve(query, **kwargs)
intermediate_context = additional_info["intermediate_context"]
log.color_print(
f"<think> Summarize answer from all {len(all_retrieved_results)} retrieved chunks... </think>\n"
)
chat_response = self.llm.chat(
[
{
"role": "user",
"content": FINAL_ANSWER_PROMPT.format(
retrieved_documents=self._format_retrieved_results(all_retrieved_results),
intermediate_context="\n".join(intermediate_context),
query=query,
),
}
]
)
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(self.llm.remove_think(chat_response.content))
return (
self.llm.remove_think(chat_response.content),
all_retrieved_results,
n_token_retrieval + chat_response.total_tokens,
)
def _format_retrieved_results(self, retrieved_results: List[RetrievalResult]) -> str:
formatted_documents = []
for i, result in enumerate(retrieved_results):
if self.text_window_splitter and "wider_text" in result.metadata:
text = result.metadata["wider_text"]
else:
text = result.text
formatted_documents.append(f"<Document {i}>\n{text}\n</Document {i}>")
return "\n".join(formatted_documents)

98
deepsearcher/agent/collection_router.py

@ -0,0 +1,98 @@
from typing import List, Tuple
from deepsearcher.agent.base import BaseAgent
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB
COLLECTION_ROUTE_PROMPT = """
I provide you with collection_name(s) and corresponding collection_description(s). Please select the collection names that may be related to the question and return a python list of str. If there is no collection related to the question, you can return an empty list.
"QUESTION": {question}
"COLLECTION_INFO": {collection_info}
When you return, you can ONLY return a json convertible python list of str, WITHOUT any other additional content. Your selected collection name list is:
"""
class CollectionRouter(BaseAgent):
"""
Routes queries to appropriate collections in the vector database.
This class analyzes the content of a query and determines which collections
in the vector database are most likely to contain relevant information.
"""
def __init__(self, llm: BaseLLM, vector_db: BaseVectorDB, dim: int, **kwargs):
"""
Initialize the CollectionRouter.
Args:
llm: The language model to use for analyzing queries.
vector_db: The vector database containing the collections.
dim: The dimension of the vector space to search in.
"""
self.llm = llm
self.vector_db = vector_db
self.all_collections = [
collection_info.collection_name
for collection_info in self.vector_db.list_collections(dim=dim)
]
def invoke(self, query: str, dim: int, **kwargs) -> Tuple[List[str], int]:
"""
Determine which collections are relevant for the given query.
This method analyzes the query content and selects collections that are
most likely to contain information relevant to answering the query.
Args:
query (str): The query to analyze.
dim (int): The dimension of the vector space to search in.
Returns:
Tuple[List[str], int]: A tuple containing:
- A list of selected collection names
- The token usage for the routing operation
"""
consume_tokens = 0
collection_infos = self.vector_db.list_collections(dim=dim)
if len(collection_infos) == 0:
log.color_print(
"No collections found in the vector database. Please check the database connection."
)
return [], 0
if len(collection_infos) == 1:
the_only_collection = collection_infos[0].collection_name
log.color_print(
f"<think> Perform search [{query}] on the vector DB collection: {the_only_collection} </think>\n"
)
return [the_only_collection], 0
vector_db_search_prompt = COLLECTION_ROUTE_PROMPT.format(
question=query,
collection_info=[
{
"collection_name": collection_info.collection_name,
"collection_description": collection_info.description,
}
for collection_info in collection_infos
],
)
chat_response = self.llm.chat(
messages=[{"role": "user", "content": vector_db_search_prompt}]
)
selected_collections = self.llm.literal_eval(chat_response.content)
consume_tokens += chat_response.total_tokens
for collection_info in collection_infos:
# If a collection has no description, always include it in the selection
if not collection_info.description:
selected_collections.append(collection_info.collection_name)
# Always include the default collection if it exists
if self.vector_db.default_collection == collection_info.collection_name:
selected_collections.append(collection_info.collection_name)
selected_collections = list(set(selected_collections))
log.color_print(
f"<think> Perform search [{query}] on the vector DB collections: {selected_collections} </think>\n"
)
return selected_collections, consume_tokens

319
deepsearcher/agent/deep_search.py

@ -0,0 +1,319 @@
import asyncio
from typing import List, Tuple
from deepsearcher.agent.base import RAGAgent, describe_class
from deepsearcher.agent.collection_router import CollectionRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult
from deepsearcher.vector_db.base import BaseVectorDB, deduplicate_results
SUB_QUERY_PROMPT = """To answer this question more comprehensively, please break down the original question into up to four sub-questions. Return as list of str.
If this is a very simple question and no decomposition is necessary, then keep the only one original question in the python code list.
Original Question: {original_query}
<EXAMPLE>
Example input:
"Explain deep learning"
Example output:
[
"What is deep learning?",
"What is the difference between deep learning and machine learning?",
"What is the history of deep learning?"
]
</EXAMPLE>
Provide your response in a python code list of str format:
"""
RERANK_PROMPT = """Based on the query questions and the retrieved chunk, to determine whether the chunk is helpful in answering any of the query question, you can only return "YES" or "NO", without any other information.
Query Questions: {query}
Retrieved Chunk: {retrieved_chunk}
Is the chunk helpful in answering the any of the questions?
"""
REFLECT_PROMPT = """Determine whether additional search queries are needed based on the original query, previous sub queries, and all retrieved document chunks. If further research is required, provide a Python list of up to 3 search queries. If no further research is required, return an empty list.
If the original query is to write a report, then you should prefer to generate some further queries rather than returning an empty list.
Original Query: {question}
Previous Sub Queries: {mini_questions}
Related Chunks:
{mini_chunk_str}
Respond exclusively in valid List of str format without any other text."""
SUMMARY_PROMPT = """You are a AI content analysis expert, good at summarizing content. Please summarize a specific and detailed answer or report based on the previous queries and the retrieved document chunks.
Original Query: {question}
Previous Sub Queries: {mini_questions}
Related Chunks:
{mini_chunk_str}
"""
@describe_class(
"This agent is suitable for handling general and simple queries, such as given a topic and then writing a report, survey, or article."
)
class DeepSearch(RAGAgent):
"""
Deep Search agent implementation for comprehensive information retrieval.
This agent performs a thorough search through the knowledge base, analyzing
multiple aspects of the query to provide comprehensive and detailed answers.
"""
def __init__(
self,
llm: BaseLLM,
embedding_model: BaseEmbedding,
vector_db: BaseVectorDB,
max_iter: int = 3,
route_collection: bool = True,
text_window_splitter: bool = True,
**kwargs,
):
"""
Initialize the DeepSearch agent.
Args:
llm: The language model to use for generating answers.
embedding_model: The embedding model to use for query embedding.
vector_db: The vector database to search for relevant documents.
max_iter: The maximum number of iterations for the search process.
route_collection: Whether to use a collection router for search.
text_window_splitter: Whether to use text_window splitter.
**kwargs: Additional keyword arguments for customization.
"""
self.llm = llm
self.embedding_model = embedding_model
self.vector_db = vector_db
self.max_iter = max_iter
self.route_collection = route_collection
self.collection_router = CollectionRouter(
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension
)
self.text_window_splitter = text_window_splitter
def _generate_sub_queries(self, original_query: str) -> Tuple[List[str], int]:
chat_response = self.llm.chat(
messages=[
{"role": "user", "content": SUB_QUERY_PROMPT.format(original_query=original_query)}
]
)
response_content = self.llm.remove_think(chat_response.content)
return self.llm.literal_eval(response_content), chat_response.total_tokens
async def _search_chunks_from_vectordb(self, query: str, sub_queries: List[str]):
consume_tokens = 0
if self.route_collection:
selected_collections, n_token_route = self.collection_router.invoke(
query=query, dim=self.embedding_model.dimension
)
else:
selected_collections = self.collection_router.all_collections
n_token_route = 0
consume_tokens += n_token_route
all_retrieved_results = []
query_vector = self.embedding_model.embed_query(query)
for collection in selected_collections:
log.color_print(f"<search> Search [{query}] in [{collection}]... </search>\n")
retrieved_results = self.vector_db.search_data(
collection=collection, vector=query_vector, query_text=query
)
if not retrieved_results or len(retrieved_results) == 0:
log.color_print(
f"<search> No relevant document chunks found in '{collection}'! </search>\n"
)
continue
accepted_chunk_num = 0
references = set()
for retrieved_result in retrieved_results:
chat_response = self.llm.chat(
messages=[
{
"role": "user",
"content": RERANK_PROMPT.format(
query=[query] + sub_queries,
retrieved_chunk=f"<chunk>{retrieved_result.text}</chunk>",
),
}
]
)
consume_tokens += chat_response.total_tokens
response_content = self.llm.remove_think(chat_response.content).strip()
if "YES" in response_content and "NO" not in response_content:
all_retrieved_results.append(retrieved_result)
accepted_chunk_num += 1
references.add(retrieved_result.reference)
if accepted_chunk_num > 0:
log.color_print(
f"<search> Accept {accepted_chunk_num} document chunk(s) from references: {list(references)} </search>\n"
)
else:
log.color_print(
f"<search> No document chunk accepted from '{collection}'! </search>\n"
)
return all_retrieved_results, consume_tokens
def _generate_gap_queries(
self, original_query: str, all_sub_queries: List[str], all_chunks: List[RetrievalResult]
) -> Tuple[List[str], int]:
reflect_prompt = REFLECT_PROMPT.format(
question=original_query,
mini_questions=all_sub_queries,
mini_chunk_str=self._format_chunk_texts([chunk.text for chunk in all_chunks])
if len(all_chunks) > 0
else "NO RELATED CHUNKS FOUND.",
)
chat_response = self.llm.chat([{"role": "user", "content": reflect_prompt}])
response_content = self.llm.remove_think(chat_response.content)
return self.llm.literal_eval(response_content), chat_response.total_tokens
def retrieve(self, original_query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
"""
Retrieve relevant documents from the knowledge base for the given query.
This method performs a deep search through the vector database to find
the most relevant documents for answering the query.
Args:
original_query (str): The query to search for.
**kwargs: Additional keyword arguments for customizing the retrieval.
Returns:
Tuple[List[RetrievalResult], int, dict]: A tuple containing:
- A list of retrieved document results
- The token usage for the retrieval operation
- Additional information about the retrieval process
"""
return asyncio.run(self.async_retrieve(original_query, **kwargs))
async def async_retrieve(
self, original_query: str, **kwargs
) -> Tuple[List[RetrievalResult], int, dict]:
max_iter = kwargs.pop("max_iter", self.max_iter)
### SUB QUERIES ###
log.color_print(f"<query> {original_query} </query>\n")
all_search_res = []
all_sub_queries = []
total_tokens = 0
sub_queries, used_token = self._generate_sub_queries(original_query)
total_tokens += used_token
if not sub_queries:
log.color_print("No sub queries were generated by the LLM. Exiting.")
return [], total_tokens, {}
else:
log.color_print(
f"<think> Break down the original query into new sub queries: {sub_queries}</think>\n"
)
all_sub_queries.extend(sub_queries)
sub_gap_queries = sub_queries
for iter in range(max_iter):
log.color_print(f">> Iteration: {iter + 1}\n")
search_res_from_vectordb = []
search_res_from_internet = [] # TODO
# Create all search tasks
search_tasks = [
self._search_chunks_from_vectordb(query, sub_gap_queries)
for query in sub_gap_queries
]
# Execute all tasks in parallel and wait for results
search_results = await asyncio.gather(*search_tasks)
# Merge all results
for result in search_results:
search_res, consumed_token = result
total_tokens += consumed_token
search_res_from_vectordb.extend(search_res)
search_res_from_vectordb = deduplicate_results(search_res_from_vectordb)
# search_res_from_internet = deduplicate_results(search_res_from_internet)
all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
if iter == max_iter - 1:
log.color_print("<think> Exceeded maximum iterations. Exiting. </think>\n")
break
### REFLECTION & GET GAP QUERIES ###
log.color_print("<think> Reflecting on the search results... </think>\n")
sub_gap_queries, consumed_token = self._generate_gap_queries(
original_query, all_sub_queries, all_search_res
)
total_tokens += consumed_token
if not sub_gap_queries or len(sub_gap_queries) == 0:
log.color_print("<think> No new search queries were generated. Exiting. </think>\n")
break
else:
log.color_print(
f"<think> New search queries for next iteration: {sub_gap_queries} </think>\n"
)
all_sub_queries.extend(sub_gap_queries)
all_search_res = deduplicate_results(all_search_res)
additional_info = {"all_sub_queries": all_sub_queries}
return all_search_res, total_tokens, additional_info
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
"""
Query the agent and generate an answer based on retrieved documents.
This method retrieves relevant documents and uses the language model
to generate a comprehensive answer to the query.
Args:
query (str): The query to answer.
**kwargs: Additional keyword arguments for customizing the query process.
Returns:
Tuple[str, List[RetrievalResult], int]: A tuple containing:
- The generated answer
- A list of retrieved document results
- The total token usage
"""
all_retrieved_results, n_token_retrieval, additional_info = self.retrieve(query, **kwargs)
if not all_retrieved_results or len(all_retrieved_results) == 0:
return f"No relevant information found for query '{query}'.", [], n_token_retrieval
all_sub_queries = additional_info["all_sub_queries"]
chunk_texts = []
for chunk in all_retrieved_results:
if self.text_window_splitter and "wider_text" in chunk.metadata:
chunk_texts.append(chunk.metadata["wider_text"])
else:
chunk_texts.append(chunk.text)
log.color_print(
f"<think> Summarize answer from all {len(all_retrieved_results)} retrieved chunks... </think>\n"
)
summary_prompt = SUMMARY_PROMPT.format(
question=query,
mini_questions=all_sub_queries,
mini_chunk_str=self._format_chunk_texts(chunk_texts),
)
chat_response = self.llm.chat([{"role": "user", "content": summary_prompt}])
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(self.llm.remove_think(chat_response.content))
return (
self.llm.remove_think(chat_response.content),
all_retrieved_results,
n_token_retrieval + chat_response.total_tokens,
)
def _format_chunk_texts(self, chunk_texts: List[str]) -> str:
chunk_str = ""
for i, chunk in enumerate(chunk_texts):
chunk_str += f"""<chunk_{i}>\n{chunk}\n</chunk_{i}>\n"""
return chunk_str

128
deepsearcher/agent/naive_rag.py

@ -0,0 +1,128 @@
from typing import List, Tuple
from deepsearcher.agent.base import RAGAgent
from deepsearcher.agent.collection_router import CollectionRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, RetrievalResult, deduplicate_results
SUMMARY_PROMPT = """You are a AI content analysis expert, good at summarizing content. Please summarize a specific and detailed answer or report based on the previous queries and the retrieved document chunks.
Original Query: {query}
Related Chunks:
{mini_chunk_str}
"""
class NaiveRAG(RAGAgent):
"""
Naive Retrieval-Augmented Generation agent implementation.
This agent implements a straightforward RAG approach, retrieving relevant
documents and generating answers without complex processing or refinement steps.
"""
def __init__(
self,
llm: BaseLLM,
embedding_model: BaseEmbedding,
vector_db: BaseVectorDB,
top_k: int = 10,
route_collection: bool = True,
text_window_splitter: bool = True,
**kwargs,
):
"""
Initialize the NaiveRAG agent.
Args:
llm: The language model to use for generating answers.
embedding_model: The embedding model to use for query embedding.
vector_db: The vector database to search for relevant documents.
top_k: The total number of chunks to retrieve, split across the selected collections.
route_collection: Whether to route the query to specific collections.
text_window_splitter: Whether to use the text window splitter.
**kwargs: Additional keyword arguments for customization.
"""
self.llm = llm
self.embedding_model = embedding_model
self.vector_db = vector_db
self.top_k = top_k
self.route_collection = route_collection
# Always build the router so that all_collections is available even when routing is disabled
self.collection_router = CollectionRouter(
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension
)
self.text_window_splitter = text_window_splitter
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
"""
Retrieve relevant documents from the knowledge base for the given query.
This method performs a basic search through the vector database to find
documents relevant to the query.
Args:
query (str): The query to search for.
**kwargs: Additional keyword arguments for customizing the retrieval.
Returns:
Tuple[List[RetrievalResult], int, dict]: A tuple containing:
- A list of retrieved document results
- The token usage for the retrieval operation
- Additional information about the retrieval process
"""
consume_tokens = 0
if self.route_collection:
selected_collections, n_token_route = self.collection_router.invoke(
query=query, dim=self.embedding_model.dimension
)
else:
selected_collections = self.collection_router.all_collections
n_token_route = 0
consume_tokens += n_token_route
all_retrieved_results = []
for collection in selected_collections:
retrieval_res = self.vector_db.search_data(
collection=collection,
vector=self.embedding_model.embed_query(query),
top_k=max(self.top_k // len(selected_collections), 1),
query_text=query,
)
all_retrieved_results.extend(retrieval_res)
all_retrieved_results = deduplicate_results(all_retrieved_results)
return all_retrieved_results, consume_tokens, {}
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
"""
Query the agent and generate an answer based on retrieved documents.
This method retrieves relevant documents and uses the language model
to generate a simple answer to the query.
Args:
query (str): The query to answer.
**kwargs: Additional keyword arguments for customizing the query process.
Returns:
Tuple[str, List[RetrievalResult], int]: A tuple containing:
- The generated answer
- A list of retrieved document results
- The total token usage
"""
all_retrieved_results, n_token_retrieval, _ = self.retrieve(query)
chunk_texts = []
for chunk in all_retrieved_results:
if self.text_window_splitter and "wider_text" in chunk.metadata:
chunk_texts.append(chunk.metadata["wider_text"])
else:
chunk_texts.append(chunk.text)
mini_chunk_str = ""
for i, chunk in enumerate(chunk_texts):
mini_chunk_str += f"""<chunk_{i}>\n{chunk}\n</chunk_{i}>\n"""
summary_prompt = SUMMARY_PROMPT.format(query=query, mini_chunk_str=mini_chunk_str)
chat_response = self.llm.chat([{"role": "user", "content": summary_prompt}])
final_answer = chat_response.content
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(final_answer)
return final_answer, all_retrieved_results, n_token_retrieval + chat_response.total_tokens

93
deepsearcher/agent/rag_router.py

@ -0,0 +1,93 @@
from typing import List, Optional, Tuple
from deepsearcher.agent import RAGAgent
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult
RAG_ROUTER_PROMPT = """Given a list of agent indexes and corresponding descriptions, each agent has a specific function.
Given a query, select only one agent that best matches the agent handling the query, and return the index without any other information.
## Question
{query}
## Agent Indexes and Descriptions
{description_str}
Only return one agent index number that best matches the agent handling the query:
"""
class RAGRouter(RAGAgent):
"""
Routes queries to the most appropriate RAG agent implementation.
This class analyzes the content and requirements of a query and determines
which RAG agent implementation is best suited to handle it.
"""
def __init__(
self,
llm: BaseLLM,
rag_agents: List[RAGAgent],
agent_descriptions: Optional[List[str]] = None,
):
"""
Initialize the RAGRouter.
Args:
llm: The language model to use for analyzing queries.
rag_agents: A list of RAGAgent instances.
agent_descriptions (list, optional): A list of descriptions for each agent.
"""
self.llm = llm
self.rag_agents = rag_agents
self.agent_descriptions = agent_descriptions
if not self.agent_descriptions:
try:
self.agent_descriptions = [
agent.__class__.__description__ for agent in self.rag_agents
]
except Exception:
raise AttributeError(
"Please provide agent descriptions or set __description__ attribute for each agent class."
)
def _route(self, query: str) -> Tuple[RAGAgent, int]:
description_str = "\n".join(
[f"[{i + 1}]: {description}" for i, description in enumerate(self.agent_descriptions)]
)
prompt = RAG_ROUTER_PROMPT.format(query=query, description_str=description_str)
chat_response = self.llm.chat(messages=[{"role": "user", "content": prompt}])
try:
selected_agent_index = int(self.llm.remove_think(chat_response.content)) - 1
except ValueError:
# Some reasoning LLMs do not output a bare number, but an explanation string with a number at the end.
log.warning(
"Parse int failed in RAGRouter, but will try to find the last digit as fallback."
)
selected_agent_index = (
int(self.find_last_digit(self.llm.remove_think(chat_response.content))) - 1
)
selected_agent = self.rag_agents[selected_agent_index]
log.color_print(
f"<think> Select agent [{selected_agent.__class__.__name__}] to answer the query [{query}] </think>\n"
)
return self.rag_agents[selected_agent_index], chat_response.total_tokens
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
agent, n_token_router = self._route(query)
retrieved_results, n_token_retrieval, metadata = agent.retrieve(query, **kwargs)
return retrieved_results, n_token_router + n_token_retrieval, metadata
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
agent, n_token_router = self._route(query)
answer, retrieved_results, n_token_retrieval = agent.query(query, **kwargs)
return answer, retrieved_results, n_token_router + n_token_retrieval
def find_last_digit(self, string):
for char in reversed(string):
if char.isdigit():
return char
raise ValueError("No digit found in the string")

118
deepsearcher/cli.py

@ -0,0 +1,118 @@
import argparse
import logging
import sys
import warnings
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.utils import log
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
warnings.simplefilter(action="ignore", category=FutureWarning) # disable warning output
def main():
"""
Main entry point for the DeepSearcher CLI.
This function parses command line arguments and executes the appropriate action
based on the subcommand provided (query or load). It handles the deprecated
command line format and provides helpful error messages.
Returns:
None
"""
if "--query" in sys.argv or "--load" in sys.argv:
print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.")
print("Please use:")
print(" deepsearcher query <your_query> --max_iter 3")
print(
" deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>"
)
sys.exit(1)
config = Configuration() # Customize your config here
init_config(config=config)
parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.")
subparsers = parser.add_subparsers(dest="subcommand", title="subcommands")
## Arguments of query
query_parser = subparsers.add_parser("query", help="Query a question or search topic.")
query_parser.add_argument("query", type=str, default="", help="query question or search topic.")
query_parser.add_argument(
"--max_iter",
type=int,
default=3,
help="Max iterations of reflection. Default is 3.",
)
## Arguments of loading
load_parser = subparsers.add_parser(
"load", help="Load knowledge from local files or from URLs."
)
load_parser.add_argument(
"load_path",
type=str,
nargs="+", # 1 or more files or urls
help="Load knowledge from local files or from URLs.",
)
load_parser.add_argument(
"--batch_size",
type=int,
default=256,
help="Batch size for loading knowledge.",
)
load_parser.add_argument(
"--collection_name",
type=str,
default=None,
help="Destination collection name of loaded knowledge.",
)
load_parser.add_argument(
"--collection_desc",
type=str,
default=None,
help="Description of the collection.",
)
load_parser.add_argument(
"--force_new_collection",
action="store_true",
help="If set, drop the original collection and create a new collection on every load.",
)
args = parser.parse_args()
if args.subcommand == "query":
final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter)
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(final_answer)
log.color_print("\n### References\n")
for i, ref in enumerate(refs):
log.color_print(f"{i + 1}. {ref.text[:60]}{ref.reference}")
elif args.subcommand == "load":
urls = [url for url in args.load_path if url.startswith("http")]
local_files = [file for file in args.load_path if not file.startswith("http")]
kwargs = {}
if args.collection_name:
kwargs["collection_name"] = args.collection_name
if args.collection_desc:
kwargs["collection_description"] = args.collection_desc
if args.force_new_collection:
kwargs["force_new_collection"] = args.force_new_collection
if args.batch_size:
kwargs["batch_size"] = args.batch_size
if len(urls) > 0:
load_from_website(urls, **kwargs)
if len(local_files) > 0:
load_from_local_files(local_files, **kwargs)
else:
print("Please provide a query or a load argument.")
if __name__ == "__main__":
main()

87
deepsearcher/config.yaml

@ -0,0 +1,87 @@
provide_settings:
llm:
provider: "OpenAILLM"
config:
model: "Qwen/Qwen3-8B-FP8"
api_key: "empty"
base_url: "http://localhost:8000/v1"
embedding:
provider: "OpenAIEmbedding"
config:
model: "Qwen/Qwen3-Embedding-0.6B"
api_key: "empty"
base_url: "http://localhost:8001/v1"
dimension: 1024
dim_change: false
file_loader:
provider: "PDFLoader"
config: {}
# provider: "JsonFileLoader"
# config:
# text_key: ""
# provider: "TextLoader"
# config: {}
# provider: "UnstructuredLoader"
# config: {}
# provider: "DoclingLoader"
# config: {}
web_crawler:
provider: "FireCrawlCrawler"
config: {}
# provider: "Crawl4AICrawler"
# config: # Uncomment to customize the browser configuration for Crawl4AI
# browser_config:
# headless: false
# proxy: "http://127.0.0.1:7890"
# chrome_channel: "chrome"
# verbose: true
# viewport_width: 800
# viewport_height: 600
# provider: "JinaCrawler"
# config: {}
# provider: "DoclingCrawler"
# config: {}
vector_db:
provider: "Milvus"
config:
default_collection: "deepsearcher"
uri: "http://localhost:19530"
token: "root:Milvus"
db: "default"
# vector_db:
# provider: "OracleDB"
# config:
# default_collection: "deepsearcher"
# user: ""
# password: ""
# dsn: ""
# config_dir: ""
# wallet_location: ""
# wallet_password: ""
# vector_db:
# provider: "Qdrant"
# config:
# default_collection: "deepsearcher"
# host: "localhost"
# port: 6333
query_settings:
max_iter: 2
load_settings:
chunk_size: 1024
chunk_overlap: 128

240
deepsearcher/configuration.py

@ -0,0 +1,240 @@
import os
from typing import Literal
import yaml
from deepsearcher.agent import ChainOfRAG, DeepSearch, NaiveRAG
from deepsearcher.agent.rag_router import RAGRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.loader.file_loader.base import BaseLoader
from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.vector_db.base import BaseVectorDB
current_dir = os.path.dirname(os.path.abspath(__file__))
DEFAULT_CONFIG_YAML_PATH = os.path.join(current_dir, "config.yaml")
FeatureType = Literal["llm", "embedding", "file_loader", "web_crawler", "vector_db"]
class Configuration:
"""
Configuration class for DeepSearcher.
This class manages the configuration settings for various components of the DeepSearcher system,
including LLM providers, embedding models, file loaders, web crawlers, and vector databases.
It loads configurations from a YAML file and provides methods to get and set provider configurations.
"""
def __init__(self, config_path: str = DEFAULT_CONFIG_YAML_PATH):
"""
Initialize the Configuration object.
Args:
config_path: Path to the configuration YAML file. Defaults to the config.yaml bundled with the deepsearcher package.
"""
# Initialize default configurations
config_data = self.load_config_from_yaml(config_path)
self.provide_settings = config_data["provide_settings"]
self.query_settings = config_data["query_settings"]
self.load_settings = config_data["load_settings"]
def load_config_from_yaml(self, config_path: str):
"""
Load configuration from a YAML file.
Args:
config_path: Path to the configuration YAML file.
Returns:
The loaded configuration data as a dictionary.
"""
with open(config_path, "r") as file:
return yaml.safe_load(file)
def set_provider_config(self, feature: FeatureType, provider: str, provider_configs: dict):
"""
Set the provider and its configurations for a given feature.
Args:
feature: The feature to configure (e.g., 'llm', 'file_loader', 'web_crawler').
provider: The provider name (e.g., 'openai', 'deepseek').
provider_configs: A dictionary with configurations specific to the provider.
Raises:
ValueError: If the feature is not supported.
"""
if feature not in self.provide_settings:
raise ValueError(f"Unsupported feature: {feature}")
self.provide_settings[feature]["provider"] = provider
self.provide_settings[feature]["config"] = provider_configs
def get_provider_config(self, feature: FeatureType):
"""
Get the current provider and configuration for a given feature.
Args:
feature: The feature to retrieve (e.g., 'llm', 'file_loader', 'web_crawler').
Returns:
A dictionary with provider and its configurations.
Raises:
ValueError: If the feature is not supported.
"""
if feature not in self.provide_settings:
raise ValueError(f"Unsupported feature: {feature}")
return self.provide_settings[feature]
class ModuleFactory:
"""
Factory class for creating instances of various modules in the DeepSearcher system.
This class creates instances of LLMs, embedding models, file loaders, web crawlers,
and vector databases based on the configuration settings.
"""
def __init__(self, config: Configuration):
"""
Initialize the ModuleFactory.
Args:
config: The Configuration object containing provider settings.
"""
self.config = config
def _create_module_instance(self, feature: FeatureType, module_name: str):
"""
Create an instance of a module based on the feature and module name.
Args:
feature: The feature type (e.g., 'llm', 'embedding').
module_name: The module name to import from.
Returns:
An instance of the specified module.
"""
# e.g.
# feature = "file_loader"
# module_name = "deepsearcher.loader.file_loader"
class_name = self.config.provide_settings[feature]["provider"]
module = __import__(module_name, fromlist=[class_name])
class_ = getattr(module, class_name)
return class_(**self.config.provide_settings[feature]["config"])
def create_llm(self) -> BaseLLM:
"""
Create an instance of a language model.
Returns:
An instance of a BaseLLM implementation.
"""
return self._create_module_instance("llm", "deepsearcher.llm")
def create_embedding(self) -> BaseEmbedding:
"""
Create an instance of an embedding model.
Returns:
An instance of a BaseEmbedding implementation.
"""
return self._create_module_instance("embedding", "deepsearcher.embedding")
def create_file_loader(self) -> BaseLoader:
"""
Create an instance of a file loader.
Returns:
An instance of a BaseLoader implementation.
"""
return self._create_module_instance("file_loader", "deepsearcher.loader.file_loader")
def create_web_crawler(self) -> BaseCrawler:
"""
Create an instance of a web crawler.
Returns:
An instance of a BaseCrawler implementation.
"""
return self._create_module_instance("web_crawler", "deepsearcher.loader.web_crawler")
def create_vector_db(self) -> BaseVectorDB:
"""
Create an instance of a vector database.
Returns:
An instance of a BaseVectorDB implementation.
"""
return self._create_module_instance("vector_db", "deepsearcher.vector_db")
config = Configuration()
module_factory: ModuleFactory = None
llm: BaseLLM = None
embedding_model: BaseEmbedding = None
file_loader: BaseLoader = None
vector_db: BaseVectorDB = None
web_crawler: BaseCrawler = None
default_searcher: RAGRouter = None
naive_rag: NaiveRAG = None
def init_config(config: Configuration):
"""
Initialize the global configuration and create instances of all required modules.
This function initializes the global variables for the LLM, embedding model,
file loader, web crawler, vector database, and RAG agents.
Args:
config: The Configuration object to use for initialization.
"""
global \
module_factory, \
llm, \
embedding_model, \
file_loader, \
vector_db, \
web_crawler, \
default_searcher, \
naive_rag
module_factory = ModuleFactory(config)
llm = module_factory.create_llm()
embedding_model = module_factory.create_embedding()
file_loader = module_factory.create_file_loader()
web_crawler = module_factory.create_web_crawler()
vector_db = module_factory.create_vector_db()
default_searcher = RAGRouter(
llm=llm,
rag_agents=[
DeepSearch(
llm=llm,
embedding_model=embedding_model,
vector_db=vector_db,
max_iter=config.query_settings["max_iter"],
route_collection=True,
text_window_splitter=True,
),
ChainOfRAG(
llm=llm,
embedding_model=embedding_model,
vector_db=vector_db,
max_iter=config.query_settings["max_iter"],
route_collection=True,
text_window_splitter=True,
),
],
)
naive_rag = NaiveRAG(
llm=llm,
embedding_model=embedding_model,
vector_db=vector_db,
top_k=10,
route_collection=True,
text_window_splitter=True,
)
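# Typical initialization flow (see deepsearcher/cli.py); the query text below is a placeholder:
#
#   from deepsearcher.configuration import Configuration, init_config
#   from deepsearcher.online_query import query
#
#   init_config(config=Configuration())
#   answer, refs, tokens = query("What is DeepSearcher?", max_iter=2)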

5
deepsearcher/embedding/__init__.py

@ -0,0 +1,5 @@
from .openai_embedding import OpenAIEmbedding
__all__ = [
"OpenAIEmbedding",
]

76
deepsearcher/embedding/base.py

@ -0,0 +1,76 @@
from typing import List
from tqdm import tqdm
from deepsearcher.loader.splitter import Chunk
class BaseEmbedding:
"""
Abstract base class for embedding model implementations.
This class defines the interface for embedding model implementations,
including methods for embedding queries and documents, and a property
for the dimensionality of the embeddings.
"""
def embed_query(self, text: str) -> List[float]:
"""
Embed a single query text.
Args:
text: The query text to embed.
Returns:
A list of floats representing the embedding vector.
"""
pass
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""
Embed a list of document texts.
This default implementation calls embed_query for each text,
but implementations may override this with a more efficient batch method.
Args:
texts: A list of document texts to embed.
Returns:
A list of embedding vectors, one for each input text.
"""
return [self.embed_query(text) for text in texts]
def embed_chunks(self, chunks: List[Chunk], batch_size: int = 256) -> List[Chunk]:
"""
Embed a list of Chunk objects.
This method extracts the text from each chunk, embeds it in batches,
and updates the chunks with their embeddings.
Args:
chunks: A list of Chunk objects to embed.
batch_size: The number of chunks to process in each batch.
Returns:
The input list of Chunk objects, updated with embeddings.
"""
texts = [chunk.text for chunk in chunks]
batch_texts = [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)]
embeddings = []
for batch_text in tqdm(batch_texts, desc="Embedding chunks"):
batch_embeddings = self.embed_documents(batch_text)
embeddings.extend(batch_embeddings)
for chunk, embedding in zip(chunks, embeddings):
chunk.embedding = embedding
return chunks
@property
def dimension(self) -> int:
"""
Get the dimensionality of the embeddings.
Returns:
The number of dimensions in the embedding vectors.
"""
pass

103
deepsearcher/embedding/openai_embedding.py

@ -0,0 +1,103 @@
import os
from typing import List
from openai import OpenAI
from openai._types import NOT_GIVEN
from deepsearcher.embedding.base import BaseEmbedding
class OpenAIEmbedding(BaseEmbedding):
"""
OpenAI embedding model implementation.
This class provides an interface to the OpenAI embedding API, which offers
various embedding models for text processing.
For more information, see:
https://platform.openai.com/docs/guides/embeddings/use-cases
"""
def __init__(self, model: str, **kwargs):
"""
Initialize the OpenAI embedding model.
Args:
model (str): The model identifier to use for embeddings.
**kwargs: Additional keyword arguments.
- api_key (str): The API key.
- base_url (str): The base URL.
- model_name (str): Alternative way to specify the model.
- dimension (int): The dimension of the embedding vectors.
- dim_change (bool): Whether the model supports changing the dimension of the generated embeddings.
"""
# Extract standard parameters (keep original behavior)
if "api_key" in kwargs:
api_key = kwargs.pop("api_key")
if "base_url" in kwargs:
base_url = kwargs.pop("base_url")
else:
base_url = os.getenv("OPENAI_BASE_URL")
if "model_name" in kwargs:
model = kwargs.pop("model_name")
if "dimension" in kwargs:
dimension = kwargs.pop("dimension")
else:
dimension = NOT_GIVEN
if "dim_change" in kwargs:
dim_change = kwargs.pop("dim_change")
self.dim = dimension
self.dim_change = dim_change
self.model = model
self.client = OpenAI(api_key=api_key, base_url=base_url, **kwargs)
def embed_query(self, text: str) -> List[float]:
"""
Embed a single query text.
Args:
text (str): The query text to embed.
Returns:
List[float]: A list of floats representing the embedding vector.
"""
response = self.client.embeddings.create(
input=[text], model=self.model, dimensions=self.dimension if self.dim_change is True else NOT_GIVEN
)
return response.data[0].embedding
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""
Embed a list of document texts.
Args:
texts (List[str]): A list of document texts to embed.
Returns:
List[List[float]]: A list of embedding vectors, one for each input text.
"""
response = self.client.embeddings.create(
input=texts, model=self.model, dimensions=self.dimension if self.dim_change is True else NOT_GIVEN
)
return [r.embedding for r in response.data]
@property
def dimension(self) -> int:
"""
Get the dimensionality of the embeddings for the current model.
Returns:
int: The number of dimensions in the embedding vectors.
"""
return self.dim
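# Illustrative usage sketch; the values mirror the defaults in deepsearcher/config.yaml:
#
#   embedding = OpenAIEmbedding(
#       model="Qwen/Qwen3-Embedding-0.6B",
#       api_key="empty",
#       base_url="http://localhost:8001/v1",
#       dimension=1024,
#       dim_change=False,
#   )
#   vector = embedding.embed_query("hello world")  # -> list of floats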

5
deepsearcher/llm/__init__.py

@ -0,0 +1,5 @@
from .openai_llm import OpenAILLM
__all__ = [
"OpenAILLM",
]

120
deepsearcher/llm/base.py

@ -0,0 +1,120 @@
import ast
import re
from abc import ABC
from typing import Dict, List
class ChatResponse(ABC):
"""
Represents a response from a chat model.
This class encapsulates the content of a response from a chat model
along with information about token usage.
Attributes:
content: The text content of the response.
total_tokens: The total number of tokens used in the request and response.
"""
def __init__(self, content: str, total_tokens: int) -> None:
"""
Initialize a ChatResponse object.
Args:
content: The text content of the response.
total_tokens: The total number of tokens used in the request and response.
"""
self.content = content
self.total_tokens = total_tokens
def __repr__(self) -> str:
"""
Return a string representation of the ChatResponse.
Returns:
A string representation of the ChatResponse object.
"""
return f"ChatResponse(content={self.content}, total_tokens={self.total_tokens})"
class BaseLLM(ABC):
"""
Abstract base class for language model implementations.
This class defines the interface for language model implementations,
including methods for chat-based interactions and parsing responses.
"""
def __init__(self):
"""
Initialize a BaseLLM object.
"""
pass
def chat(self, messages: List[Dict]) -> ChatResponse:
"""
Send a chat message to the language model and get a response.
Args:
messages: A list of message dictionaries, typically in the format
[{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
Returns:
A ChatResponse object containing the model's response.
"""
pass
@staticmethod
def literal_eval(response_content: str):
"""
Parse a string response into a Python object using ast.literal_eval.
This method attempts to extract and parse JSON or Python literals from the response content,
handling various formats like code blocks and special tags.
Args:
response_content: The string content to parse.
Returns:
The parsed Python object.
Raises:
ValueError: If the response content cannot be parsed.
"""
response_content = response_content.strip()
response_content = BaseLLM.remove_think(response_content)
try:
if response_content.startswith("```") and response_content.endswith("```"):
if response_content.startswith("```python"):
response_content = response_content[9:-3]
elif response_content.startswith("```json"):
response_content = response_content[7:-3]
elif response_content.startswith("```str"):
response_content = response_content[6:-3]
elif response_content.startswith("```\n"):
response_content = response_content[4:-3]
else:
raise ValueError("Invalid code block format")
result = ast.literal_eval(response_content.strip())
except Exception:
matches = re.findall(r"(\[.*?\]|\{.*?\})", response_content, re.DOTALL)
if len(matches) != 1:
raise ValueError(
f"Invalid JSON/List format for response content:\n{response_content}"
)
json_part = matches[0]
return ast.literal_eval(json_part)
return result
@staticmethod
def remove_think(response_content: str) -> str:
# remove content between <think> and </think>, especial for reasoning model
if "<think>" in response_content and "</think>" in response_content:
end_of_think = response_content.find("</think>") + len("</think>")
response_content = response_content[end_of_think:]
return response_content.strip()
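# Illustrative example: literal_eval strips <think> blocks and code fences before parsing
# the remaining Python/JSON literal.
#
#   BaseLLM.literal_eval('```json\n["collection_a", "collection_b"]\n```')
#   # -> ["collection_a", "collection_b"]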

61
deepsearcher/llm/openai_llm.py

@ -0,0 +1,61 @@
import os
from typing import Dict, List
from deepsearcher.llm.base import BaseLLM, ChatResponse
class OpenAILLM(BaseLLM):
"""
OpenAI language model implementation.
This class provides an interface to interact with OpenAI's language models
through their API.
Attributes:
model (str): The OpenAI model identifier to use.
client: The OpenAI client instance.
"""
def __init__(self, model: str = "o1-mini", **kwargs):
"""
Initialize an OpenAI language model client.
Args:
model (str, optional): The model identifier to use. Defaults to "o1-mini".
**kwargs: Additional keyword arguments to pass to the OpenAI client.
- api_key: OpenAI API key. If not provided, uses OPENAI_API_KEY environment variable.
- base_url: OpenAI API base URL. If not provided, uses OPENAI_BASE_URL environment variable.
"""
from openai import OpenAI
self.model = model
if "api_key" in kwargs:
api_key = kwargs.pop("api_key")
else:
api_key = os.getenv("OPENAI_API_KEY")
if "base_url" in kwargs:
base_url = kwargs.pop("base_url")
else:
base_url = os.getenv("OPENAI_BASE_URL")
self.client = OpenAI(api_key=api_key, base_url=base_url, **kwargs)
def chat(self, messages: List[Dict]) -> ChatResponse:
"""
Send a chat message to the OpenAI model and get a response.
Args:
messages (List[Dict]): A list of message dictionaries, typically in the format
[{"role": "system", "content": "..."},
{"role": "user", "content": "..."}]
Returns:
ChatResponse: An object containing the model's response and token usage information.
"""
completion = self.client.chat.completions.create(
model=self.model,
messages=messages,
)
return ChatResponse(
content=completion.choices[0].message.content,
total_tokens=completion.usage.total_tokens,
)

0
deepsearcher/loader/__init__.py

7
deepsearcher/loader/file_loader/__init__.py

@ -0,0 +1,7 @@
from deepsearcher.loader.file_loader.docling_loader import DoclingLoader
from deepsearcher.loader.file_loader.json_loader import JsonFileLoader
from deepsearcher.loader.file_loader.pdf_loader import PDFLoader
from deepsearcher.loader.file_loader.text_loader import TextLoader
from deepsearcher.loader.file_loader.unstructured_loader import UnstructuredLoader
__all__ = ["PDFLoader", "TextLoader", "UnstructuredLoader", "JsonFileLoader", "DoclingLoader"]

70
deepsearcher/loader/file_loader/base.py

@ -0,0 +1,70 @@
import os
from abc import ABC
from typing import List
from langchain_core.documents import Document
class BaseLoader(ABC):
"""
Abstract base class for file loaders.
This class defines the interface for loading documents from files and directories.
All specific file loaders should inherit from this class and implement the required methods.
"""
def __init__(self, **kwargs):
"""
Initialize the loader with optional keyword arguments.
Args:
**kwargs: Optional keyword arguments for specific loader implementations.
"""
pass
def load_file(self, file_path: str) -> List[Document]:
"""
Load a single file and convert it to Document objects.
Args:
file_path: Path to the file to be loaded.
Returns:
A list of Document objects containing the text and metadata.
Note:
Return a list of Document objects which contain the text and metadata.
In the metadata, it's recommended to include the reference to the file.
e.g. return [Document(page_content=..., metadata={"reference": file_path})]
"""
pass
def load_directory(self, directory: str) -> List[Document]:
"""
Load all supported files from a directory and its subdirectories recursively.
Args:
directory: Path to the directory containing files to be loaded.
Returns:
A list of Document objects from all supported files in the directory and subdirectories.
"""
documents = []
for root, _, files in os.walk(directory):
for file in files:
for suffix in self.supported_file_types:
if file.endswith(suffix):
full_path = os.path.join(root, file)
documents.extend(self.load_file(full_path))
break
return documents
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions (without the dot).
"""
pass

117
deepsearcher/loader/file_loader/docling_loader.py

@ -0,0 +1,117 @@
import os
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
from deepsearcher.utils import log
class DoclingLoader(BaseLoader):
"""
Loader that utilizes Docling's DocumentConverter and HierarchicalChunker
to convert and chunk files (e.g. Markdown or HTML) into Document objects.
"""
def __init__(self):
"""
Initialize the DoclingLoader with DocumentConverter and HierarchicalChunker instances.
"""
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker
self.converter = DocumentConverter()
self.chunker = HierarchicalChunker()
def load_file(self, file_path: str) -> List[Document]:
"""
Load a local file (or URL) using docling's conversion and perform hierarchical chunking.
Args:
file_path: Path or URL of the file to be loaded.
Returns:
A list of Document objects, each representing a chunk.
Raises:
FileNotFoundError: If the file does not exist.
ValueError: If the file type is not supported.
IOError: If there is an error reading the file.
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"Error: File '{file_path}' does not exist.")
# Check if the file has a supported extension
file_extension = os.path.splitext(file_path)[1].lower().lstrip(".")
if file_extension not in self.supported_file_types:
supported_formats = ", ".join(self.supported_file_types)
raise ValueError(
f"Unsupported file type: '{file_extension}'. "
f"Supported file types are: {supported_formats}"
)
try:
conversion_result = self.converter.convert(file_path)
docling_document = conversion_result.document
chunks = list(self.chunker.chunk(docling_document))
documents = []
for chunk in chunks:
metadata = {"reference": file_path, "text": chunk.text}
documents.append(Document(page_content=chunk.text, metadata=metadata))
return documents
except Exception as e:
log.color_print(f"Error processing file {file_path}: {str(e)}")
raise IOError(f"Failed to process file {file_path}: {str(e)}")
def load_directory(self, directory: str) -> List[Document]:
"""
Load all supported files from a directory.
Args:
directory: Path to the directory containing files to be loaded.
Returns:
A list of Document objects from all supported files in the directory.
Raises:
NotADirectoryError: If the specified path is not a directory.
"""
if not os.path.isdir(directory):
raise NotADirectoryError(f"Error: '{directory}' is not a directory.")
return super().load_directory(directory)
@property
def supported_file_types(self) -> List[str]:
"""
Return the list of file extensions supported by this loader.
Supported formats (refer to the official website: https://docling-project.github.io/docling/usage/supported_formats/):
- PDF
- Office formats: DOCX, XLSX, PPTX
- Markdown
- AsciiDoc
- HTML, XHTML
- CSV
- Images: PNG, JPEG, TIFF, BMP
"""
return [
"pdf",
"docx",
"xlsx",
"pptx",
"md",
"adoc",
"asciidoc",
"html",
"xhtml",
"csv",
"png",
"jpg",
"jpeg",
"tif",
"tiff",
"bmp",
]
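A rough usage sketch (assumes the docling packages are installed; the file path is hypothetical):

# Sketch: convert and chunk a local file with DoclingLoader.
from deepsearcher.loader.file_loader import DoclingLoader

loader = DoclingLoader()
chunks = loader.load_file("./docs/example_report.pdf")  # hypothetical path
print(len(chunks), "chunks; first reference:", chunks[0].metadata["reference"] if chunks else None)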

94
deepsearcher/loader/file_loader/json_loader.py

@ -0,0 +1,94 @@
import json
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
class JsonFileLoader(BaseLoader):
"""
Loader for JSON and JSONL files.
This loader handles JSON and JSONL files, extracting text content from a specified key
and converting each entry into Document objects for further processing.
"""
def __init__(self, text_key: str):
"""
Initialize the JsonFileLoader.
Args:
text_key: The key in the JSON data that contains the text content to be extracted.
"""
self.text_key = text_key
def load_file(self, file_path: str) -> List[Document]:
"""
Load a JSON or JSONL file and convert it to Document objects.
Args:
file_path: Path to the JSON or JSONL file to be loaded.
Returns:
A list of Document objects, one for each entry in the JSON/JSONL file.
"""
if file_path.endswith(".jsonl"):
data_list: list[dict] = self._read_jsonl_file(file_path)
else:
data_list: list[dict] = self._read_json_file(file_path)
documents = []
for data_dict in data_list:
page_content = data_dict.pop(self.text_key)
data_dict.update({"reference": file_path})
document = Document(page_content=page_content, metadata=data_dict)
documents.append(document)
return documents
def _read_json_file(self, file_path: str) -> list[dict]:
"""
Read and parse a JSON file.
Args:
file_path: Path to the JSON file.
Returns:
A list of dictionaries parsed from the JSON file.
Raises:
ValueError: If the JSON file does not contain a list of dictionaries.
"""
with open(file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)
if not isinstance(json_data, list):
raise ValueError("JSON file must contain a list of dictionaries.")
return json_data
def _read_jsonl_file(self, file_path: str) -> List[dict]:
"""
Read and parse a JSONL file (JSON Lines format).
Args:
file_path: Path to the JSONL file.
Returns:
A list of dictionaries parsed from the JSONL file.
"""
data_list = []
with open(file_path, "r", encoding="utf-8") as file:
for line in file:
try:
json_data = json.loads(line)
data_list.append(json_data)
except json.JSONDecodeError:
print(f"Failed to decode line: {line}")
return data_list
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions: ["json", "jsonl"].
"""
return ["json", "jsonl"]
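A short usage sketch, assuming each JSONL record keeps its body under a "content" key (both the key name and the path are illustrative):

# Sketch: load a JSONL file whose text lives under the "content" key.
from deepsearcher.loader.file_loader import JsonFileLoader

loader = JsonFileLoader(text_key="content")  # assumed key name
docs = loader.load_file("./data/articles.jsonl")  # hypothetical path
for doc in docs[:3]:
    print(doc.metadata["reference"], len(doc.page_content))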

54
deepsearcher/loader/file_loader/pdf_loader.py

@ -0,0 +1,54 @@
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
class PDFLoader(BaseLoader):
"""
Loader for PDF files.
This loader handles PDF files and also supports text files with extensions like .txt and .md,
converting them into Document objects for further processing.
"""
def __init__(self):
"""
Initialize the PDFLoader.
"""
pass
def load_file(self, file_path: str) -> List[Document]:
"""
Load a PDF file and convert it to a Document object.
Args:
file_path: Path to the PDF file to be loaded.
Returns:
A list containing a single Document object with the file content and reference.
Note:
This loader also supports .txt and .md files for convenience.
"""
import pdfplumber
if file_path.endswith(".pdf"):
with pdfplumber.open(file_path) as file:
page_content = "\n\n".join([(page.extract_text() or "") for page in file.pages])
return [Document(page_content=page_content, metadata={"reference": file_path})]
elif file_path.endswith(".txt") or file_path.endswith(".md"):
with open(file_path, "r", encoding="utf-8") as file:
page_content = file.read()
return [Document(page_content=page_content, metadata={"reference": file_path})]
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions: ["pdf", "md", "txt"].
"""
return ["pdf", "md", "txt"]
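A minimal usage sketch (requires pdfplumber; the path is illustrative):

# Sketch: extract the text of a PDF into a single Document.
from deepsearcher.loader.file_loader import PDFLoader

loader = PDFLoader()
docs = loader.load_file("./papers/sample.pdf")  # hypothetical path
print(docs[0].metadata["reference"], docs[0].page_content[:200])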

43
deepsearcher/loader/file_loader/text_loader.py

@ -0,0 +1,43 @@
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
class TextLoader(BaseLoader):
"""
Loader for plain text files.
This loader handles text files with extensions like .txt and .md,
converting them into Document objects for further processing.
"""
def __init__(self):
"""
Initialize the TextLoader.
"""
pass
def load_file(self, file_path: str) -> List[Document]:
"""
Load a text file and convert it to a Document object.
Args:
file_path: Path to the text file to be loaded.
Returns:
A list containing a single Document object with the file content and reference.
"""
with open(file_path, "r", encoding="utf-8") as f:
return [Document(page_content=f.read(), metadata={"reference": file_path})]
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions: ["txt", "md"].
"""
return ["txt", "md"]

201
deepsearcher/loader/file_loader/unstructured_loader.py

@ -0,0 +1,201 @@
import os
import shutil
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
from deepsearcher.utils import log
class UnstructuredLoader(BaseLoader):
"""
Loader for unstructured documents using the unstructured-io library.
This loader processes various document formats using the unstructured-io library's
processing pipeline, extracting text and metadata from complex document formats.
"""
def __init__(self):
"""
Initialize the UnstructuredLoader.
Creates a temporary directory for processed outputs and cleans up any existing ones.
"""
self.directory_with_results = "./pdf_processed_outputs"
if os.path.exists(self.directory_with_results):
shutil.rmtree(self.directory_with_results)
os.makedirs(self.directory_with_results)
def load_pipeline(self, input_path: str) -> List[Document]:
"""
Process documents using the unstructured-io pipeline.
Args:
input_path: Path to the file or directory to be processed.
Returns:
A list of Document objects extracted from the processed files.
Note:
If UNSTRUCTURED_API_KEY and UNSTRUCTURED_API_URL environment variables are set,
the API-based partitioning will be used. Otherwise, local partitioning will be used.
"""
from unstructured_ingest.interfaces import ProcessorConfig
from unstructured_ingest.pipeline.pipeline import Pipeline
from unstructured_ingest.processes.connectors.local import (
LocalConnectionConfig,
LocalDownloaderConfig,
LocalIndexerConfig,
LocalUploaderConfig,
)
from unstructured_ingest.processes.partitioner import PartitionerConfig
# Check if API environment variables are set
api_key = os.getenv("UNSTRUCTURED_API_KEY")
api_url = os.getenv("UNSTRUCTURED_API_URL")
use_api = api_key is not None and api_url is not None
if use_api:
log.color_print("Using Unstructured API for document processing")
else:
log.color_print(
"Using local processing for documents (UNSTRUCTURED_API_KEY or UNSTRUCTURED_API_URL not set)"
)
Pipeline.from_configs(
context=ProcessorConfig(),
indexer_config=LocalIndexerConfig(input_path=input_path),
downloader_config=LocalDownloaderConfig(),
source_connection_config=LocalConnectionConfig(),
partitioner_config=PartitionerConfig(
partition_by_api=use_api,
api_key=api_key,
partition_endpoint=api_url,
strategy="hi_res",
),
uploader_config=LocalUploaderConfig(output_dir=self.directory_with_results),
).run()
from unstructured.staging.base import elements_from_json
elements = []
for filename in os.listdir(self.directory_with_results):
if filename.endswith(".json"):
file_path = os.path.join(self.directory_with_results, filename)
try:
elements.extend(elements_from_json(filename=file_path))
except IOError:
log.color_print(f"Error: Could not read file {filename}.")
documents = []
for element in elements:
metadata = element.metadata.to_dict()
metadata["reference"] = input_path # TODO test it
documents.append(
Document(
page_content=element.text,
metadata=metadata,
)
)
return documents
def load_file(self, file_path: str) -> List[Document]:
"""
Load a single file using the unstructured-io pipeline.
Args:
file_path: Path to the file to be processed.
Returns:
A list of Document objects extracted from the processed file.
"""
return self.load_pipeline(file_path)
def load_directory(self, directory: str) -> List[Document]:
"""
Load all supported files from a directory using the unstructured-io pipeline.
Args:
directory: Path to the directory containing files to be processed.
Returns:
A list of Document objects extracted from all processed files.
"""
return self.load_pipeline(directory)
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by the unstructured-io library. Please refer to the Unstructured documentation for more details: https://docs.unstructured.io/ui/supported-file-types.
Returns:
A comprehensive list of supported file extensions.
Note:
The unstructured-io library supports a wide range of document formats
including office documents, images, emails, and more.
"""
return [
"abw",
"bmp",
"csv",
"cwk",
"dbf",
"dif",
"doc",
"docm",
"docx",
"dot",
"dotm",
"eml",
"epub",
"et",
"eth",
"fods",
"gif",
"heic",
"htm",
"html",
"hwp",
"jpeg",
"jpg",
"md",
"mcw",
"mw",
"odt",
"org",
"p7s",
"pages",
"pbd",
"pdf",
"png",
"pot",
"potm",
"ppt",
"pptm",
"pptx",
"prn",
"rst",
"rtf",
"sdp",
"sgl",
"svg",
"sxg",
"tiff",
"txt",
"tsv",
"uof",
"uos1",
"uos2",
"web",
"webp",
"wk2",
"xls",
"xlsb",
"xlsm",
"xlsx",
"xlw",
"xml",
"zabw",
]
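A usage sketch (the unstructured-ingest dependencies must be installed; the directory is hypothetical). Setting both UNSTRUCTURED_API_KEY and UNSTRUCTURED_API_URL routes partitioning through the hosted API; otherwise processing stays local, as in the pipeline above:

# Sketch: run the unstructured-io pipeline over a local directory.
from deepsearcher.loader.file_loader import UnstructuredLoader

loader = UnstructuredLoader()  # creates ./pdf_processed_outputs for intermediate JSON
docs = loader.load_directory("./raw_documents")  # hypothetical directory
print(len(docs), "elements extracted")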

105
deepsearcher/loader/splitter.py

@ -0,0 +1,105 @@
## Sentence Window splitting strategy, ref:
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
class Chunk:
"""
Represents a chunk of text with associated metadata and embedding.
A chunk is a segment of text extracted from a document, along with its reference
information, metadata, and optional embedding vector.
Attributes:
text: The text content of the chunk.
reference: A reference to the source of the chunk (e.g., file path, URL).
metadata: Additional metadata associated with the chunk.
embedding: The vector embedding of the chunk, if available.
"""
def __init__(
self,
text: str,
reference: str,
metadata: dict = None,
embedding: List[float] = None,
):
"""
Initialize a Chunk object.
Args:
text: The text content of the chunk.
reference: A reference to the source of the chunk.
metadata: Additional metadata associated with the chunk. Defaults to an empty dict.
embedding: The vector embedding of the chunk. Defaults to None.
"""
self.text = text
self.reference = reference
self.metadata = metadata or {}
self.embedding = embedding or None
def _sentence_window_split(
split_docs: List[Document], original_document: Document, offset: int = 200
) -> List[Chunk]:
"""
Create chunks with context windows from split documents.
This function takes documents that have been split into smaller pieces and
adds context from the original document by including text before and after
each split piece, up to the specified offset.
Args:
split_docs: List of documents that have been split.
original_document: The original document before splitting.
offset: Number of characters to include before and after each split piece.
Returns:
A list of Chunk objects with context windows.
"""
chunks = []
original_text = original_document.page_content
for doc in split_docs:
doc_text = doc.page_content
start_index = original_text.index(doc_text)
end_index = start_index + len(doc_text) - 1
wider_text = original_text[
max(0, start_index - offset) : min(len(original_text), end_index + offset)
]
reference = doc.metadata.pop("reference", "")
doc.metadata["wider_text"] = wider_text
chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata)
chunks.append(chunk)
return chunks
def split_docs_to_chunks(
documents: List[Document], chunk_size: int = 1500, chunk_overlap=100
) -> List[Chunk]:
"""
Split documents into chunks with context windows.
This function splits a list of documents into smaller chunks with overlapping text,
and adds context windows to each chunk by including text before and after the chunk.
Args:
documents: List of documents to split.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
Returns:
A list of Chunk objects with context windows.
"""
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
all_chunks = []
for doc in documents:
split_docs = text_splitter.split_documents([doc])
split_chunks = _sentence_window_split(split_docs, doc, offset=300)
all_chunks.extend(split_chunks)
return all_chunks
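A small sketch of the splitter on an in-memory document (the reference value is illustrative); each resulting Chunk carries a "wider_text" context window in its metadata:

# Sketch: split one Document and inspect the sentence-window metadata.
from langchain_core.documents import Document
from deepsearcher.loader.splitter import split_docs_to_chunks

doc = Document(page_content="Sentence one. " * 300, metadata={"reference": "notes.md"})
chunks = split_docs_to_chunks([doc], chunk_size=500, chunk_overlap=50)
print(len(chunks), "chunks; first window length:", len(chunks[0].metadata["wider_text"]))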

11
deepsearcher/loader/web_crawler/__init__.py

@ -0,0 +1,11 @@
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler
__all__ = [
"FireCrawlCrawler",
"JinaCrawler",
"Crawl4AICrawler",
"DoclingCrawler",
]

55
deepsearcher/loader/web_crawler/base.py

@ -0,0 +1,55 @@
from abc import ABC
from typing import List
from langchain_core.documents import Document
class BaseCrawler(ABC):
"""
Abstract base class for web crawlers.
This class defines the interface for crawling web pages and converting them
into Document objects for further processing.
"""
def __init__(self, **kwargs):
"""
Initialize the crawler with optional keyword arguments.
Args:
**kwargs: Optional keyword arguments for specific crawler implementations.
"""
pass
def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
"""
Crawl a single URL and convert it to Document objects.
Args:
url: The URL to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects containing the content and metadata from the URL.
Note:
Implementations should include the URL reference in the metadata.
e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
"""
pass
def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
"""
Crawl multiple URLs and return a list of Document objects.
Args:
urls: A list of URLs to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects containing the content and metadata from all URLs.
"""
documents = []
for url in urls:
documents.extend(self.crawl_url(url, **crawl_kwargs))
return documents

140
deepsearcher/loader/web_crawler/crawl4ai_crawler.py

@ -0,0 +1,140 @@
import asyncio
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log
class Crawl4AICrawler(BaseCrawler):
"""
Web crawler using the Crawl4AI library.
This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
into markdown format for further processing. It supports both single-page crawling
and batch crawling of multiple pages.
"""
def __init__(self, **kwargs):
"""
Initialize the Crawl4AICrawler.
Args:
**kwargs: Optional keyword arguments.
browser_config: Configuration for the browser used by Crawl4AI.
"""
super().__init__(**kwargs)
self.crawler = None # Lazy init
self.browser_config = kwargs.get("browser_config", None)
def _lazy_init(self):
"""
Initialize the crawler lazily when needed.
This method creates the AsyncWebCrawler instance with the provided browser configuration
only when it's first needed, to avoid unnecessary initialization.
"""
from crawl4ai import AsyncWebCrawler, BrowserConfig
if self.crawler is None:
config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
self.crawler = AsyncWebCrawler(config=config)
async def _async_crawl(self, url: str) -> Document:
"""
Asynchronously crawl a single URL.
Args:
url: The URL to crawl.
Returns:
A Document object with the markdown content and metadata from the URL.
"""
if self.crawler is None:
self._lazy_init()
async with self.crawler as crawler:
result = await crawler.arun(url)
markdown_content = result.markdown or ""
metadata = {
"reference": url,
"success": result.success,
"status_code": result.status_code,
"media": result.media,
"links": result.links,
}
if hasattr(result, "metadata") and result.metadata:
metadata["title"] = result.metadata.get("title", "")
metadata["author"] = result.metadata.get("author", "")
return Document(page_content=markdown_content, metadata=metadata)
def crawl_url(self, url: str) -> List[Document]:
"""
Crawl a single URL.
Args:
url: The URL to crawl.
Returns:
A list containing a single Document object with the markdown content and metadata,
or an empty list if an error occurs.
"""
try:
document = asyncio.run(self._async_crawl(url))
return [document]
except Exception as e:
log.error(f"Error during crawling {url}: {e}")
return []
async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
"""
Asynchronously crawl multiple URLs.
Args:
urls: A list of URLs to crawl.
Returns:
A list of Document objects with the markdown content and metadata from all URLs.
"""
if self.crawler is None:
self._lazy_init()
async with self.crawler as crawler:
results = await crawler.arun_many(urls)
documents = []
for result in results:
markdown_content = result.markdown or ""
metadata = {
"reference": result.url,
"success": result.success,
"status_code": result.status_code,
"media": result.media,
"links": result.links,
}
if hasattr(result, "metadata") and result.metadata:
metadata["title"] = result.metadata.get("title", "")
metadata["author"] = result.metadata.get("author", "")
documents.append(Document(page_content=markdown_content, metadata=metadata))
return documents
def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
"""
Crawl multiple URLs.
Args:
urls: A list of URLs to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects with the markdown content and metadata from all URLs,
or an empty list if an error occurs.
"""
try:
return asyncio.run(self._async_crawl_many(urls))
except Exception as e:
log.error(f"Error during crawling {urls}: {e}")
return []
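A usage sketch (requires the crawl4ai package and a working browser backend; the URL is illustrative):

# Sketch: crawl a page and inspect the crawl metadata.
from deepsearcher.loader.web_crawler import Crawl4AICrawler

crawler = Crawl4AICrawler()
docs = crawler.crawl_urls(["https://example.com"])  # illustrative URL
for d in docs:
    print(d.metadata["reference"], d.metadata.get("status_code"), len(d.page_content))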

98
deepsearcher/loader/web_crawler/docling_crawler.py

@ -0,0 +1,98 @@
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log
class DoclingCrawler(BaseCrawler):
"""
Web crawler using Docling's DocumentConverter and HierarchicalChunker.
This crawler leverages Docling's capabilities to convert web pages into structured
documents and chunk them appropriately for further processing.
"""
def __init__(self, **kwargs):
"""
Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.
Args:
**kwargs: Optional keyword arguments.
"""
super().__init__(**kwargs)
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker
self.converter = DocumentConverter()
self.chunker = HierarchicalChunker()
def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
"""
Crawl a single URL using Docling's conversion and perform hierarchical chunking.
Args:
url: The URL to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects, each representing a chunk from the crawled URL.
Raises:
IOError: If there is an error processing the URL.
"""
try:
# Use Docling to convert the URL to a document
conversion_result = self.converter.convert(url)
docling_document = conversion_result.document
# Chunk the document using hierarchical chunking
chunks = list(self.chunker.chunk(docling_document))
documents = []
for chunk in chunks:
metadata = {"reference": url, "text": chunk.text}
documents.append(Document(page_content=chunk.text, metadata=metadata))
return documents
except Exception as e:
log.color_print(f"Error processing URL {url}: {str(e)}")
raise IOError(f"Failed to process URL {url}: {str(e)}")
@property
def supported_file_types(self) -> List[str]:
"""
Return the list of file types and formats supported by Docling.
Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
- PDF
- Office formats: DOCX, XLSX, PPTX
- Markdown
- AsciiDoc
- HTML, XHTML
- CSV
- Images: PNG, JPEG, TIFF, BMP
Returns:
A list of file extensions supported by this crawler.
"""
return [
"pdf",
"docx",
"xlsx",
"pptx",
"md",
"adoc",
"asciidoc",
"html",
"xhtml",
"csv",
"png",
"jpg",
"jpeg",
"tif",
"tiff",
"bmp",
]

88
deepsearcher/loader/web_crawler/firecrawl_crawler.py

@ -0,0 +1,88 @@
import os
from typing import List, Optional
from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
class FireCrawlCrawler(BaseCrawler):
"""
Web crawler using the FireCrawl service.
This crawler uses the FireCrawl service to crawl web pages and convert them
into markdown format for further processing. It supports both single-page scraping
and recursive crawling of multiple pages.
"""
def __init__(self, **kwargs):
"""
Initialize the FireCrawlCrawler.
Args:
**kwargs: Optional keyword arguments.
"""
super().__init__(**kwargs)
self.app = None
def crawl_url(
self,
url: str,
max_depth: Optional[int] = None,
limit: Optional[int] = None,
allow_backward_links: Optional[bool] = None,
) -> List[Document]:
"""
Dynamically crawls a URL using either scrape_url or crawl_url:
- Uses scrape_url for single-page extraction if no params are provided.
- Uses crawl_url to recursively gather pages when any param is provided.
Args:
url (str): The starting URL to crawl.
max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
limit (Optional[int]): Maximum number of pages to crawl (default: 20).
allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).
Returns:
List[Document]: List of Document objects with page content and metadata.
"""
# Lazy init
self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
# If the user only provides the URL (no crawl params), scrape a single page.
if max_depth is None and limit is None and allow_backward_links is None:
# Call the new Firecrawl API, passing formats directly
scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
data = scrape_response.model_dump()
return [
Document(
page_content=data.get("markdown", ""),
metadata={"reference": url, **data.get("metadata", {})},
)
]
# Otherwise, crawl multiple pages based on the provided params, applying defaults where unset.
crawl_response = self.app.crawl_url(
url=url,
limit=limit or 20,
max_depth=max_depth or 2,
allow_backward_links=allow_backward_links or False,
scrape_options=ScrapeOptions(formats=["markdown"]),
poll_interval=5,
)
items = crawl_response.model_dump().get("data", [])
documents: List[Document] = []
for item in items:
# Support items that are either dicts or Pydantic sub-models
item_dict = item.model_dump() if hasattr(item, "model_dump") else item
md = item_dict.get("markdown", "")
meta = item_dict.get("metadata", {})
meta["reference"] = meta.get("url", url)
documents.append(Document(page_content=md, metadata=meta))
return documents
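A usage sketch, assuming FIRECRAWL_API_KEY is exported and the firecrawl package is installed (URL and limit are illustrative):

# Sketch: single-page scrape vs. recursive crawl with FireCrawlCrawler.
from deepsearcher.loader.web_crawler import FireCrawlCrawler

crawler = FireCrawlCrawler()
single = crawler.crawl_url("https://example.com")            # no params -> scrape one page
site = crawler.crawl_url("https://example.com", limit=5)     # any param -> recursive crawl
print(len(single), "page;", len(site), "pages")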

62
deepsearcher/loader/web_crawler/jina_crawler.py

@ -0,0 +1,62 @@
import os
from typing import List
import requests
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
class JinaCrawler(BaseCrawler):
"""
Web crawler using Jina AI's rendering service.
This crawler uses Jina AI's rendering service to crawl web pages and convert them
into markdown format for further processing.
"""
def __init__(self, **kwargs):
"""
Initialize the JinaCrawler.
Args:
**kwargs: Optional keyword arguments.
Raises:
ValueError: If the JINA_API_TOKEN environment variable is not set.
"""
super().__init__(**kwargs)
self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
if not self.jina_api_token:
raise ValueError("Missing JINA_API_TOKEN environment variable")
def crawl_url(self, url: str) -> List[Document]:
"""
Crawl a single URL using Jina AI's rendering service.
Args:
url: The URL to crawl.
Returns:
A list containing a single Document object with the markdown content and metadata.
Raises:
HTTPError: If the request to Jina AI's service fails.
"""
jina_url = f"https://r.jina.ai/{url}"
headers = {
"Authorization": f"Bearer {self.jina_api_token}",
"X-Return-Format": "markdown",
}
response = requests.get(jina_url, headers=headers)
response.raise_for_status()
markdown_content = response.text
metadata = {
"reference": url,
"status_code": response.status_code,
"headers": dict(response.headers),
}
return [Document(page_content=markdown_content, metadata=metadata)]
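A usage sketch, assuming JINA_API_TOKEN (or JINAAI_API_KEY) is exported; the URL is illustrative:

# Sketch: render a page to markdown through r.jina.ai.
from deepsearcher.loader.web_crawler import JinaCrawler

crawler = JinaCrawler()
docs = crawler.crawl_url("https://example.com")  # illustrative URL
print(docs[0].metadata["status_code"], docs[0].page_content[:120])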

119
deepsearcher/offline_loading.py

@ -0,0 +1,119 @@
import os
from typing import List, Union
from tqdm import tqdm
# from deepsearcher.configuration import embedding_model, vector_db, file_loader
from deepsearcher import configuration
from deepsearcher.loader.splitter import split_docs_to_chunks
def load_from_local_files(
paths_or_directory: Union[str, List[str]],
collection_name: str = None,
collection_description: str = None,
force_new_collection: bool = False,
chunk_size: int = 1500,
chunk_overlap: int = 100,
batch_size: int = 256,
):
"""
Load knowledge from local files or directories into the vector database.
This function processes files from the specified paths or directories,
splits them into chunks, embeds the chunks, and stores them in the vector database.
Args:
paths_or_directory: A single path or a list of paths to files or directories to load.
collection_name: Name of the collection to store the data in. If None, uses the default collection.
collection_description: Description of the collection. If None, no description is set.
force_new_collection: If True, drops the existing collection and creates a new one.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
batch_size: Number of chunks to process at once during embedding.
Raises:
FileNotFoundError: If any of the specified paths do not exist.
"""
vector_db = configuration.vector_db
if collection_name is None:
collection_name = vector_db.default_collection
collection_name = collection_name.replace(" ", "_").replace("-", "_")
embedding_model = configuration.embedding_model
file_loader = configuration.file_loader
vector_db.init_collection(
dim=embedding_model.dimension,
collection=collection_name,
description=collection_description,
force_new_collection=force_new_collection,
)
if isinstance(paths_or_directory, str):
paths_or_directory = [paths_or_directory]
all_docs = []
for path in tqdm(paths_or_directory, desc="Loading files"):
if not os.path.exists(path):
raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.")
if os.path.isdir(path):
docs = file_loader.load_directory(path)
else:
docs = file_loader.load_file(path)
all_docs.extend(docs)
# print("Splitting docs to chunks...")
chunks = split_docs_to_chunks(
all_docs,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
vector_db.insert_data(collection=collection_name, chunks=chunks)
def load_from_website(
urls: Union[str, List[str]],
collection_name: str = None,
collection_description: str = None,
force_new_collection: bool = False,
chunk_size: int = 1500,
chunk_overlap: int = 100,
batch_size: int = 256,
**crawl_kwargs,
):
"""
Load knowledge from websites into the vector database.
This function crawls the specified URLs, processes the content,
splits it into chunks, embeds the chunks, and stores them in the vector database.
Args:
urls: A single URL or a list of URLs to crawl.
collection_name: Name of the collection to store the data in. If None, uses the default collection.
collection_description: Description of the collection. If None, no description is set.
force_new_collection: If True, drops the existing collection and creates a new one.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
batch_size: Number of chunks to process at once during embedding.
**crawl_kwargs: Additional keyword arguments to pass to the web crawler.
"""
if isinstance(urls, str):
urls = [urls]
vector_db = configuration.vector_db
embedding_model = configuration.embedding_model
web_crawler = configuration.web_crawler
vector_db.init_collection(
dim=embedding_model.dimension,
collection=collection_name,
description=collection_description,
force_new_collection=force_new_collection,
)
all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs)
chunks = split_docs_to_chunks(
all_docs,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
vector_db.insert_data(collection=collection_name, chunks=chunks)
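A usage sketch, assuming the global configuration (embedding model, vector DB, loaders, crawler) has already been initialized via deepsearcher.configuration; the path, URL, and collection names are illustrative:

# Sketch: ingest local files and a website into named collections.
from deepsearcher.offline_loading import load_from_local_files, load_from_website

load_from_local_files("./knowledge_base", collection_name="project_docs")  # hypothetical path
load_from_website("https://example.com/docs", collection_name="web_docs")  # illustrative URL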

96
deepsearcher/online_query.py

@ -0,0 +1,96 @@
from typing import List, Tuple
# from deepsearcher.configuration import vector_db, embedding_model, llm
from deepsearcher import configuration
from deepsearcher.vector_db.base import RetrievalResult
def query(original_query: str, max_iter: int = 3) -> Tuple[str, List[RetrievalResult], int]:
"""
Query the knowledge base with a question and get an answer.
This function uses the default searcher to query the knowledge base and generate
an answer based on the retrieved information.
Args:
original_query: The question or query to search for.
max_iter: Maximum number of iterations for the search process.
Returns:
A tuple containing:
- The generated answer as a string
- A list of retrieval results that were used to generate the answer
- The number of tokens consumed during the process
"""
default_searcher = configuration.default_searcher
return default_searcher.query(original_query, max_iter=max_iter)
def retrieve(
original_query: str, max_iter: int = 3
) -> Tuple[List[RetrievalResult], List[str], int]:
"""
Retrieve relevant information from the knowledge base without generating an answer.
This function uses the default searcher to retrieve information from the knowledge base
that is relevant to the query.
Args:
original_query: The question or query to search for.
max_iter: Maximum number of iterations for the search process.
Returns:
A tuple containing:
- A list of retrieval results
- An empty list (placeholder for future use)
- The number of tokens consumed during the process
"""
default_searcher = configuration.default_searcher
retrieved_results, consume_tokens, metadata = default_searcher.retrieve(
original_query, max_iter=max_iter
)
return retrieved_results, [], consume_tokens
def naive_retrieve(query: str, collection: str = None, top_k=10) -> List[RetrievalResult]:
"""
Perform a simple retrieval from the knowledge base using the naive RAG approach.
This function uses the naive RAG agent to retrieve information from the knowledge base
without any advanced techniques like iterative refinement.
Args:
query: The question or query to search for.
collection: The name of the collection to search in. If None, searches in all collections.
top_k: The maximum number of results to return.
Returns:
A list of retrieval results.
"""
naive_rag = configuration.naive_rag
all_retrieved_results, consume_tokens, _ = naive_rag.retrieve(query)
return all_retrieved_results
def naive_rag_query(
query: str, collection: str = None, top_k=10
) -> Tuple[str, List[RetrievalResult]]:
"""
Query the knowledge base using the naive RAG approach and get an answer.
This function uses the naive RAG agent to query the knowledge base and generate
an answer based on the retrieved information, without any advanced techniques.
Args:
query: The question or query to search for.
collection: The name of the collection to search in. If None, searches in all collections.
top_k: The maximum number of results to consider.
Returns:
A tuple containing:
- The generated answer as a string
- A list of retrieval results that were used to generate the answer
"""
naive_rag = configuration.naive_rag
answer, retrieved_results, consume_tokens = naive_rag.query(query)
return answer, retrieved_results
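A usage sketch, again assuming the configuration module has been initialized beforehand (the exact initialization call lives in deepsearcher/configuration.py and is not shown here); the question is illustrative:

# Sketch: ask a question against the loaded knowledge base.
from deepsearcher.online_query import query

answer, results, tokens = query("Which file formats does the Docling loader support?")
print(answer)
print(len(results), "supporting chunks;", tokens, "tokens consumed")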

0
deepsearcher/utils/__init__.py

160
deepsearcher/utils/log.py

@ -0,0 +1,160 @@
import logging
from termcolor import colored
class ColoredFormatter(logging.Formatter):
"""
A custom formatter for logging that adds colors to log messages.
This formatter adds colors to log messages based on their level,
making it easier to distinguish between different types of logs.
Attributes:
COLORS: A dictionary mapping log levels to colors.
"""
COLORS = {
"DEBUG": "cyan",
"INFO": "green",
"WARNING": "yellow",
"ERROR": "red",
"CRITICAL": "magenta",
}
def format(self, record):
"""
Format a log record with colors.
Args:
record: The log record to format.
Returns:
The formatted log message with colors.
"""
# the whole log line will be colored
log_message = super().format(record)
return colored(log_message, self.COLORS.get(record.levelname, "white"))
# only log level will be colored
# levelname_colored = colored(record.levelname, self.COLORS.get(record.levelname, 'white'))
# record.levelname = levelname_colored
# return super().format(record)
# only keywords will be colored
# message = record.msg
# for word, color in self.KEYWORDS.items():
# if word in message:
# message = message.replace(word, colored(word, color))
# record.msg = message
# return super().format(record)
# config log
dev_logger = logging.getLogger("dev")
dev_formatter = ColoredFormatter("%(asctime)s - %(levelname)s - %(message)s")
dev_handler = logging.StreamHandler()
dev_handler.setFormatter(dev_formatter)
dev_logger.addHandler(dev_handler)
dev_logger.setLevel(logging.INFO)
progress_logger = logging.getLogger("progress")
progress_handler = logging.StreamHandler()
progress_handler.setFormatter(ColoredFormatter("%(message)s"))
progress_logger.addHandler(progress_handler)
progress_logger.setLevel(logging.INFO)
dev_mode = False
def set_dev_mode(mode: bool):
"""
Set the development mode.
When in development mode, debug, info, and warning logs are displayed.
When not in development mode, only error and critical logs are displayed.
Args:
mode: True to enable development mode, False to disable it.
"""
global dev_mode
dev_mode = mode
def set_level(level):
"""
Set the logging level for the development logger.
Args:
level: The logging level to set (e.g., logging.DEBUG, logging.INFO).
"""
dev_logger.setLevel(level)
def debug(message):
"""
Log a debug message.
Args:
message: The message to log.
"""
if dev_mode:
dev_logger.debug(message)
def info(message):
"""
Log an info message.
Args:
message: The message to log.
"""
if dev_mode:
dev_logger.info(message)
def warning(message):
"""
Log a warning message.
Args:
message: The message to log.
"""
if dev_mode:
dev_logger.warning(message)
def error(message):
"""
Log an error message.
Args:
message: The message to log.
"""
dev_logger.error(message)
def critical(message):
"""
Log a critical message and raise a RuntimeError.
Args:
message: The message to log.
Raises:
RuntimeError: Always raised with the provided message.
"""
dev_logger.critical(message)
raise RuntimeError(message)
def color_print(message, **kwargs):
"""
Print a colored message to the progress logger.
Args:
message: The message to print.
**kwargs: Additional keyword arguments to pass to the logger.
"""
progress_logger.info(message)
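A usage sketch of the two loggers: developer logs are gated by dev mode, while color_print always goes to the progress logger:

# Sketch: developer vs. progress logging.
from deepsearcher.utils import log

log.set_dev_mode(True)                        # enable debug/info/warning output
log.info("loading collection 'project_docs'")
log.color_print("==> progress messages are shown regardless of dev mode")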

6
deepsearcher/vector_db/__init__.py

@ -0,0 +1,6 @@
from .azure_search import AzureSearch
from .milvus import Milvus, RetrievalResult
from .oracle import OracleDB
from .qdrant import Qdrant
__all__ = ["Milvus", "RetrievalResult", "OracleDB", "Qdrant", "AzureSearch"]

279
deepsearcher/vector_db/azure_search.py

@ -0,0 +1,279 @@
import uuid
from typing import Any, Dict, List, Optional
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
class AzureSearch(BaseVectorDB):
def __init__(self, endpoint, index_name, api_key, vector_field):
super().__init__(default_collection=index_name)
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
self.client = SearchClient(
endpoint=endpoint,
index_name=index_name,
credential=AzureKeyCredential(api_key),
)
self.vector_field = vector_field
self.endpoint = endpoint
self.index_name = index_name
self.api_key = api_key
def init_collection(self):
"""Initialize Azure Search index with proper schema"""
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceNotFoundError
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
SearchableField,
SearchField,
SearchIndex,
SimpleField,
)
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
# Create the index (simplified for compatibility with older SDK versions)
fields = [
SimpleField(name="id", type="Edm.String", key=True),
SearchableField(name="content", type="Edm.String"),
SearchField(
name="content_vector",
type="Collection(Edm.Single)",
searchable=True,
vector_search_dimensions=1536,
),
]
# Create index with fields
index = SearchIndex(name=self.index_name, fields=fields)
try:
# Try to delete existing index
try:
index_client.delete_index(self.index_name)
except ResourceNotFoundError:
pass
# Create the index
index_client.create_index(index)
except Exception as e:
print(f"Error creating index: {str(e)}")
def insert_data(self, documents: List[dict]):
"""Batch insert documents with vector embeddings"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
search_client = SearchClient(
endpoint=self.endpoint,
index_name=self.index_name,
credential=AzureKeyCredential(self.api_key),
)
actions = [
{
"@search.action": "upload" if doc.get("id") else "merge",
"id": doc.get("id", str(uuid.uuid4())),
"content": doc["text"],
"content_vector": doc["vector"],
}
for doc in documents
]
result = search_client.upload_documents(actions)
return [x.succeeded for x in result]
def search_data(
self, collection: Optional[str], vector: List[float], top_k: int = 50
) -> List[RetrievalResult]:
"""Azure Cognitive Search implementation with compatibility for older SDK versions"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
search_client = SearchClient(
endpoint=self.endpoint,
index_name=collection or self.index_name,
credential=AzureKeyCredential(self.api_key),
)
# Validate that vector is not empty
if not vector or len(vector) == 0:
print("Error: Empty vector provided for search. Vector must have 1536 dimensions.")
return []
# Debug vector and field info
print(f"Vector length for search: {len(vector)}")
print(f"Vector field name: {self.vector_field}")
# Ensure vector has the right dimensions
if len(vector) != 1536:
print(f"Warning: Vector length {len(vector)} does not match expected 1536 dimensions")
return []
# Execute search with direct parameters - simpler approach
try:
print(f"Executing search with top_k={top_k}")
# Directly use the search_by_vector method for compatibility
body = {
"search": "*",
"select": "id,content",
"top": top_k,
"vectorQueries": [
{
"vector": vector,
"fields": self.vector_field,
"k": top_k,
"kind": "vector",
}
],
}
# Print the search request body for debugging
print(f"Search request body: {body}")
# Use the REST API directly
result = search_client._client.documents.search_post(
search_request=body, headers={"api-key": self.api_key}
)
# Format results
search_results = []
if hasattr(result, "results"):
for doc in result.results:
try:
doc_dict = doc.as_dict() if hasattr(doc, "as_dict") else doc
content = doc_dict.get("content", "")
doc_id = doc_dict.get("id", "")
score = doc_dict.get("@search.score", 0.0)
retrieval_result = RetrievalResult(
embedding=[],  # We don't get the vectors back
text=content,
reference=doc_id,
metadata={"source": doc_id},
score=score,
)
search_results.append(retrieval_result)
except Exception as e:
print(f"Error processing result: {str(e)}")
return search_results
except Exception as e:
print(f"Search error: {str(e)}")
# Try another approach if the first one fails
try:
print("Trying alternative search method...")
results = search_client.search(search_text="*", select=["id", "content"], top=top_k)
# Process results
alt_results = []
for doc in results:
try:
# Handle different result formats
if isinstance(doc, dict):
content = doc.get("content", "")
doc_id = doc.get("id", "")
score = doc.get("@search.score", 0.0)
else:
content = getattr(doc, "content", "")
doc_id = getattr(doc, "id", "")
score = getattr(doc, "@search.score", 0.0)
result = RetrievalResult(
embedding=[],
text=content,
reference=doc_id,
metadata={"source": doc_id},
score=score,
)
alt_results.append(result)
except Exception as e:
print(f"Error processing result: {str(e)}")
return alt_results
except Exception as e:
print(f"Alternative search failed: {str(e)}")
return []
def clear_db(self):
"""Delete all documents in the index"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
search_client = SearchClient(
endpoint=self.endpoint,
index_name=self.index_name,
credential=AzureKeyCredential(self.api_key),
)
docs = search_client.search(search_text="*", include_total_count=True, select=["id"])
ids = [doc["id"] for doc in docs]
if ids:
search_client.delete_documents([{"id": id} for id in ids])
return len(ids)
def get_all_collections(self) -> List[str]:
"""List all search indices in Azure Cognitive Search"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
try:
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
return [index.name for index in index_client.list_indexes()]
except Exception as e:
print(f"Failed to list indices: {str(e)}")
return []
def get_collection_info(self, name: str) -> Dict[str, Any]:
"""Retrieve index metadata"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
return index_client.get_index(name).__dict__
def collection_exists(self, name: str) -> bool:
"""Check index existence"""
from azure.core.exceptions import ResourceNotFoundError
try:
self.get_collection_info(name)
return True
except ResourceNotFoundError:
return False
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""List all Azure Search indices with metadata"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
try:
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
collections = []
for index in index_client.list_indexes():
collections.append(
CollectionInfo(
collection_name=index.name,
description=f"Azure Search Index with {len(index.fields) if hasattr(index, 'fields') else 0} fields",
)
)
return collections
except Exception as e:
print(f"Collection listing failed: {str(e)}")
return []

207
deepsearcher/vector_db/base.py

@ -0,0 +1,207 @@
from abc import ABC, abstractmethod
from typing import List, Union
import numpy as np
from deepsearcher.loader.splitter import Chunk
class RetrievalResult:
"""
Represents a result retrieved from the vector database.
This class encapsulates the information about a retrieved document,
including its embedding, text content, reference, metadata, and similarity score.
Attributes:
embedding: The vector embedding of the document.
text: The text content of the document.
reference: A reference to the source of the document.
metadata: Additional metadata associated with the document.
score: The similarity score of the document to the query.
"""
def __init__(
self,
embedding: np.array,
text: str,
reference: str,
metadata: dict,
score: float = 0.0,
):
"""
Initialize a RetrievalResult object.
Args:
embedding: The vector embedding of the document.
text: The text content of the document.
reference: A reference to the source of the document.
metadata: Additional metadata associated with the document.
score: The similarity score of the document to the query. Defaults to 0.0.
"""
self.embedding = embedding
self.text = text
self.reference = reference
self.metadata = metadata
self.score: float = score
def __repr__(self):
"""
Return a string representation of the RetrievalResult.
Returns:
A string representation of the RetrievalResult object.
"""
return f"RetrievalResult(score={self.score}, embedding={self.embedding}, text={self.text}, reference={self.reference}, metadata={self.metadata})"
def deduplicate_results(results: List[RetrievalResult]) -> List[RetrievalResult]:
"""
Remove duplicate results based on text content.
This function removes duplicate results from a list of RetrievalResult objects
by keeping only the first occurrence of each unique text content.
Args:
results: A list of RetrievalResult objects to deduplicate.
Returns:
A list of deduplicated RetrievalResult objects.
"""
all_text_set = set()
deduplicated_results = []
for result in results:
if result.text not in all_text_set:
all_text_set.add(result.text)
deduplicated_results.append(result)
return deduplicated_results
class CollectionInfo:
"""
Represents information about a collection in the vector database.
This class encapsulates the name and description of a collection.
Attributes:
collection_name: The name of the collection.
description: The description of the collection.
"""
def __init__(self, collection_name: str, description: str):
"""
Initialize a CollectionInfo object.
Args:
collection_name: The name of the collection.
description: The description of the collection.
"""
self.collection_name = collection_name
self.description = description
class BaseVectorDB(ABC):
"""
Abstract base class for vector database implementations.
This class defines the interface for vector database implementations,
including methods for initializing collections, inserting data, searching,
listing collections, and clearing the database.
Attributes:
default_collection: The name of the default collection.
"""
def __init__(
self,
default_collection: str = "deepsearcher",
*args,
**kwargs,
):
"""
Initialize a BaseVectorDB object.
Args:
default_collection: The name of the default collection. Defaults to "deepsearcher".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
self.default_collection = default_collection
@abstractmethod
def init_collection(
self,
dim: int,
collection: str,
description: str,
force_new_collection=False,
*args,
**kwargs,
):
"""
Initialize a collection in the vector database.
Args:
dim: The dimensionality of the vectors in the collection.
collection: The name of the collection.
description: The description of the collection.
force_new_collection: If True, drop the existing collection and create a new one.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
pass
@abstractmethod
def insert_data(self, collection: str, chunks: List[Chunk], *args, **kwargs):
"""
Insert data into a collection in the vector database.
Args:
collection: The name of the collection.
chunks: A list of Chunk objects to insert.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
pass
@abstractmethod
def search_data(
self, collection: str, vector: Union[np.array, List[float]], *args, **kwargs
) -> List[RetrievalResult]:
"""
Search for similar vectors in a collection.
Args:
collection: The name of the collection.
vector: The query vector to search for.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
A list of RetrievalResult objects representing the search results.
"""
pass
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the vector database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
A list of CollectionInfo objects representing the collections.
"""
pass
@abstractmethod
def clear_db(self, *args, **kwargs):
"""
Clear the vector database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
pass

305
deepsearcher/vector_db/milvus.py

@ -0,0 +1,305 @@
from typing import List, Optional, Union
import numpy as np
from pymilvus import AnnSearchRequest, DataType, Function, FunctionType, MilvusClient, RRFRanker
from deepsearcher.loader.splitter import Chunk
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
class Milvus(BaseVectorDB):
"""Milvus class is a subclass of DB class."""
client: MilvusClient = None
def __init__(
self,
default_collection: str = "deepsearcher",
uri: str = "http://localhost:19530",
token: str = "root:Milvus",
user: str = "",
password: str = "",
db: str = "default",
hybrid: bool = False,
**kwargs,
):
"""
Initialize the Milvus client.
Args:
default_collection (str, optional): Default collection name. Defaults to "deepsearcher".
uri (str, optional): URI for connecting to Milvus server. Defaults to "http://localhost:19530".
token (str, optional): Authentication token for Milvus. Defaults to "root:Milvus".
user (str, optional): Username for authentication. Defaults to "".
password (str, optional): Password for authentication. Defaults to "".
db (str, optional): Database name. Defaults to "default".
hybrid (bool, optional): Whether to enable hybrid search. Defaults to False.
**kwargs: Additional keyword arguments to pass to the MilvusClient.
"""
super().__init__(default_collection)
self.default_collection = default_collection
self.client = MilvusClient(
uri=uri, user=user, password=password, token=token, db_name=db, timeout=30, **kwargs
)
self.hybrid = hybrid
def init_collection(
self,
dim: int,
collection: Optional[str] = "deepsearcher",
description: Optional[str] = "",
force_new_collection: bool = False,
text_max_length: int = 65_535,
reference_max_length: int = 2048,
metric_type: str = "L2",
*args,
**kwargs,
):
"""
Initialize a collection in Milvus.
Args:
dim (int): Dimension of the vector embeddings.
collection (Optional[str], optional): Collection name. Defaults to "deepsearcher".
description (Optional[str], optional): Collection description. Defaults to "".
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False.
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535.
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048.
metric_type (str, optional): Metric type for vector similarity search. Defaults to "L2".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
if description is None:
description = ""
self.metric_type = metric_type
try:
has_collection = self.client.has_collection(collection, timeout=5)
if force_new_collection and has_collection:
self.client.drop_collection(collection)
elif has_collection:
return
schema = self.client.create_schema(
enable_dynamic_field=False, auto_id=True, description=description
)
schema.add_field("id", DataType.INT64, is_primary=True)
schema.add_field("embedding", DataType.FLOAT_VECTOR, dim=dim)
if self.hybrid:
analyzer_params = {"tokenizer": "standard", "filter": ["lowercase"]}
schema.add_field(
"text",
DataType.VARCHAR,
max_length=text_max_length,
analyzer_params=analyzer_params,
enable_match=True,
enable_analyzer=True,
)
else:
schema.add_field("text", DataType.VARCHAR, max_length=text_max_length)
schema.add_field("reference", DataType.VARCHAR, max_length=reference_max_length)
schema.add_field("metadata", DataType.JSON)
if self.hybrid:
schema.add_field("sparse_vector", DataType.SPARSE_FLOAT_VECTOR)
bm25_function = Function(
name="bm25",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names="sparse_vector",
)
schema.add_function(bm25_function)
index_params = self.client.prepare_index_params()
index_params.add_index(field_name="embedding", metric_type=metric_type)
if self.hybrid:
index_params.add_index(
field_name="sparse_vector",
index_type="SPARSE_INVERTED_INDEX",
metric_type="BM25",
)
self.client.create_collection(
collection,
schema=schema,
index_params=index_params,
consistency_level="Strong",
)
log.color_print(f"create collection [{collection}] successfully")
except Exception as e:
log.critical(f"fail to init db for milvus, error info: {e}")
def insert_data(
self,
collection: Optional[str],
chunks: List[Chunk],
batch_size: int = 256,
*args,
**kwargs,
):
"""
Insert data into a Milvus collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
chunks (List[Chunk]): List of Chunk objects to insert.
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
texts = [chunk.text for chunk in chunks]
references = [chunk.reference for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
embeddings = [chunk.embedding for chunk in chunks]
datas = [
{
"embedding": embedding,
"text": text,
"reference": reference,
"metadata": metadata,
}
for embedding, text, reference, metadata in zip(
embeddings, texts, references, metadatas
)
]
batch_datas = [datas[i : i + batch_size] for i in range(0, len(datas), batch_size)]
try:
for batch_data in batch_datas:
self.client.insert(collection_name=collection, data=batch_data)
except Exception as e:
log.critical(f"fail to insert data, error info: {e}")
def search_data(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
query_text: Optional[str] = None,
*args,
**kwargs,
) -> List[RetrievalResult]:
"""
Search for similar vectors in a Milvus collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
query_text (Optional[str], optional): Original query text for hybrid search. Defaults to None.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[RetrievalResult]: List of retrieval results containing similar vectors.
"""
if not collection:
collection = self.default_collection
try:
use_hybrid = self.hybrid and query_text
if use_hybrid:
sparse_search_params = {"metric_type": "BM25"}
sparse_request = AnnSearchRequest(
[query_text], "sparse_vector", sparse_search_params, limit=top_k
)
dense_search_params = {"metric_type": self.metric_type}
dense_request = AnnSearchRequest(
[vector], "embedding", dense_search_params, limit=top_k
)
search_results = self.client.hybrid_search(
collection_name=collection,
reqs=[sparse_request, dense_request],
ranker=RRFRanker(),
limit=top_k,
output_fields=["embedding", "text", "reference", "metadata"],
timeout=10,
)
else:
search_results = self.client.search(
collection_name=collection,
data=[vector],
limit=top_k,
output_fields=["embedding", "text", "reference", "metadata"],
timeout=10,
)
return [
RetrievalResult(
embedding=b["entity"]["embedding"],
text=b["entity"]["text"],
reference=b["entity"]["reference"],
score=b["distance"],
metadata=b["entity"]["metadata"],
)
for a in search_results
for b in a
]
except Exception as e:
log.critical(f"fail to search data, error info: {e}")
return []
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the Milvus database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[CollectionInfo]: List of collection information objects.
"""
collection_infos = []
dim = kwargs.pop("dim", 0)
try:
collections = self.client.list_collections()
for collection in collections:
description = self.client.describe_collection(collection)
if dim != 0:
skip = False
for field_dict in description["fields"]:
if (
field_dict["name"] == "embedding"
and field_dict["type"] == DataType.FLOAT_VECTOR
):
if field_dict["params"]["dim"] != dim:
skip = True
if skip:
continue
collection_infos.append(
CollectionInfo(
collection_name=collection,
description=description["description"],
)
)
except Exception as e:
log.critical(f"fail to list collections, error info: {e}")
return collection_infos
def clear_db(self, collection: str = "deepsearcher", *args, **kwargs):
"""
Clear (drop) a collection from the Milvus database.
Args:
collection (str, optional): Collection name to drop. Defaults to "deepsearcher".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
try:
self.client.drop_collection(collection)
except Exception as e:
log.warning(f"fail to clear db, error info: {e}")

536
deepsearcher/vector_db/oracle.py

@ -0,0 +1,536 @@
import array
import json
from typing import List, Optional, Union
import numpy as np
from deepsearcher.loader.splitter import Chunk
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
class OracleDB(BaseVectorDB):
"""OracleDB class is a subclass of DB class."""
client = None
def __init__(
self,
user: str,
password: str,
dsn: str,
config_dir: str,
wallet_location: str,
wallet_password: str,
min: int = 1,
max: int = 10,
increment: int = 1,
default_collection: str = "deepsearcher",
):
"""
Initialize the Oracle database connection.
Args:
user (str): Oracle database username.
password (str): Oracle database password.
dsn (str): Oracle database connection string.
config_dir (str): Directory containing Oracle configuration files.
wallet_location (str): Location of the Oracle wallet.
wallet_password (str): Password for the Oracle wallet.
min (int, optional): Minimum number of connections in the pool. Defaults to 1.
max (int, optional): Maximum number of connections in the pool. Defaults to 10.
increment (int, optional): Increment for adding new connections. Defaults to 1.
default_collection (str, optional): Default collection name. Defaults to "deepsearcher".
"""
super().__init__(default_collection)
self.default_collection = default_collection
import oracledb
oracledb.defaults.fetch_lobs = False
self.DB_TYPE_VECTOR = oracledb.DB_TYPE_VECTOR
try:
self.client = oracledb.create_pool(
user=user,
password=password,
dsn=dsn,
config_dir=config_dir,
wallet_location=wallet_location,
wallet_password=wallet_password,
min=min,
max=max,
increment=increment,
)
log.color_print(f"Connected to Oracle database at {dsn}")
self.check_table()
except Exception as e:
log.critical(f"Failed to connect to Oracle database at {dsn}")
log.critical(f"Oracle database error in init: {e}")
raise
def numpy_converter_in(self, value):
"""Convert numpy array to array.array"""
if value.dtype == np.float64:
dtype = "d"
elif value.dtype == np.float32:
dtype = "f"
else:
dtype = "b"
return array.array(dtype, value)
def input_type_handler(self, cursor, value, arraysize):
"""Set the type handler for the input data"""
if isinstance(value, np.ndarray):
return cursor.var(
self.DB_TYPE_VECTOR,
arraysize=arraysize,
inconverter=self.numpy_converter_in,
)
def numpy_converter_out(self, value):
"""Convert array.array to numpy array"""
if value.typecode == "b":
dtype = np.int8
elif value.typecode == "f":
dtype = np.float32
else:
dtype = np.float64
return np.array(value, copy=False, dtype=dtype)
def output_type_handler(self, cursor, metadata):
"""Set the type handler for the output data"""
if metadata.type_code is self.DB_TYPE_VECTOR:
return cursor.var(
metadata.type_code,
arraysize=cursor.arraysize,
outconverter=self.numpy_converter_out,
)
def query(self, sql: str, params: dict = None) -> Union[dict, None]:
"""
Execute a SQL query and return the results.
Args:
sql (str): SQL query to execute.
params (dict, optional): Parameters for the SQL query. Defaults to None.
Returns:
Union[dict, None]: Query results as a dictionary or None if no results.
Raises:
Exception: If there's an error executing the query.
"""
with self.client.acquire() as connection:
connection.inputtypehandler = self.input_type_handler
connection.outputtypehandler = self.output_type_handler
with connection.cursor() as cursor:
try:
if log.dev_mode:
print("sql:\n", sql)
# log.debug("def query:"+params)
# print("sql:\n",sql)
# print("params:\n",params)
cursor.execute(sql, params)
except Exception as e:
log.critical(f"Oracle database error in query: {e}")
raise
columns = [column[0].lower() for column in cursor.description]
rows = cursor.fetchall()
if rows:
data = [dict(zip(columns, row)) for row in rows]
else:
data = []
if log.dev_mode:
print("data:\n", data)
return data
def execute(self, sql: str, data: Union[list, dict] = None):
"""
Execute a SQL statement without returning results.
Args:
sql (str): SQL statement to execute.
data (Union[list, dict], optional): Data for the SQL statement. Defaults to None.
Raises:
Exception: If there's an error executing the statement.
"""
try:
with self.client.acquire() as connection:
connection.inputtypehandler = self.input_type_handler
connection.outputtypehandler = self.output_type_handler
with connection.cursor() as cursor:
# print("sql:\n",sql)
# print("data:\n",data)
if data is None:
cursor.execute(sql)
else:
cursor.execute(sql, data)
connection.commit()
except Exception as e:
log.critical(f"Oracle database error in execute: {e}")
log.error("ERROR sql:\n" + sql)
log.error("ERROR data:\n" + data)
raise
def has_collection(self, collection: str = "deepsearcher"):
"""
Check if a collection exists in the database.
Args:
collection (str, optional): Collection name to check. Defaults to "deepsearcher".
Returns:
bool: True if the collection exists, False otherwise.
"""
SQL = SQL_TEMPLATES["has_collection"]
params = {"collection": collection}
res = self.query(SQL, params)
if res:
if res[0]["rowcnt"] > 0:
return True
else:
return False
else:
return False
def check_table(self):
"""
Check if required tables exist and create them if they don't.
Raises:
Exception: If there's an error checking or creating tables.
"""
SQL = SQL_TEMPLATES["has_table"]
try:
res = self.query(SQL)
            if len(res) < len(TABLES):
missing_table = TABLES.keys() - set([i["table_name"] for i in res])
for table in missing_table:
self.create_tables(table)
except Exception as e:
log.critical(f"Failed to check table in Oracle database, error info: {e}")
raise
def create_tables(self, table_name):
"""
Create a table in the database.
Args:
table_name: Name of the table to create.
Raises:
Exception: If there's an error creating the table.
"""
SQL = TABLES[table_name]
try:
self.execute(SQL)
log.color_print(f"Created table {table_name} in Oracle database")
except Exception as e:
log.critical(f"Failed to create table {table_name} in Oracle database, error info: {e}")
raise
def drop_collection(self, collection: str = "deepsearcher"):
"""
Drop a collection from the database.
Args:
collection (str, optional): Collection name to drop. Defaults to "deepsearcher".
Raises:
Exception: If there's an error dropping the collection.
"""
try:
params = {"collection": collection}
SQL = SQL_TEMPLATES["drop_collection"]
self.execute(SQL, params)
SQL = SQL_TEMPLATES["drop_collection_item"]
self.execute(SQL, params)
log.color_print(f"Collection {collection} dropped")
except Exception as e:
log.critical(f"fail to drop collection, error info: {e}")
raise
def insertone(self, data):
"""
Insert a single record into the database.
Args:
data: Data to insert.
"""
SQL = SQL_TEMPLATES["insert"]
self.execute(SQL, data)
log.debug("insert done!")
def searchone(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
):
"""
Search for similar vectors in a collection.
Args:
collection (Optional[str]): Collection name to search in.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
Returns:
list: List of search results.
Raises:
Exception: If there's an error during search.
"""
log.debug("def searchone:" + collection)
try:
if isinstance(vector, List):
vector = np.array(vector)
embedding_string = "[" + ", ".join(map(str, vector.tolist())) + "]"
dimension = vector.shape[0]
dtype = str(vector.dtype).upper()
SQL = SQL_TEMPLATES["search"].format(dimension=dimension, dtype=dtype)
max_distance = 0.8
params = {
"collection": collection,
"embedding_string": embedding_string,
"top_k": top_k,
"max_distance": max_distance,
}
res = self.query(SQL, params)
if res:
return res
else:
return []
except Exception as e:
log.critical(f"fail to search data, error info: {e}")
raise
def init_collection(
self,
dim: int,
collection: Optional[str] = "deepsearcher",
description: Optional[str] = "",
force_new_collection: bool = False,
text_max_length: int = 65_535,
reference_max_length: int = 2048,
metric_type: str = "L2",
*args,
**kwargs,
):
"""
Initialize a collection in the database.
Args:
dim (int): Dimension of the vector embeddings.
collection (Optional[str], optional): Collection name. Defaults to "deepsearcher".
description (Optional[str], optional): Collection description. Defaults to "".
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False.
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535.
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048.
metric_type (str, optional): Metric type for vector similarity search. Defaults to "L2".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Raises:
Exception: If there's an error initializing the collection.
"""
if not collection:
collection = self.default_collection
if description is None:
description = ""
try:
has_collection = self.has_collection(collection)
if force_new_collection and has_collection:
self.drop_collection(collection)
elif has_collection:
return
# insert collection info
SQL = SQL_TEMPLATES["insert_collection"]
params = {"collection": collection, "description": description}
self.execute(SQL, params)
except Exception as e:
log.critical(f"fail to init_collection for oracle, error info: {e}")
def insert_data(
self,
collection: Optional[str],
chunks: List[Chunk],
batch_size: int = 256,
*args,
**kwargs,
):
"""
Insert data into a collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
chunks (List[Chunk]): List of Chunk objects to insert.
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Raises:
Exception: If there's an error inserting data.
"""
if not collection:
collection = self.default_collection
datas = []
for chunk in chunks:
_data = {
"embedding": self.numpy_converter_in(np.array(chunk.embedding)),
"text": chunk.text,
"reference": chunk.reference,
"metadata": json.dumps(chunk.metadata),
"collection": collection,
}
datas.append(_data)
batch_datas = [datas[i : i + batch_size] for i in range(0, len(datas), batch_size)]
try:
for batch_data in batch_datas:
for _data in batch_data:
self.insertone(data=_data)
log.color_print(f"Successfully insert {len(datas)} data")
except Exception as e:
log.critical(f"fail to insert data, error info: {e}")
raise
def search_data(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
*args,
**kwargs,
) -> List[RetrievalResult]:
"""
Search for similar vectors in a collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[RetrievalResult]: List of retrieval results containing similar vectors.
Raises:
Exception: If there's an error during search.
"""
if not collection:
collection = self.default_collection
try:
# print("def search_data:",collection)
# print("def search_data:",type(vector))
search_results = self.searchone(collection=collection, vector=vector, top_k=top_k)
# print("def search_data: search_results",search_results)
return [
RetrievalResult(
embedding=b["embedding"],
text=b["text"],
reference=b["reference"],
score=b["distance"],
metadata=json.loads(b["metadata"]),
)
for b in search_results
]
except Exception as e:
log.critical(f"fail to search data, error info: {e}")
raise
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[CollectionInfo]: List of collection information objects.
"""
collection_infos = []
try:
SQL = SQL_TEMPLATES["list_collections"]
log.debug("def list_collections:" + SQL)
collections = self.query(SQL)
if collections:
for collection in collections:
collection_infos.append(
CollectionInfo(
collection_name=collection["collection"],
description=collection["description"],
)
)
return collection_infos
except Exception as e:
log.critical(f"fail to list collections, error info: {e}")
raise
def clear_db(self, collection: str = "deepsearcher", *args, **kwargs):
"""
Clear (drop) a collection from the database.
Args:
collection (str, optional): Collection name to drop. Defaults to "deepsearcher".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
try:
            self.drop_collection(collection)
except Exception as e:
log.warning(f"fail to clear db, error info: {e}")
raise
TABLES = {
"DEEPSEARCHER_COLLECTION_INFO": """CREATE TABLE DEEPSEARCHER_COLLECTION_INFO (
id INT generated by default as identity primary key,
collection varchar(256),
description CLOB,
status NUMBER DEFAULT 1,
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updatetime TIMESTAMP DEFAULT NULL)""",
"DEEPSEARCHER_COLLECTION_ITEM": """CREATE TABLE DEEPSEARCHER_COLLECTION_ITEM (
id INT generated by default as identity primary key,
collection varchar(256),
embedding VECTOR,
text CLOB,
reference varchar(4000),
metadata CLOB,
status NUMBER DEFAULT 1,
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updatetime TIMESTAMP DEFAULT NULL)""",
}
SQL_TEMPLATES = {
"has_table": f"""SELECT table_name FROM all_tables
WHERE table_name in ({",".join([f"'{k}'" for k in TABLES.keys()])})""",
"has_collection": "select count(*) as rowcnt from DEEPSEARCHER_COLLECTION_INFO where collection=:collection and status=1",
"list_collections": "select collection,description from DEEPSEARCHER_COLLECTION_INFO where status=1",
"drop_collection": "update DEEPSEARCHER_COLLECTION_INFO set status=0 where collection=:collection and status=1",
"drop_collection_item": "update DEEPSEARCHER_COLLECTION_ITEM set status=0 where collection=:collection and status=1",
"insert_collection": """INSERT INTO DEEPSEARCHER_COLLECTION_INFO (collection,description)
values (:collection,:description)""",
"insert": """INSERT INTO DEEPSEARCHER_COLLECTION_ITEM (collection,embedding,text,reference,metadata)
values (:collection,:embedding,:text,:reference,:metadata)""",
"search": """SELECT * FROM
(SELECT t.*,
VECTOR_DISTANCE(t.embedding,vector(:embedding_string,{dimension},{dtype}),COSINE) as distance
FROM DEEPSEARCHER_COLLECTION_ITEM t
JOIN DEEPSEARCHER_COLLECTION_INFO c ON t.collection=c.collection
WHERE t.collection=:collection AND t.status=1 AND c.status=1)
WHERE distance<:max_distance ORDER BY distance ASC FETCH FIRST :top_k ROWS ONLY""",
}
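A brief, hedged sketch of the Oracle backend above; every connection value below is a placeholder for your own Autonomous Database wallet settings. Note that `drop_collection` here is a soft delete: rows in both tables are flagged with `status=0` rather than removed.

```python
from deepsearcher.vector_db.oracle import OracleDB

# Placeholder credentials and wallet paths; requires the python-oracledb package (pip install oracledb)
db = OracleDB(
    user="your_user",
    password="your_password",
    dsn="your_db_low",
    config_dir="/path/to/wallet",
    wallet_location="/path/to/wallet",
    wallet_password="your_wallet_password",
)
db.init_collection(dim=4, collection="demo")

# Cosine VECTOR_DISTANCE search over DEEPSEARCHER_COLLECTION_ITEM, filtered by the 0.8 max_distance cutoff
results = db.search_data(collection="demo", vector=[0.1, 0.2, 0.3, 0.4], top_k=3)
```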

290
deepsearcher/vector_db/qdrant.py

@ -0,0 +1,290 @@
import uuid
from typing import List, Optional, Union
import numpy as np
from deepsearcher.loader.splitter import Chunk
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
DEFAULT_COLLECTION_NAME = "deepsearcher"
TEXT_PAYLOAD_KEY = "text"
REFERENCE_PAYLOAD_KEY = "reference"
METADATA_PAYLOAD_KEY = "metadata"
class Qdrant(BaseVectorDB):
"""Vector DB implementation powered by [Qdrant](https://qdrant.tech/)"""
def __init__(
self,
location: Optional[str] = None,
url: Optional[str] = None,
port: Optional[int] = 6333,
grpc_port: int = 6334,
prefer_grpc: bool = False,
https: Optional[bool] = None,
api_key: Optional[str] = None,
prefix: Optional[str] = None,
timeout: Optional[int] = None,
host: Optional[str] = None,
path: Optional[str] = None,
default_collection: str = DEFAULT_COLLECTION_NAME,
):
"""
Initialize the Qdrant client with flexible connection options.
Args:
location (Optional[str], optional):
- If ":memory:" - use in-memory Qdrant instance.
- If str - use it as a URL parameter.
- If None - use default values for host and port.
Defaults to None.
url (Optional[str], optional):
URL for Qdrant service, can include scheme, host, port, and prefix.
Allows flexible connection string specification.
Defaults to None.
port (Optional[int], optional):
Port of the REST API interface.
Defaults to 6333.
grpc_port (int, optional):
Port of the gRPC interface.
Defaults to 6334.
prefer_grpc (bool, optional):
If True, use gRPC interface whenever possible in custom methods.
Defaults to False.
https (Optional[bool], optional):
If True, use HTTPS (SSL) protocol.
Defaults to None.
api_key (Optional[str], optional):
API key for authentication in Qdrant Cloud.
Defaults to None.
prefix (Optional[str], optional):
If not None, add prefix to the REST URL path.
Example: 'service/v1' results in 'http://localhost:6333/service/v1/{qdrant-endpoint}'
Defaults to None.
timeout (Optional[int], optional):
Timeout for REST and gRPC API requests.
Default is 5 seconds for REST and unlimited for gRPC.
Defaults to None.
host (Optional[str], optional):
Host name of Qdrant service.
If url and host are None, defaults to 'localhost'.
Defaults to None.
path (Optional[str], optional):
Persistence path for QdrantLocal.
Defaults to None.
default_collection (str, optional):
Default collection name to be used.
"""
try:
from qdrant_client import QdrantClient
except ImportError as original_error:
raise ImportError(
"Qdrant client is not installed. Install it using: pip install qdrant-client\n"
) from original_error
super().__init__(default_collection)
self.client = QdrantClient(
location=location,
url=url,
port=port,
grpc_port=grpc_port,
prefer_grpc=prefer_grpc,
https=https,
api_key=api_key,
prefix=prefix,
timeout=timeout,
host=host,
path=path,
)
def init_collection(
self,
dim: int,
collection: Optional[str] = None,
description: Optional[str] = "",
force_new_collection: bool = False,
text_max_length: int = 65_535,
reference_max_length: int = 2048,
distance_metric: str = "Cosine",
*args,
**kwargs,
):
"""
Initialize a collection in Qdrant.
Args:
dim (int): Dimension of the vector embeddings.
collection (Optional[str], optional): Collection name.
description (Optional[str], optional): Collection description. Defaults to "".
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False.
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535.
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048.
distance_metric (str, optional): Metric type for vector similarity search. Defaults to "Cosine".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
from qdrant_client import models
collection = collection or self.default_collection
try:
collection_exists = self.client.collection_exists(collection_name=collection)
if force_new_collection and collection_exists:
self.client.delete_collection(collection_name=collection)
collection_exists = False
if not collection_exists:
self.client.create_collection(
collection_name=collection,
vectors_config=models.VectorParams(size=dim, distance=distance_metric),
*args,
**kwargs,
)
log.color_print(f"Created collection [{collection}] successfully")
except Exception as e:
log.critical(f"Failed to init Qdrant collection, error info: {e}")
def insert_data(
self,
collection: Optional[str],
chunks: List[Chunk],
batch_size: int = 256,
*args,
**kwargs,
):
"""
Insert data into a Qdrant collection.
Args:
collection (Optional[str]): Collection name.
chunks (List[Chunk]): List of Chunk objects to insert.
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
from qdrant_client import models
try:
for i in range(0, len(chunks), batch_size):
batch_chunks = chunks[i : i + batch_size]
points = [
models.PointStruct(
id=uuid.uuid4().hex,
vector=chunk.embedding,
payload={
TEXT_PAYLOAD_KEY: chunk.text,
REFERENCE_PAYLOAD_KEY: chunk.reference,
METADATA_PAYLOAD_KEY: chunk.metadata,
},
)
for chunk in batch_chunks
]
self.client.upsert(
collection_name=collection or self.default_collection, points=points
)
except Exception as e:
log.critical(f"Failed to insert data, error info: {e}")
def search_data(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
*args,
**kwargs,
) -> List[RetrievalResult]:
"""
Search for similar vectors in a Qdrant collection.
Args:
            collection (Optional[str]): Collection name. If None, uses default_collection.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[RetrievalResult]: List of retrieval results containing similar vectors.
"""
try:
results = self.client.query_points(
collection_name=collection or self.default_collection,
query=vector,
limit=top_k,
with_payload=True,
with_vectors=True,
).points
return [
RetrievalResult(
embedding=result.vector,
text=result.payload.get(TEXT_PAYLOAD_KEY, ""),
reference=result.payload.get(REFERENCE_PAYLOAD_KEY, ""),
score=result.score,
metadata=result.payload.get(METADATA_PAYLOAD_KEY, {}),
)
for result in results
]
except Exception as e:
log.critical(f"Failed to search data, error info: {e}")
return []
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the Qdrant database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[CollectionInfo]: List of collection information objects.
"""
collection_infos = []
try:
collections = self.client.get_collections().collections
for collection in collections:
collection_infos.append(
CollectionInfo(
collection_name=collection.name,
# Qdrant doesn't have a native description field
description=collection.name,
)
)
except Exception as e:
log.critical(f"Failed to list collections, error info: {e}")
return collection_infos
def clear_db(self, collection: Optional[str] = None, *args, **kwargs):
"""
Clear (drop) a collection from the Qdrant database.
Args:
            collection (Optional[str], optional): Collection name to drop. If None, uses default_collection.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
try:
self.client.delete_collection(collection_name=collection or self.default_collection)
except Exception as e:
log.warning(f"Failed to drop collection, error info: {e}")

42
docs/README.md

@ -0,0 +1,42 @@
# DeepSearcher Documentation
This directory contains the documentation for DeepSearcher, powered by MkDocs.
## Setup
1. Install MkDocs and required plugins:
```bash
pip install mkdocs mkdocs-material mkdocs-jupyter pymdown-extensions
```
2. Clone the repository:
```bash
git clone https://github.com/zilliztech/deep-searcher.git
cd deep-searcher
```
## Development
To serve the documentation locally:
```bash
mkdocs serve
```
This will start a local server at http://127.0.0.1:8000/ where you can preview the documentation.
## Building
To build the static site:
```bash
mkdocs build
```
This will generate the static site in the `site` directory.
## Deployment
The documentation is automatically deployed when changes are pushed to the main branch using GitHub Actions.

BIN
docs/assets/pic/deep-searcher-arch.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 307 KiB

BIN
docs/assets/pic/demo.gif

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 MiB

BIN
docs/assets/pic/logo-badge.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

BIN
docs/assets/pic/logo.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

126
docs/configuration/embedding.md

@ -0,0 +1,126 @@
# Embedding Model Configuration
DeepSearcher supports various embedding models to convert text into vector representations for semantic search.
## 📝 Basic Configuration
```python
config.set_provider_config("embedding", "(EmbeddingModelName)", "(Arguments dict)")
```
## 📋 Available Embedding Providers
| Provider | Description | Key Features |
|----------|-------------|--------------|
| **OpenAIEmbedding** | OpenAI's text embedding models | High quality, production-ready |
| **MilvusEmbedding** | Built-in embedding models via Pymilvus | Multiple model options |
| **VoyageEmbedding** | VoyageAI embedding models | Specialized for search |
| **BedrockEmbedding** | Amazon Bedrock embedding | AWS integration |
| **GeminiEmbedding** | Google's Gemini embedding | High performance |
| **GLMEmbedding** | ChatGLM embeddings | Chinese language support |
| **OllamaEmbedding** | Local embedding with Ollama | Self-hosted option |
| **PPIOEmbedding** | PPIO cloud embedding | Scalable solution |
| **SiliconflowEmbedding** | Siliconflow's models | Enterprise support |
| **VolcengineEmbedding** | Volcengine embedding | High throughput |
| **NovitaEmbedding** | Novita AI embedding | Cost-effective |
| **SentenceTransformerEmbedding** | Sentence Transformer embedding models | Self-hosted option |
| **IBM watsonx.ai** | Various options | IBM's Enterprise AI platform |
## 🔍 Provider Examples
### OpenAI Embedding
```python
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})
```
*Requires `OPENAI_API_KEY` environment variable*
### Milvus Built-in Embedding
```python
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "BAAI/bge-base-en-v1.5"})
```
```python
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "jina-embeddings-v3"})
```
*For Jina's embedding model, requires `JINAAI_API_KEY` environment variable*
### VoyageAI Embedding
```python
config.set_provider_config("embedding", "VoyageEmbedding", {"model": "voyage-3"})
```
*Requires `VOYAGE_API_KEY` environment variable and `pip install voyageai`*
## 📚 Additional Providers
??? example "Amazon Bedrock"
```python
config.set_provider_config("embedding", "BedrockEmbedding", {"model": "amazon.titan-embed-text-v2:0"})
```
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`*
??? example "Novita AI"
```python
config.set_provider_config("embedding", "NovitaEmbedding", {"model": "baai/bge-m3"})
```
*Requires `NOVITA_API_KEY` environment variable*
??? example "Siliconflow"
```python
config.set_provider_config("embedding", "SiliconflowEmbedding", {"model": "BAAI/bge-m3"})
```
*Requires `SILICONFLOW_API_KEY` environment variable*
??? example "Volcengine"
```python
config.set_provider_config("embedding", "VolcengineEmbedding", {"model": "doubao-embedding-text-240515"})
```
*Requires `VOLCENGINE_API_KEY` environment variable*
??? example "GLM"
```python
config.set_provider_config("embedding", "GLMEmbedding", {"model": "embedding-3"})
```
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`*
??? example "Google Gemini"
```python
config.set_provider_config("embedding", "GeminiEmbedding", {"model": "text-embedding-004"})
```
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`*
??? example "Ollama"
```python
config.set_provider_config("embedding", "OllamaEmbedding", {"model": "bge-m3"})
```
*Requires local Ollama installation and `pip install ollama`*
??? example "PPIO"
```python
config.set_provider_config("embedding", "PPIOEmbedding", {"model": "baai/bge-m3"})
```
*Requires `PPIO_API_KEY` environment variable*
??? example "SentenceTransformer"
```python
config.set_provider_config("embedding", "SentenceTransformerEmbedding", {"model": "BAAI/bge-large-zh-v1.5"})
```
*Requires `pip install sentence-transformers`*
??? example "IBM WatsonX"
```python
config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "ibm/slate-125m-english-rtrvr-v2"})
```
*Requires `pip install ibm-watsonx-ai`*

70
docs/configuration/file_loader.md

@ -0,0 +1,70 @@
# File Loader Configuration
DeepSearcher supports various file loaders to extract and process content from different file formats.
## 📝 Basic Configuration
```python
config.set_provider_config("file_loader", "(FileLoaderName)", "(Arguments dict)")
```
## 📋 Available File Loaders
| Loader | Description | Supported Formats |
|--------|-------------|-------------------|
| **UnstructuredLoader** | General purpose document loader with broad format support | PDF, DOCX, PPT, HTML, etc. |
| **DoclingLoader** | Document processing library with extraction capabilities | See [documentation](https://docling-project.github.io/docling/usage/supported_formats/) |
## 🔍 File Loader Options
### Unstructured
[Unstructured](https://unstructured.io/) is a powerful library for extracting content from various document formats.
```python
config.set_provider_config("file_loader", "UnstructuredLoader", {})
```
??? tip "Setup Instructions"
You can use Unstructured in two ways:
1. **With API** (recommended for production)
- Set environment variables:
- `UNSTRUCTURED_API_KEY`
- `UNSTRUCTURED_API_URL`
2. **Local Processing**
- Simply don't set the API environment variables
- Install required dependencies:
```bash
# Install core dependencies
pip install unstructured-ingest
# For all document formats
pip install "unstructured[all-docs]"
# For specific formats (e.g., PDF only)
pip install "unstructured[pdf]"
```
For more information:
- [Unstructured Documentation](https://docs.unstructured.io/ingestion/overview)
- [Installation Guide](https://docs.unstructured.io/open-source/installation/full-installation)
### Docling
[Docling](https://docling-project.github.io/docling/) provides document processing capabilities with support for multiple formats.
```python
config.set_provider_config("file_loader", "DoclingLoader", {})
```
??? tip "Setup Instructions"
1. Install Docling:
```bash
pip install docling
```
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats).

33
docs/configuration/index.md

@ -0,0 +1,33 @@
# Configuration Overview
DeepSearcher provides flexible configuration options for all its components. You can customize the following aspects of the system:
## 📋 Components
| Component | Purpose | Documentation |
|-----------|---------|---------------|
| **LLM** | Large Language Models for query processing | [LLM Configuration](llm.md) |
| **Embedding Models** | Text embedding for vector retrieval | [Embedding Models](embedding.md) |
| **Vector Database** | Storage and retrieval of vector embeddings | [Vector Database](vector_db.md) |
| **File Loader** | Loading and processing various file formats | [File Loader](file_loader.md) |
| **Web Crawler** | Gathering information from web sources | [Web Crawler](web_crawler.md) |
## 🔄 Configuration Method
DeepSearcher uses a consistent configuration approach for all components:
```python
from deepsearcher.configuration import Configuration, init_config
# Create configuration
config = Configuration()
# Set provider configurations
config.set_provider_config("[component]", "[provider]", {"option": "value"})
# Initialize with configuration
init_config(config=config)
```
For detailed configuration options for each component, please visit the corresponding documentation pages linked in the table above.
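For instance, a minimal end-to-end configuration combining providers documented on those pages (the OpenAI LLM, OpenAI embeddings, and a local Milvus Lite database) could look like this; treat the argument values as illustrative:

```python
from deepsearcher.configuration import Configuration, init_config

config = Configuration()

# Providers and arguments are illustrative; see the component pages for the full option lists
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})

init_config(config=config)
```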

192
docs/configuration/llm.md

@ -0,0 +1,192 @@
# LLM Configuration
DeepSearcher supports various Large Language Models (LLMs) for processing queries and generating responses.
## 📝 Basic Configuration
```python
config.set_provider_config("llm", "(LLMName)", "(Arguments dict)")
```
## 📋 Available LLM Providers
| Provider | Description | Key Models |
|----------|-------------|------------|
| **OpenAI** | OpenAI's API for GPT models | o1-mini, GPT-4 |
| **DeepSeek** | DeepSeek AI offering | deepseek-reasoner, coder |
| **Anthropic** | Anthropic's Claude models | claude-sonnet-4-0 |
| **Gemini** | Google's Gemini models | gemini-1.5-pro, gemini-2.0-flash |
| **XAI** | X.AI's Grok models | grok-2-latest |
| **Ollama** | Local LLM deployment | llama3, qwq, etc. |
| **SiliconFlow** | Enterprise AI platform | deepseek-r1 |
| **TogetherAI** | Multiple model options | llama-4, deepseek |
| **PPIO** | Cloud AI infrastructure | deepseek, llama |
| **Volcengine** | ByteDance LLM platform | deepseek-r1 |
| **GLM** | ChatGLM models | glm-4-plus |
| **Bedrock** | Amazon Bedrock LLMs | anthropic.claude, ai21.j2 |
| **Novita** | Novita AI models | Various options |
| **IBM watsonx.ai** | IBM Enterprise AI platform | Various options |
## 🔍 Provider Examples
### OpenAI
```python
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
```
*Requires `OPENAI_API_KEY` environment variable*
### DeepSeek
```python
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})
```
*Requires `DEEPSEEK_API_KEY` environment variable*
### IBM WatsonX
```python
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"})
```
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables*
## 📚 Additional Providers
??? example "DeepSeek from SiliconFlow"
```python
config.set_provider_config("llm", "SiliconFlow", {"model": "deepseek-ai/DeepSeek-R1"})
```
*Requires `SILICONFLOW_API_KEY` environment variable*
More details about SiliconFlow: [https://docs.siliconflow.cn/quickstart](https://docs.siliconflow.cn/quickstart)
??? example "DeepSeek from TogetherAI"
*Requires `TOGETHER_API_KEY` environment variable and `pip install together`*
For DeepSeek R1:
```python
config.set_provider_config("llm", "TogetherAI", {"model": "deepseek-ai/DeepSeek-R1"})
```
For Llama 4:
```python
config.set_provider_config("llm", "TogetherAI", {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"})
```
More details about TogetherAI: [https://www.together.ai/](https://www.together.ai/)
??? example "XAI Grok"
```python
config.set_provider_config("llm", "XAI", {"model": "grok-2-latest"})
```
*Requires `XAI_API_KEY` environment variable*
More details about XAI Grok: [https://docs.x.ai/docs/overview#featured-models](https://docs.x.ai/docs/overview#featured-models)
??? example "Claude"
```python
config.set_provider_config("llm", "Anthropic", {"model": "claude-sonnet-4-0"})
```
*Requires `ANTHROPIC_API_KEY` environment variable*
More details about Anthropic Claude: [https://docs.anthropic.com/en/home](https://docs.anthropic.com/en/home)
??? example "Google Gemini"
```python
config.set_provider_config('llm', 'Gemini', { 'model': 'gemini-2.0-flash' })
```
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`*
More details about Gemini: [https://ai.google.dev/gemini-api/docs](https://ai.google.dev/gemini-api/docs)
??? example "DeepSeek from PPIO"
```python
config.set_provider_config("llm", "PPIO", {"model": "deepseek/deepseek-r1-turbo"})
```
*Requires `PPIO_API_KEY` environment variable*
More details about PPIO: [https://ppinfra.com/docs/get-started/quickstart.html](https://ppinfra.com/docs/get-started/quickstart.html)
??? example "Ollama"
```python
config.set_provider_config("llm", "Ollama", {"model": "qwq"})
```
Follow [these instructions](https://github.com/jmorganca/ollama) to set up and run a local Ollama instance:
1. [Download](https://ollama.ai/download) and install Ollama
2. View available models via the [model library](https://ollama.ai/library)
3. Pull models with `ollama pull <name-of-model>`
4. By default, Ollama has a REST API on [http://localhost:11434](http://localhost:11434)
??? example "Volcengine"
```python
config.set_provider_config("llm", "Volcengine", {"model": "deepseek-r1-250120"})
```
*Requires `VOLCENGINE_API_KEY` environment variable*
More details about Volcengine: [https://www.volcengine.com/docs/82379/1099455](https://www.volcengine.com/docs/82379/1099455)
??? example "GLM"
```python
config.set_provider_config("llm", "GLM", {"model": "glm-4-plus"})
```
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`*
More details about GLM: [https://bigmodel.cn/dev/welcome](https://bigmodel.cn/dev/welcome)
??? example "Amazon Bedrock"
```python
config.set_provider_config("llm", "Bedrock", {"model": "us.deepseek.r1-v1:0"})
```
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`*
More details about Amazon Bedrock: [https://docs.aws.amazon.com/bedrock/](https://docs.aws.amazon.com/bedrock/)
??? example "Aliyun Bailian"
```python
config.set_provider_config("llm", "OpenAI", {"model": "deepseek-r1", "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"})
```
*Requires `OPENAI_API_KEY` environment variable*
More details about Aliyun Bailian models: [https://bailian.console.aliyun.com](https://bailian.console.aliyun.com)
??? example "IBM watsonx.ai LLM"
```python
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"})
```
With custom parameters:
```python
config.set_provider_config("llm", "WatsonX", {
"model": "ibm/granite-3-3-8b-instruct",
"max_new_tokens": 1000,
"temperature": 0.7,
"top_p": 0.9,
"top_k": 50
})
```
With space_id instead of project_id:
```python
config.set_provider_config("llm", "WatsonX", {
"model": "ibm/granite-3-3-8b-instruct""
})
```
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables and `pip install ibm-watsonx-ai`*
More details about WatsonX: [https://www.ibm.com/products/watsonx-ai/foundation-models](https://www.ibm.com/products/watsonx-ai/foundation-models)

52
docs/configuration/vector_db.md

@ -0,0 +1,52 @@
# Vector Database Configuration
DeepSearcher uses vector databases to store and retrieve document embeddings for efficient semantic search.
## 📝 Basic Configuration
```python
config.set_provider_config("vector_db", "(VectorDBName)", "(Arguments dict)")
```
Currently supported vector databases:
- Milvus (including Milvus Lite and Zilliz Cloud)
## 🔍 Milvus Configuration
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})
```
### Deployment Options
??? example "Local Storage with Milvus Lite"
Setting the `uri` as a local file (e.g., `./milvus.db`) automatically utilizes [Milvus Lite](https://milvus.io/docs/milvus_lite.md) to store all data in this file. This is the most convenient method for development and smaller datasets.
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})
```
??? example "Standalone Milvus Server"
For larger datasets, you can set up a more performant Milvus server using [Docker or Kubernetes](https://milvus.io/docs/quickstart.md). In this setup, use the server URI as your `uri` parameter:
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "token": ""})
```
    You can also specify other connection parameters supported by Milvus, such as `user`, `password`, or `secure`.
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "user": "<username>", "password":"<password>", "secure": True, "token": ""})
```
??? example "Zilliz Cloud (Managed Service)"
[Zilliz Cloud](https://zilliz.com/cloud) provides a fully managed cloud service for Milvus. To use Zilliz Cloud, adjust the `uri` and `token` according to the [Public Endpoint and API Key](https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details):
```python
config.set_provider_config("vector_db", "Milvus", {
"uri": "https://your-instance-id.api.gcp-us-west1.zillizcloud.com",
"token": "your_api_key"
})
```

97
docs/configuration/web_crawler.md

@ -0,0 +1,97 @@
# Web Crawler Configuration
DeepSearcher supports various web crawlers to collect data from websites for processing and indexing.
## 📝 Basic Configuration
```python
config.set_provider_config("web_crawler", "(WebCrawlerName)", "(Arguments dict)")
```
## 📋 Available Web Crawlers
| Crawler | Description | Key Feature |
|---------|-------------|-------------|
| **FireCrawlCrawler** | Cloud-based web crawling service | Simple API, managed service |
| **Crawl4AICrawler** | Browser automation crawler | Full JavaScript support |
| **JinaCrawler** | Content extraction service | High accuracy parsing |
| **DoclingCrawler** | Doc processing with crawling | Multiple format support |
## 🔍 Web Crawler Options
### FireCrawl
[FireCrawl](https://docs.firecrawl.dev/introduction) is a cloud-based web crawling service designed for AI applications.
**Key features:**
- Simple API
- Managed Service
- Advanced Parsing
```python
config.set_provider_config("web_crawler", "FireCrawlCrawler", {})
```
??? tip "Setup Instructions"
1. Sign up for FireCrawl and get an API key
2. Set the API key as an environment variable:
```bash
export FIRECRAWL_API_KEY="your_api_key"
```
3. For more information, see the [FireCrawl documentation](https://docs.firecrawl.dev/introduction)
### Crawl4AI
[Crawl4AI](https://docs.crawl4ai.com/) is a Python package for web crawling with browser automation capabilities.
```python
config.set_provider_config("web_crawler", "Crawl4AICrawler", {"browser_config": {"headless": True, "verbose": True}})
```
??? tip "Setup Instructions"
1. Install Crawl4AI:
```bash
pip install crawl4ai
```
2. Run the setup command:
```bash
crawl4ai-setup
```
3. For more information, see the [Crawl4AI documentation](https://docs.crawl4ai.com/)
### Jina Reader
[Jina Reader](https://jina.ai/reader/) is a service for extracting content from web pages with high accuracy.
```python
config.set_provider_config("web_crawler", "JinaCrawler", {})
```
??? tip "Setup Instructions"
1. Get a Jina API key
2. Set the API key as an environment variable:
```bash
export JINA_API_TOKEN="your_api_key"
# or
export JINAAI_API_KEY="your_api_key"
```
3. For more information, see the [Jina Reader documentation](https://jina.ai/reader/)
### Docling Crawler
[Docling](https://docling-project.github.io/docling/) provides web crawling capabilities alongside its document processing features.
```python
config.set_provider_config("web_crawler", "DoclingCrawler", {})
```
??? tip "Setup Instructions"
1. Install Docling:
```bash
pip install docling
```
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats)

159
docs/contributing/index.md

@ -0,0 +1,159 @@
# Contributing to DeepSearcher
We welcome contributions from everyone. This document provides guidelines to make the contribution process straightforward.
## Pull Request Process
1. Fork the repository and create your branch from `master`.
2. Make your changes.
3. Run tests and linting to ensure your code meets the project's standards.
4. Update documentation if necessary.
5. Submit a pull request.
## Linting and Formatting
Keeping a consistent style for code, code comments, commit messages, and PR descriptions will greatly accelerate your PR review process.
We require you to run code linter and formatter before submitting your pull requests:
To check the coding styles:
```shell
make lint
```
To fix the coding styles:
```shell
make format
```
Our CI pipeline also runs these checks automatically on all pull requests to ensure code quality and consistency.
## Development Environment Setup with uv
DeepSearcher uses [uv](https://github.com/astral-sh/uv) as the recommended package manager. uv is a fast, reliable Python package manager and installer. The project's `pyproject.toml` is configured to work with uv, which will provide faster dependency resolution and package installation compared to traditional tools.
### Install Project in Development Mode (aka Editable Installation)
1. Install uv if you haven't already:
    Follow the [official installation instructions](https://docs.astral.sh/uv/getting-started/installation/).
2. Clone the repository and navigate to the project directory:
```shell
git clone https://github.com/zilliztech/deep-searcher.git && cd deep-searcher
```
3. Synchronize and install dependencies:
```shell
uv sync
source .venv/bin/activate
```
    `uv sync` installs all dependencies specified in the `uv.lock` file, and `source .venv/bin/activate` activates the virtual environment.
- (Optional) To install all optional dependencies:
```shell
uv sync --all-extras --dev
```
- (Optional) To install specific optional dependencies:
```shell
# Take optional `ollama` dependency for example
uv sync --extra ollama
```
For more optional dependencies, refer to the `[project.optional-dependencies]` part of `pyproject.toml` file.
### Adding Dependencies
When you need to add new dependencies to the `pyproject.toml` file, you can use the following commands:
```shell
uv add <package_name>
```
DeepSearcher uses optional dependencies to keep the default installation lightweight. Optional features can be installed using the syntax `deepsearcher[<extra>]`. To add a dependency to an optional extra, use the following command:
```shell
uv add <package_name> --optional <extra>
```
For more details, refer to the [official Managing dependencies documentation](https://docs.astral.sh/uv/concepts/projects/dependencies/).
### Dependencies Locking
For development, we use lockfiles to ensure consistent dependencies. You can use
```shell
uv lock --check
```
to verify if your lockfile is up-to-date with your project dependencies.
When you modify or add dependencies in the project, the lockfile will be automatically updated the next time you run a uv command. You can also explicitly update the lockfile using:
```shell
uv lock
```
While the environment is synced automatically, it may also be explicitly synced using uv sync:
```shell
uv sync
```
Syncing the environment manually is especially useful for ensuring your editor has the correct versions of dependencies.
For more detailed information about dependency locking and syncing, refer to the [official Locking and syncing documentation](https://docs.astral.sh/uv/concepts/projects/sync/).
## Running Tests
Before submitting your pull request, make sure to run the test suite to ensure your changes haven't introduced any regressions.
### Installing Test Dependencies
First, ensure you have pytest installed. If you haven't installed the development dependencies yet, you can do so with:
```shell
uv sync --all-extras --dev
```
This will install all development dependencies and optional dependencies including pytest and other testing tools.
### Running the Tests
To run all tests in the `tests` directory:
```shell
uv run pytest tests
```
For more verbose output that shows individual test results:
```shell
uv run pytest tests -v
```
You can also run tests for specific directories or files. For example:
```shell
# Run tests in a specific directory
uv run pytest tests/embedding
# Run tests in a specific file
uv run pytest tests/embedding/test_bedrock_embedding.py
# Run a specific test class
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding
# Run a specific test method
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding::test_init_default
```
The `-v` flag (verbose mode) provides more detailed output, showing each test case and its result individually. This is particularly useful when you want to see which specific tests are passing or failing.
## Developer Certificate of Origin (DCO)
All contributions require a sign-off, acknowledging the [Developer Certificate of Origin](https://developercertificate.org/).
Add a `Signed-off-by` line to your commit message:
```text
Signed-off-by: Your Name <your.email@example.com>
```

65
docs/examples/basic_example.md

@ -0,0 +1,65 @@
# Basic Example
This example demonstrates the core functionality of DeepSearcher - loading documents and performing semantic search.
## Overview
The script performs these steps:
1. Configures DeepSearcher with default settings
2. Loads a PDF document about Milvus
3. Asks a question about Milvus and vector databases
4. Displays token usage information
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
config = Configuration() # Customize your config here
init_config(config=config)
# You should clone the milvus docs repo to your local machine first, execute:
# git clone https://github.com/milvus-io/milvus-docs.git
# Then replace the path below with the path to the milvus-docs repo on your local machine
# import glob
# all_md_files = glob.glob('xxx/milvus-docs/site/en/**/*.md', recursive=True)
# load_from_local_files(paths_or_directory=all_md_files, collection_name="milvus_docs", collection_description="All Milvus Documents")
# Hint: You can also load a single file, please execute it in the root directory of the deep searcher project
load_from_local_files(
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
collection_name="milvus_docs",
collection_description="All Milvus Documents",
# force_new_collection=True, # If you want to drop origin collection and create a new collection every time, set force_new_collection to True
)
question = "Write a report comparing Milvus with other vector databases."
_, _, consumed_token = query(question, max_iter=1)
print(f"Consumed tokens: {consumed_token}")
```
## Running the Example
1. Make sure you have installed DeepSearcher: `pip install deepsearcher`
2. Create a data directory and add a PDF about Milvus (or use your own data)
3. Run the script: `python basic_example.py`
## Key Concepts
- **Configuration**: Using the default configuration
- **Document Loading**: Loading a single PDF file
- **Querying**: Asking a complex question requiring synthesis of information
- **Token Tracking**: Monitoring token usage from the LLM

101
docs/examples/docling.md

@ -0,0 +1,101 @@
# Docling Integration Example
This example shows how to use Docling for loading local files and crawling web content.
## Overview
The script demonstrates:
1. Configuring DeepSearcher to use Docling for both file loading and web crawling
2. Loading data from local files using Docling's document parser
3. Crawling web content from multiple sources including Markdown and PDF files
4. Querying the loaded data
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
# Suppress unnecessary logging from third-party libraries
logging.getLogger("httpx").setLevel(logging.WARNING)
def main():
    # Step 1: Initialize configuration
    config = Configuration()

    # Configure Vector Database and Docling providers
    config.set_provider_config("vector_db", "Milvus", {})
    config.set_provider_config("file_loader", "DoclingLoader", {})
    config.set_provider_config("web_crawler", "DoclingCrawler", {})

    # Apply the configuration
    init_config(config)

    # Step 2a: Load data from a local file using DoclingLoader
    local_file = "your_local_file_or_directory"
    local_collection_name = "DoclingLocalFiles"
    local_collection_description = "Milvus Documents loaded using DoclingLoader"

    print("\n=== Loading local files using DoclingLoader ===")
    try:
        load_from_local_files(
            paths_or_directory=local_file,
            collection_name=local_collection_name,
            collection_description=local_collection_description,
            force_new_collection=True,
        )
        print(f"Successfully loaded: {local_file}")
    except ValueError as e:
        print(f"Validation error: {str(e)}")
    except Exception as e:
        print(f"Error: {str(e)}")
    print("Successfully loaded all local files")

    # Step 2b: Crawl URLs using DoclingCrawler
    urls = [
        # Markdown documentation files
        "https://milvus.io/docs/quickstart.md",
        "https://milvus.io/docs/overview.md",
        # PDF example - can handle various URL formats
        "https://arxiv.org/pdf/2408.09869",
    ]
    web_collection_name = "DoclingWebCrawl"
    web_collection_description = "Milvus Documentation crawled using DoclingCrawler"

    print("\n=== Crawling web pages using DoclingCrawler ===")
    load_from_website(
        urls=urls,
        collection_name=web_collection_name,
        collection_description=web_collection_description,
        force_new_collection=True,
    )
    print("Successfully crawled all URLs")

    # Step 3: Query the loaded data
    question = "What is Milvus?"
    result = query(question)


if __name__ == "__main__":
    main()
```
## Running the Example
1. Install DeepSearcher and Docling: `pip install deepsearcher docling`
2. Replace `your_local_file_or_directory` with your actual file/directory path
3. Run the script: `python load_and_crawl_using_docling.py`
## Key Concepts
- **Multiple Providers**: Configuring both file loader and web crawler to use Docling
- **Local Files**: Loading documents from your local filesystem
- **Web Crawling**: Retrieving content from multiple web URLs with different formats
- **Error Handling**: Graceful error handling for loading operations

82
docs/examples/firecrawl.md

@ -0,0 +1,82 @@
# FireCrawl Integration Example
This example demonstrates how to use FireCrawl with DeepSearcher to crawl and extract content from websites.
## Overview
FireCrawl is a specialized web crawling service designed for AI applications. This example shows:
1. Setting up FireCrawl with DeepSearcher
2. Configuring API keys for the service
3. Crawling a website and extracting content
4. Querying the extracted content
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_website
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
# Suppress unnecessary logging from third-party libraries
logging.getLogger("httpx").setLevel(logging.WARNING)
# Set API keys (ensure these are set securely in real applications)
os.environ['OPENAI_API_KEY'] = 'sk-***************'
os.environ['FIRECRAWL_API_KEY'] = 'fc-***************'
def main():
    # Step 1: Initialize configuration
    config = Configuration()

    # Set up Vector Database (Milvus) and Web Crawler (FireCrawlCrawler)
    config.set_provider_config("vector_db", "Milvus", {})
    config.set_provider_config("web_crawler", "FireCrawlCrawler", {})

    # Apply the configuration
    init_config(config)

    # Step 2: Load data from a website into Milvus
    website_url = "https://example.com"  # Replace with your target website
    collection_name = "FireCrawl"
    collection_description = "All Milvus Documents"

    # Crawl a single webpage
    load_from_website(urls=website_url, collection_name=collection_name, collection_description=collection_description)
    # FireCrawl only: crawl multiple webpages by setting max_depth, limit, and allow_backward_links
    # load_from_website(urls=website_url, max_depth=2, limit=20, allow_backward_links=True, collection_name=collection_name, collection_description=collection_description)

    # Step 3: Query the loaded data
    question = "What is Milvus?"  # Replace with your actual question
    result = query(question)


if __name__ == "__main__":
    main()
```
## Running the Example
1. Install DeepSearcher: `pip install deepsearcher`
2. Sign up for a FireCrawl API key at [firecrawl.dev](https://docs.firecrawl.dev/introduction)
3. Replace the placeholder API keys with your actual keys
4. Change the `website_url` to the website you want to crawl
5. Run the script: `python load_website_using_firecrawl.py`
## Advanced Crawling Options
FireCrawl provides several advanced options for crawling:
- `max_depth`: Control how many links deep the crawler should go
- `limit`: Set a maximum number of pages to crawl
- `allow_backward_links`: Allow the crawler to navigate to parent/sibling pages
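For instance, a minimal sketch that combines these options in a single call (the URL, depth, and page limit below are illustrative values only):

```python
from deepsearcher.offline_loading import load_from_website

# Crawl up to 20 pages, follow links up to 2 levels deep,
# and allow navigation back to parent/sibling pages
load_from_website(
    urls="https://example.com",
    max_depth=2,
    limit=20,
    allow_backward_links=True,
    collection_name="FireCrawl",
    collection_description="All Milvus Documents",
)
```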
## Key Concepts
- **Web Crawling**: Extracting content from websites
- **Depth Control**: Managing how deep the crawler navigates
- **URL Processing**: Handling multiple pages from a single starting point
- **Vector Storage**: Storing the crawled content in a vector database for search

15
docs/examples/index.md

@ -0,0 +1,15 @@
# Usage Examples
DeepSearcher provides several example scripts to help you get started quickly. These examples demonstrate different ways to use DeepSearcher for various use cases.
## 📋 Available Examples
| Example | Description | Key Features |
|---------|-------------|--------------|
| [Basic Example](basic_example.md) | Simple example showing core functionality | Loading PDFs, querying |
| [Docling Integration](docling.md) | Using Docling for file loading and web crawling | Multiple sources, local and web |
| [Unstructured Integration](unstructured.md) | Using Unstructured for parsing documents | API and local processing |
| [FireCrawl Integration](firecrawl.md) | Web crawling with FireCrawl | Website data extraction |
| [Oracle Setup](oracle.md) | Advanced configuration with Oracle | Path setup, token tracking |
Click on any example to see detailed code and explanations.

70
docs/examples/oracle.md

@ -0,0 +1,70 @@
# Oracle Example
This example demonstrates an advanced setup using path manipulation and detailed token tracking.
## Overview
This example shows:
1. Setting up Python path for importing from the parent directory
2. Initializing DeepSearcher with default configuration
3. Loading a PDF document and creating a vector database
4. Performing a complex query with full result and token tracking
5. Optional token consumption monitoring
## Code Example
```python
import sys, os
from pathlib import Path
script_directory = Path(__file__).resolve().parent.parent
sys.path.append(os.path.abspath(script_directory))
import logging
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
# Customize your config here
from deepsearcher.configuration import Configuration, init_config
config = Configuration()
init_config(config=config)
# Load your local data
# Hint: You can load from a directory or a single file; run this script from the root directory of the deep-searcher project
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(
    paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
    collection_name="milvus_docs",
    collection_description="All Milvus Documents",
    # force_new_collection=True,  # Set to True to drop the existing collection and create a new one on every run
)

# Query
from deepsearcher.online_query import query
question = "Write a report comparing Milvus with other vector databases."
answer, retrieved_results, consumed_token = query(question)
print(answer)

# Get consumed tokens: roughly 25k-30k tokens when using the OpenAI gpt-4o model
# print(f"Consumed tokens: {consumed_token}")
```
## Running the Example
1. Install DeepSearcher: `pip install deepsearcher`
2. Make sure you have the data directory with "WhatisMilvus.pdf" (or change the path)
3. Run the script: `python basic_example_oracle.py`
## Key Concepts
- **Path Management**: Setting up Python path to import from parent directory
- **Query Unpacking**: Getting full result details (answer, retrieved context, and tokens)
- **Complex Querying**: Asking for a comparative analysis that requires synthesis
- **Token Economy**: Monitoring token usage for cost optimization

76
docs/examples/unstructured.md

@ -0,0 +1,76 @@
# Unstructured Integration Example
This example demonstrates how to use the Unstructured library with DeepSearcher for advanced document parsing.
## Overview
Unstructured is a powerful document processing library that can extract content from various document formats. This example shows:
1. Setting up Unstructured with DeepSearcher
2. Configuring the Unstructured API keys (optional)
3. Loading documents with Unstructured's parser
4. Querying the extracted content
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
# Suppress unnecessary logging from third-party libraries
logging.getLogger("httpx").setLevel(logging.WARNING)
# (Optional) Set API keys (ensure these are set securely in real applications)
os.environ['UNSTRUCTURED_API_KEY'] = '***************'
os.environ['UNSTRUCTURED_API_URL'] = '***************'
def main():
    # Step 1: Initialize configuration
    config = Configuration()

    # Configure Vector Database (Milvus) and File Loader (UnstructuredLoader)
    config.set_provider_config("vector_db", "Milvus", {})
    config.set_provider_config("file_loader", "UnstructuredLoader", {})

    # Apply the configuration
    init_config(config)

    # Step 2: Load data from a local file or directory into Milvus
    input_file = "your_local_file_or_directory"  # Replace with your actual file path
    collection_name = "Unstructured"
    collection_description = "All Milvus Documents"
    load_from_local_files(paths_or_directory=input_file, collection_name=collection_name, collection_description=collection_description)

    # Step 3: Query the loaded data
    question = "What is Milvus?"  # Replace with your actual question
    result = query(question)


if __name__ == "__main__":
    main()
```
## Running the Example
1. Install DeepSearcher with Unstructured support: `pip install deepsearcher "unstructured[all-docs]"`
2. (Optional) Sign up for the Unstructured API at [unstructured.io](https://unstructured.io) if you want to use their cloud service
3. Replace `your_local_file_or_directory` with your own document file path or directory
4. Run the script: `python load_local_file_using_unstructured.py`
## Unstructured Options
You can use Unstructured in two modes:
1. **API Mode**: Set the environment variables `UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_API_URL` to use their cloud service
2. **Local Mode**: Don't set the environment variables, and Unstructured will process documents locally on your machine
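A minimal sketch of switching between the two modes; the values shown are placeholders, and for local mode you simply leave both variables unset:

```python
import os

# API mode: route parsing through the Unstructured cloud service
os.environ["UNSTRUCTURED_API_KEY"] = "your-api-key"           # placeholder
os.environ["UNSTRUCTURED_API_URL"] = "your-api-endpoint-url"  # placeholder

# Local mode: remove (or never set) both variables so documents are parsed on your machine
# os.environ.pop("UNSTRUCTURED_API_KEY", None)
# os.environ.pop("UNSTRUCTURED_API_URL", None)
```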
## Key Concepts
- **Document Processing**: Advanced document parsing for various formats
- **API/Local Options**: Flexibility in deployment based on your needs
- **Integration**: Seamless integration with DeepSearcher's vector database and query capabilities

73
docs/faq/index.md

@ -0,0 +1,73 @@
# Frequently Asked Questions
## 🔍 Common Issues and Solutions
---
### 💬 Q1: Why does parsing the LLM output format fail / How do I select the right LLM?
<div class="faq-answer">
<p><strong>Solution:</strong> Small language models often struggle to follow prompts and generate responses in the expected format. For better results, we recommend using large reasoning models such as:</p>
<ul>
<li>DeepSeek-R1 671B</li>
<li>OpenAI o-series models</li>
<li>Claude 3.7 Sonnet</li>
</ul>
<p>These models provide superior reasoning capabilities and are more likely to produce correctly formatted outputs.</p>
</div>
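For example, a minimal sketch that switches the LLM to a larger reasoning model (the provider and model names mirror the configuration examples elsewhere in these docs; substitute any model you have access to):

```python
from deepsearcher.configuration import Configuration, init_config

config = Configuration()
# Use a large reasoning model instead of a small instruct model
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})
init_config(config=config)
```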
---
### 🌐 Q2: "We couldn't connect to 'https://huggingface.co'" error
<div class="faq-answer">
<p><strong>Error Message:</strong></p>
<div class="error-message">
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like GPTCache/paraphrase-albert-small-v2 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
</div>
<p><strong>Solution:</strong> This issue is typically caused by network access problems to Hugging Face. Try these solutions:</p>
<details>
<summary><strong>Network Issue? Try Using a Mirror</strong></summary>
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
</details>
<details>
<summary><strong>Permission Issue? Set Up a Personal Token</strong></summary>
```bash
export HUGGING_FACE_HUB_TOKEN=xxxx
```
</details>
</div>
---
### 📓 Q3: DeepSearcher doesn't run in Jupyter notebook
<div class="faq-answer">
<p><strong>Solution:</strong> This is a common issue with asyncio in Jupyter notebooks. Install <code>nest_asyncio</code> and add the following code to the top of your notebook:</p>
<div class="code-steps">
<p><strong>Step 1:</strong> Install the required package</p>
```bash
pip install nest_asyncio
```
<p><strong>Step 2:</strong> Add these lines to the beginning of your notebook</p>
```python
import nest_asyncio
nest_asyncio.apply()
```
</div>
</div>

8
docs/future_plans.md

@ -0,0 +1,8 @@
# Future Plans
- Enhance web crawling functionality
- Support more vector databases (e.g., FAISS...)
- Add support for additional large models
- Provide RESTful API interface (**DONE**)
We welcome contributions! Star & Fork the project and help us build a more powerful DeepSearcher! 🎯

45
docs/index.md

@ -0,0 +1,45 @@
# 🔍 DeepSearcher
![DeepSearcher](./assets/pic/logo.png)
<div align="center">
<a href="https://opensource.org/licenses/Apache-2.0">
<img height="28" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg?style=flat" alt="License">
</a>
<a href="https://twitter.com/zilliz_universe">
<img height="28" src="https://img.shields.io/badge/Follow-%40Zilliz-1DA1F2?style=flat&logo=twitter" alt="Twitter">
</a>
<a href="https://discord.gg/mKc3R95yE5">
<img height="28" src="https://img.shields.io/badge/Discord-Join%20Chat-5865F2?style=flat&logo=discord&logoColor=white" alt="Discord">
</a>
</div>
---
## ✨ Overview
DeepSearcher combines cutting-edge LLMs (OpenAI o1, o3-mini, DeepSeek, Grok 3, Claude 4 Sonnet, Llama 4, QwQ, etc.) and Vector Databases (Milvus, Zilliz Cloud etc.) to perform search, evaluation, and reasoning based on private data, providing highly accurate answers and comprehensive reports.
> **Perfect for:** Enterprise knowledge management, intelligent Q&A systems, and information retrieval scenarios.
![Architecture](./assets/pic/deep-searcher-arch.png)
## 🚀 Key Features
| Feature | Description |
|---------|-------------|
| 🔒 **Private Data Search** | Maximizes utilization of enterprise internal data while ensuring data security. When necessary, integrates online content for more accurate answers. |
| 🗄️ **Vector Database Management** | Supports Milvus and other vector databases, allowing data partitioning for efficient retrieval. |
| 🧩 **Flexible Embedding Options** | Compatible with multiple embedding models for optimal selection based on your needs. |
| 🤖 **Multiple LLM Support** | Supports DeepSeek, OpenAI, and other large models for intelligent Q&A and content generation. |
| 📄 **Document Loader** | Supports loading local files and crawling web content from URLs. |
## 🎬 Demo
![Demo](./assets/pic/demo.gif)

64
docs/installation/development.md

@ -0,0 +1,64 @@
# 🛠️ Development Mode Installation
This guide is for contributors who want to modify DeepSearcher's code or develop new features.
## 📋 Prerequisites
- Python 3.10 or higher
- git
- [uv](https://github.com/astral-sh/uv) package manager (recommended for faster installation)
## 🔄 Installation Steps
### Step 1: Install uv (Recommended)
[uv](https://github.com/astral-sh/uv) is a faster alternative to pip for Python package management.
=== "Using pip"
```bash
pip install uv
```
=== "Using curl (Unix/macOS)"
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
=== "Using PowerShell (Windows)"
```powershell
irm https://astral.sh/uv/install.ps1 | iex
```
For more options, see the [official uv installation guide](https://docs.astral.sh/uv/getting-started/installation/).
### Step 2: Clone the repository
```bash
git clone https://github.com/zilliztech/deep-searcher.git
cd deep-searcher
```
### Step 3: Set up the development environment
=== "Using uv (Recommended)"
```bash
uv sync
source .venv/bin/activate
```
=== "Using pip"
```bash
python -m venv .venv
source .venv/bin/activate # On Windows: .venv\Scripts\activate
pip install -e ".[dev,all]"
```
## 🧪 Running Tests
```bash
pytest tests/
```
## 📚 Additional Resources
For more detailed development setup instructions, including contribution guidelines, code style, and testing procedures, please refer to the [CONTRIBUTING.md](https://github.com/zilliztech/deep-searcher/blob/main/CONTRIBUTING.md) file in the repository.

29
docs/installation/index.md

@ -0,0 +1,29 @@
# 🔧 Installation
DeepSearcher offers multiple installation methods to suit different user needs.
## 📋 Installation Options
| Method | Best For | Description |
|--------|----------|-------------|
| [📦 Installation via pip](pip.md) | Most users | Quick and easy installation using pip package manager |
| [🛠️ Development mode](development.md) | Contributors | Setup for those who want to modify the code or contribute |
## 🚀 Quick Start
Once installed, you can verify your installation:
```python
from deepsearcher.configuration import Configuration
from deepsearcher.online_query import query
# Initialize with default configuration
config = Configuration()
print("DeepSearcher installed successfully!")
```
## 💻 System Requirements
- Python 3.10 or higher
- 4GB RAM minimum (8GB+ recommended)
- Internet connection for downloading models and dependencies

52
docs/installation/pip.md

@ -0,0 +1,52 @@
# 📦 Installation via pip
This method is recommended for most users who want to use DeepSearcher without modifying its source code.
## 📋 Prerequisites
- Python 3.10 or higher
- pip package manager (included with Python)
- Virtual environment tool (recommended)
## 🔄 Step-by-Step Installation
### Step 1: Create a virtual environment
```bash
python -m venv .venv
```
### Step 2: Activate the virtual environment
=== "Linux/macOS"
```bash
source .venv/bin/activate
```
=== "Windows"
```bash
.venv\Scripts\activate
```
### Step 3: Install DeepSearcher
```bash
pip install deepsearcher
```
## 🧩 Optional Dependencies
DeepSearcher supports various integrations through optional dependencies.
| Integration | Command | Description |
|-------------|---------|-------------|
| Ollama | `pip install "deepsearcher[ollama]"` | For local LLM deployment |
| All extras | `pip install "deepsearcher[all]"` | Installs all optional dependencies |
## ✅ Verify Installation
```python
# Simple verification
from deepsearcher import __version__
print(f"DeepSearcher version: {__version__}")
```

75
docs/integrations/index.md

@ -0,0 +1,75 @@
# Module Support
DeepSearcher supports various integration modules including embedding models, large language models, document loaders and vector databases.
## 📊 Overview
| Module Type | Count | Description |
|-------------|-------|-------------|
| [Embedding Models](#embedding-models) | 7+ | Text vectorization tools |
| [Large Language Models](#llm-support) | 11+ | Query processing and text generation |
| [Document Loaders](#document-loader) | 5+ | Parse and process documents in various formats |
| [Vector Databases](#vector-database-support) | 2+ | Store and retrieve vector data |
## 🔢 Embedding Models {#embedding-models}
Support for various embedding models to convert text into vector representations for semantic search.
| Provider | Required Environment Variables | Features |
|----------|--------------------------------|---------|
| **[Open-source models](https://milvus.io/docs/embeddings.md)** | None | Locally runnable open-source models |
| **[OpenAI](https://platform.openai.com/docs/guides/embeddings/use-cases)** | `OPENAI_API_KEY` | High-quality embeddings, easy to use |
| **[VoyageAI](https://docs.voyageai.com/embeddings/)** | `VOYAGE_API_KEY` | Embeddings optimized for retrieval |
| **[Amazon Bedrock](https://docs.aws.amazon.com/bedrock/)** | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` | AWS integration, enterprise-grade |
| **[FastEmbed](https://qdrant.github.io/fastembed/)** | None | Fast lightweight embeddings |
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Flexible cloud embeddings |
| **[Novita AI](https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings)** | `NOVITA_API_KEY` | Rich model selection |
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmembedding)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform |
## 🧠 Large Language Models {#llm-support}
Support for various large language models (LLMs) to process queries and generate responses.
| Provider | Required Environment Variables | Features |
|----------|--------------------------------|---------|
| **[OpenAI](https://platform.openai.com/docs/models)** | `OPENAI_API_KEY` | GPT model family |
| **[DeepSeek](https://api-docs.deepseek.com/)** | `DEEPSEEK_API_KEY` | Powerful reasoning capabilities |
| **[XAI Grok](https://x.ai/blog/grok-3)** | `XAI_API_KEY` | Real-time knowledge and humor |
| **[Anthropic Claude](https://docs.anthropic.com/en/home)** | `ANTHROPIC_API_KEY` | Excellent long-context understanding |
| **[SiliconFlow](https://docs.siliconflow.cn/en/userguide/introduction)** | `SILICONFLOW_API_KEY` | Enterprise inference service |
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Diverse model support |
| **[TogetherAI](https://docs.together.ai/docs/introduction)** | `TOGETHER_API_KEY` | Wide range of open-source models |
| **[Google Gemini](https://ai.google.dev/gemini-api/docs)** | `GEMINI_API_KEY` | Google's multimodal models |
| **[SambaNova](https://sambanova.ai/)** | `SAMBANOVA_API_KEY` | High-performance AI platform |
| **[Ollama](https://ollama.com/)** | None | Local LLM deployment |
| **[Novita AI](https://novita.ai/docs/guides/introduction)** | `NOVITA_API_KEY` | Diverse AI services |
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmfm)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform |
## 📄 Document Loader {#document-loader}
Support for loading and processing documents from various sources.
### Local File Loaders
| Loader | Supported Formats | Required Environment Variables |
|--------|-------------------|--------------------------------|
| **Built-in Loader** | PDF, TXT, MD | None |
| **[Unstructured](https://unstructured.io/)** | Multiple document formats | `UNSTRUCTURED_API_KEY`, `UNSTRUCTURED_API_URL` (optional) |
### Web Crawlers
| Crawler | Description | Required Environment Variables/Setup |
|---------|-------------|--------------------------------------|
| **[FireCrawl](https://docs.firecrawl.dev/introduction)** | Crawler designed for AI applications | `FIRECRAWL_API_KEY` |
| **[Jina Reader](https://jina.ai/reader/)** | High-accuracy web content extraction | `JINA_API_TOKEN` |
| **[Crawl4AI](https://docs.crawl4ai.com/)** | Browser automation crawler | Run `crawl4ai-setup` for first-time use |
## 💾 Vector Database Support {#vector-database-support}
Support for various vector databases for efficient storage and retrieval of embeddings.
| Database | Description | Features |
|----------|-------------|----------|
| **[Milvus](https://milvus.io/)** | Open-source vector database | High-performance, scalable |
| **[Zilliz Cloud](https://www.zilliz.com/)** | Managed Milvus service | Fully managed, maintenance-free |
| **[Qdrant](https://qdrant.tech/)** | Vector similarity search engine | Simple, efficient |
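As a minimal sketch of how these modules fit together, the snippet below selects one provider from each table above; the provider names and config keys follow the examples elsewhere in these docs, and the API keys are read from the environment variables listed in the tables:

```python
import os
from deepsearcher.configuration import Configuration, init_config

# API keys are read from the environment variables listed above (values are placeholders)
os.environ.setdefault("OPENAI_API_KEY", "sk-***************")
os.environ.setdefault("FIRECRAWL_API_KEY", "fc-***************")

config = Configuration()
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"})
config.set_provider_config("web_crawler", "FireCrawlCrawler", {})
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": "root:Milvus"})
init_config(config=config)
```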

0
docs/overrides/.gitkeep

78
docs/stylesheets/extra.css

@ -0,0 +1,78 @@
/* Add your custom CSS here */
/* FAQ Styling */
.faq-answer {
background-color: #f8f9fa;
border-left: 4px solid #5c6bc0;
padding: 15px 20px;
margin-bottom: 20px;
border-radius: 4px;
}
.error-message {
background-color: #ffebee;
border-left: 4px solid #f44336;
padding: 10px 15px;
margin: 10px 0;
font-family: monospace;
white-space: pre-wrap;
font-size: 0.9em;
border-radius: 4px;
}
.code-steps {
margin: 15px 0;
}
.code-steps p {
margin-bottom: 5px;
}
details {
margin-bottom: 10px;
padding: 10px;
background-color: #e3f2fd;
border-radius: 4px;
}
summary {
cursor: pointer;
padding: 8px 0;
}
details[open] summary {
margin-bottom: 10px;
}
h3 {
margin-top: 30px;
margin-bottom: 15px;
}
/* Add smooth transition for collapsible sections */
details summary {
transition: margin 0.3s ease;
}
/* Styling for code blocks within FAQ */
.faq-answer pre {
border-radius: 4px;
margin: 10px 0;
}
/* Add styling for list items */
.faq-answer ul {
padding-left: 25px;
}
.faq-answer ul li {
margin: 5px 0;
}
/* Add horizontal rule styling */
hr {
border: 0;
height: 1px;
background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.1), rgba(0, 0, 0, 0));
margin: 25px 0;
}

63
docs/usage/cli.md

@ -0,0 +1,63 @@
# 💻 Command Line Interface
DeepSearcher provides a convenient command line interface for loading data and querying.
## 📥 Loading Data
Load data from files or URLs:
```shell
deepsearcher load "your_local_path_or_url"
```
Load into a specific collection:
```shell
deepsearcher load "your_local_path_or_url" --collection_name "your_collection_name" --collection_desc "your_collection_description"
```
### Examples
#### Loading from local files:
```shell
# Load a single file
deepsearcher load "/path/to/your/local/file.pdf"
# Load multiple files at once
deepsearcher load "/path/to/your/local/file1.pdf" "/path/to/your/local/file2.md"
```
#### Loading from URL:
> **Note:** Set `FIRECRAWL_API_KEY` in your environment variables. See [FireCrawl documentation](https://docs.firecrawl.dev/introduction) for more details.
```shell
deepsearcher load "https://www.wikiwand.com/en/articles/DeepSeek"
```
## 🔍 Querying Data
Query your loaded data:
```shell
deepsearcher query "Write a report about xxx."
```
## ❓ Help Commands
Get general help information:
```shell
deepsearcher --help
```
Get help for specific subcommands:
```shell
# Help for load command
deepsearcher load --help
# Help for query command
deepsearcher query --help
```

73
docs/usage/deployment.md

@ -0,0 +1,73 @@
# 🌐 Deployment
This guide explains how to deploy DeepSearcher as a web service.
## ⚙️ Configure Modules
You can configure all arguments by modifying the configuration file:
```yaml
# config.yaml - https://github.com/zilliztech/deep-searcher/blob/main/config.yaml
llm:
provider: "OpenAI"
api_key: "your_openai_api_key_here"
# Additional configuration options...
```
> **Important:** Set your OpenAI API key in the `api_key` field of the `llm` section in the YAML file, or export `OPENAI_API_KEY` as an environment variable.
## 🚀 Start Service
The main script starts a FastAPI service at the default address `localhost:8000`:
```shell
$ python main.py
```
Once started, you should see output indicating the service is running successfully.
## 🔍 Access via Browser
You can access the web service through your browser:
1. Open your browser and navigate to [http://localhost:8000/docs](http://localhost:8000/docs)
2. The Swagger UI will display all available API endpoints
3. Click the "Try it out" button on any endpoint to interact with it
4. Fill in the required parameters and execute the request
This interactive documentation makes it easy to test and use all DeepSearcher API functionality.
## 🐳 Docker Deployment
You can also deploy DeepSearcher using Docker for easier environment setup and management.
### Build Docker Image
To build the Docker image, run the following command from the project root directory:
```shell
docker build -t deepsearcher:latest .
```
This command builds a Docker image using the Dockerfile in the current directory and tags it as `deepsearcher:latest`.
### Run Docker Container
Once the image is built, you can run it as a container:
```shell
docker run -p 8000:8000 \
-e OPENAI_API_KEY=your_openai_api_key \
-v $(pwd)/data:/app/data \
-v $(pwd)/logs:/app/logs \
-v $(pwd)/deepsearcher/config.yaml:/app/deepsearcher/config.yaml \
deepsearcher:latest
```
This command:
- Maps port 8000 from the container to port 8000 on your host
- Sets the `OPENAI_API_KEY` environment variable
- Mounts the local `data`, `logs`, and configuration file to the container
- Runs the previously built `deepsearcher:latest` image
> **Note:** Replace `your_openai_api_key` with your actual OpenAI API key, or set any other environment variables required for your configuration.

13
docs/usage/index.md

@ -0,0 +1,13 @@
# 📚 Usage Guide
DeepSearcher provides multiple ways to use the system, including Python API, command line interface, and web service deployment.
## 🔍 Usage Overview
| Guide | Description |
|-------|-------------|
| [🚀 Quick Start](quick_start.md) | Quick start guide for Python API integration |
| [💻 Command Line Interface](cli.md) | Instructions for using the command line interface |
| [🌐 Deployment](deployment.md) | Guide for deploying as a web service |
Choose the method that best suits your needs and follow the instructions on the corresponding page.

42
docs/usage/quick_start.md

@ -0,0 +1,42 @@
# 🚀 Quick Start
## Prerequisites
✅ Before you begin, prepare your `OPENAI_API_KEY` in your environment variables. If you change the LLM in the configuration, make sure to prepare the corresponding API key.
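If you prefer to set the key from Python rather than your shell, a minimal sketch (the key value shown is a placeholder):

```python
import os

os.environ["OPENAI_API_KEY"] = "sk-***************"  # placeholder, replace with your real key
```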
## Basic Usage
```python
# Import configuration modules
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query
# Initialize configuration
config = Configuration()
# Customize your config here
# (See the Configuration Details section below for more options)
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"})
init_config(config=config)
# Load data from local files
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(paths_or_directory=your_local_path)
# (Optional) Load data from websites
# Requires FIRECRAWL_API_KEY environment variable
from deepsearcher.offline_loading import load_from_website
load_from_website(urls=website_url)
# Query your data
result = query("Write a report about xxx.") # Replace with your question
print(result)
```
## Next Steps
After completing this quick start, you might want to explore:
- [Command Line Interface](cli.md) for non-programmatic usage
- [Deployment](deployment.md) for setting up a web service

53
evaluation/README.md

@ -0,0 +1,53 @@
# Evaluation of DeepSearcher
## Introduction
DeepSearcher excels at answering complex queries. This directory provides scripts to evaluate the performance of DeepSearcher against naive RAG.
The evaluation is based on the Recall metric:
> Recall@K: The percentage of relevant documents that are retrieved among the top K documents returned by the search engine.
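Concretely, for a single query this metric can be computed as in the following sketch (the function and sample values are illustrative, not taken from the evaluation code):

```python
def recall_at_k(gold_titles: set[str], retrieved_titles: list[str], k: int) -> float:
    """Fraction of gold documents that appear among the top-k retrieved titles."""
    top_k = retrieved_titles[:k]
    return sum(1 for title in gold_titles if title in top_k) / len(gold_titles)

# Example: 1 of the 2 gold documents appears in the top 2 results -> Recall@2 = 0.5
print(recall_at_k({"Milvus", "FAISS"}, ["Milvus", "Qdrant", "FAISS"], k=2))
```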
Currently, we support the multi-hop question answering dataset [2WikiMultiHopQA](https://paperswithcode.com/dataset/2wikimultihopqa). More datasets will be added in the future.
## Evaluation Script
The main evaluation script is `evaluate.py`.
You can provide a config file, say `eval_config.yaml`, to specify the LLM, embedding model, and other providers and parameters.
```shell
python evaluate.py \
--dataset 2wikimultihopqa \
--config_yaml ./eval_config.yaml \
--pre_num 5 \
--output_dir ./eval_output
```
`pre_num` is the number of samples to evaluate. More samples give more accurate results, but they also take more time and consume more LLM API tokens.
Once the dataset has been loaded into the vector database on the first run, you can pass the `--skip_load` flag to avoid reloading it on subsequent runs.
For details on all arguments, run
```shell
python evaluate.py --help
```
## Evaluation Results
We conducted tests using the widely used 2WikiMultiHopQA dataset. (Because testing consumes a large number of API tokens, we only evaluated the first 50 samples. This may introduce some fluctuation compared to testing the entire dataset, but it still roughly reflects the overall performance landscape.)
### Recall Comparison between Naive RAG and DeepSearcher with Different Models
With Max Iterations on the horizontal axis and Recall on the vertical axis, the following chart compares the recall rates of Deep Searcher and naive RAG.
![](plot_results/max_iter_vs_recall.png)
#### Performance Improvement with Iterations
As the number of Max Iterations increases, the recall of Deep Searcher improves significantly, and every Deep Searcher model scores markedly higher than naive RAG.
#### Diminishing Returns
However, the marginal gains shrink as the number of iterations grows, suggesting a point beyond which additional reflection iterations yield little further improvement.
#### Model Performance Comparison
Claude-3-7-sonnet (red line) demonstrates superior performance throughout, achieving nearly perfect recall at 7 iterations. Most models show significant improvement as iterations increase, with the steepest gains occurring between 2-4 iterations. Models like o1-mini (yellow) and deepseek-r1 (green) exhibit strong performance at higher iteration counts. Since our sample number for testing is limited, the results of each test may vary somewhat.
Overall, reasoning models generally perform better than non-reasoning models.
#### Limitations of Non-Reasoning Models
Additionally, in our tests, weaker and smaller non-reasoning models sometimes failed to complete the entire agent query pipeline, due to their inadequate instruction-following capabilities.
### Token Consumption
We plotted the graph below with the number of iterations on the horizontal axis and the average token consumption per sample on the vertical axis:
![](plot_results/max_iter_vs_avg_token_usage.png)
It is evident that as the number of iterations increases, the token consumption of Deep Searcher rises linearly. Based on this approximate token consumption, you can check the pricing on your model provider's website to estimate the cost of running evaluations with different iteration settings.
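As a back-of-the-envelope sketch, you can turn the per-sample token usage into a rough cost estimate; the token count and price below are placeholders, so substitute values read from the plot and from your provider's pricing page:

```python
avg_tokens_per_sample = 30_000   # placeholder: read from the plot for your chosen max_iter
price_per_million_tokens = 5.0   # placeholder: USD per 1M tokens, from your provider's pricing page
num_samples = 50

estimated_cost = avg_tokens_per_sample * num_samples * price_per_million_tokens / 1_000_000
print(f"Estimated evaluation cost: ${estimated_cost:.2f}")  # $7.50 with these placeholder numbers
```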

119
evaluation/eval_config.yaml

@ -0,0 +1,119 @@
provide_settings:
llm:
provider: "OpenAI"
config:
model: "o1-mini"
# api_key: "sk-xxxx" # Uncomment to override the `OPENAI_API_KEY` set in the environment variable
# base_url: ""
# provider: "DeepSeek"
# config:
# model: "deepseek-reasoner"
## api_key: "sk-xxxx" # Uncomment to override the `DEEPSEEK_API_KEY` set in the environment variable
## base_url: ""
# provider: "SiliconFlow"
# config:
# model: "deepseek-ai/DeepSeek-R1"
## api_key: "xxxx" # Uncomment to override the `SILICONFLOW_API_KEY` set in the environment variable
## base_url: ""
# provider: "PPIO"
# config:
# model: "deepseek/deepseek-r1-turbo"
## api_key: "xxxx" # Uncomment to override the `PPIO_API_KEY` set in the environment variable
## base_url: ""
# provider: "TogetherAI"
# config:
# model: "deepseek-ai/DeepSeek-R1"
## api_key: "xxxx" # Uncomment to override the `TOGETHER_API_KEY` set in the environment variable
# provider: "AzureOpenAI"
# config:
# model: ""
# api_version: ""
## azure_endpoint: "xxxx" # Uncomment to override the `AZURE_OPENAI_ENDPOINT` set in the environment variable
## api_key: "xxxx" # Uncomment to override the `AZURE_OPENAI_KEY` set in the environment variable
# provider: "Ollama"
# config:
# model: "qwq"
## base_url: ""
# provider: "Novita"
# config:
# model: "deepseek/deepseek-v3-0324"
## api_key: "xxxx" # Uncomment to override the `NOVITA_API_KEY` set in the environment variable
## base_url: ""
embedding:
provider: "OpenAIEmbedding"
config:
model: "text-embedding-ada-002"
# api_key: "" # Uncomment to override the `OPENAI_API_KEY` set in the environment variable
# provider: "MilvusEmbedding"
# config:
# model: "default"
# provider: "VoyageEmbedding"
# config:
# model: "voyage-3"
## api_key: "" # Uncomment to override the `VOYAGE_API_KEY` set in the environment variable
# provider: "BedrockEmbedding"
# config:
# model: "amazon.titan-embed-text-v2:0"
## aws_access_key_id: "" # Uncomment to override the `AWS_ACCESS_KEY_ID` set in the environment variable
## aws_secret_access_key: "" # Uncomment to override the `AWS_SECRET_ACCESS_KEY` set in the environment variable
# provider: "SiliconflowEmbedding"
# config:
# model: "BAAI/bge-m3"
# . api_key: "" # Uncomment to override the `SILICONFLOW_API_KEY` set in the environment variable
# provider: "NovitaEmbedding"
# config:
# model: "baai/bge-m3"
# . api_key: "" # Uncomment to override the `NOVITA_API_KEY` set in the environment variable
file_loader:
# provider: "PDFLoader"
# config: {}
provider: "JsonFileLoader"
config:
text_key: "text"
# provider: "TextLoader"
# config: {}
# provider: "UnstructuredLoader"
# config: {}
web_crawler:
provider: "FireCrawlCrawler"
config: {}
# provider: "Crawl4AICrawler"
# config: {}
# provider: "JinaCrawler"
# config: {}
vector_db:
provider: "Milvus"
config:
default_collection: "deepsearcher"
uri: "./milvus.db"
token: "root:Milvus"
db: "default"
query_settings:
max_iter: 3
load_settings:
chunk_size: 1500
chunk_overlap: 100

329
evaluation/evaluate.py

@ -0,0 +1,329 @@
# Some test dataset and evaluation method are ref from https://github.com/OSU-NLP-Group/HippoRAG/tree/main/data , many thanks
################################################################################
# Note: This evaluation script will cost a lot of LLM token usage, please make sure you have enough token budget.
################################################################################
import argparse
import ast
import json
import logging
import os
import time
import warnings
from collections import defaultdict
from typing import List, Tuple
import pandas as pd
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import naive_retrieve, retrieve
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
warnings.simplefilter(action="ignore", category=FutureWarning) # disable warning output
current_dir = os.path.dirname(os.path.abspath(__file__))
k_list = [2, 5]
def _deepsearch_retrieve_titles(
question: str,
retry_num: int = 4,
base_wait_time: int = 4,
max_iter: int = 3,
) -> Tuple[List[str], int, bool]:
"""
Retrieve document titles using DeepSearcher with retry mechanism.
Args:
question (str): The query question.
retry_num (int, optional): Number of retry attempts. Defaults to 4.
base_wait_time (int, optional): Base wait time between retries in seconds. Defaults to 4.
max_iter (int, optional): Maximum number of iterations for retrieval. Defaults to 3.
Returns:
Tuple[List[str], int, bool]: A tuple containing:
- List of retrieved document titles
- Number of tokens consumed
- Boolean indicating whether the retrieval failed
"""
retrieved_results = []
consume_tokens = 0
for i in range(retry_num):
try:
retrieved_results, _, consume_tokens = retrieve(question, max_iter=max_iter)
break
except Exception:
wait_time = base_wait_time * (2**i)
print(f"Parse LLM's output failed, retry again after {wait_time} seconds...")
time.sleep(wait_time)
if retrieved_results:
retrieved_titles = [
retrieved_result.metadata["title"] for retrieved_result in retrieved_results
]
fail = False
else:
print("Pipeline error, no retrieved results.")
retrieved_titles = []
fail = True
return retrieved_titles, consume_tokens, fail
def _naive_retrieve_titles(question: str) -> List[str]:
"""
Retrieve document titles using naive retrieval method.
Args:
question (str): The query question.
Returns:
List[str]: List of retrieved document titles.
"""
retrieved_results = naive_retrieve(question)
retrieved_titles = [
retrieved_result.metadata["title"] for retrieved_result in retrieved_results
]
return retrieved_titles
def _calcu_recall(sample, retrieved_titles, dataset) -> dict:
"""
Calculate recall metrics for retrieved titles.
Args:
sample: The sample data containing ground truth information.
retrieved_titles: List of retrieved document titles.
dataset (str): The name of the dataset being evaluated.
Returns:
dict: Dictionary containing recall values at different k values.
Raises:
NotImplementedError: If the dataset is not supported.
"""
if dataset in ["2wikimultihopqa"]:
gold_passages = [item for item in sample["supporting_facts"]]
gold_items = set([item[0] for item in gold_passages])
retrieved_items = retrieved_titles
else:
raise NotImplementedError
recall = dict()
for k in k_list:
recall[k] = round(
sum(1 for t in gold_items if t in retrieved_items[:k]) / len(gold_items), 4
)
return recall
def _print_recall_line(recall: dict, pre_str="", post_str="\n"):
"""
Print recall metrics in a formatted line.
Args:
recall (dict): Dictionary containing recall values at different k values.
pre_str (str, optional): String to print before recall values. Defaults to "".
post_str (str, optional): String to print after recall values. Defaults to "\n".
"""
print(pre_str, end="")
for k in k_list:
print(f"R@{k}: {recall[k]:.3f} ", end="")
print(post_str, end="")
def evaluate(
dataset: str,
output_root: str,
pre_num: int = 10,
max_iter: int = 3,
skip_load=False,
flag: str = "result",
):
"""
Evaluate the retrieval performance on a dataset.
Args:
dataset (str): Name of the dataset to evaluate.
output_root (str): Root directory for output files.
pre_num (int, optional): Number of samples to evaluate. Defaults to 10.
max_iter (int, optional): Maximum number of iterations for retrieval. Defaults to 3.
skip_load (bool, optional): Whether to skip loading the dataset. Defaults to False.
flag (str, optional): Flag for the evaluation run. Defaults to "result".
"""
corpus_file = os.path.join(current_dir, f"../examples/data/{dataset}_corpus.json")
if not skip_load:
# set chunk size to a large number to avoid chunking, because the dataset was chunked already.
load_from_local_files(
corpus_file, force_new_collection=True, chunk_size=999999, chunk_overlap=0
)
eval_output_subdir = os.path.join(output_root, flag)
os.makedirs(eval_output_subdir, exist_ok=True)
csv_file_path = os.path.join(eval_output_subdir, "details.csv")
statistics_file_path = os.path.join(eval_output_subdir, "statistics.json")
data_with_gt_file_path = os.path.join(current_dir, f"../examples/data/{dataset}.json")
data_with_gt = json.load(open(data_with_gt_file_path, "r"))
if not pre_num:
pre_num = len(data_with_gt)
pipeline_error_num = 0
end_ind = min(pre_num, len(data_with_gt))
start_ind = 0
existing_df = pd.DataFrame()
existing_statistics = defaultdict(dict)
existing_token_usage = 0
existing_error_num = 0
existing_sample_num = 0
if os.path.exists(csv_file_path):
existing_df = pd.read_csv(csv_file_path)
start_ind = len(existing_df)
print(f"Loading results from {csv_file_path}, start_index = {start_ind}")
if os.path.exists(statistics_file_path):
existing_statistics = json.load(open(statistics_file_path, "r"))
print(
f"Loading statistics from {statistics_file_path}, will recalculate the statistics based on both new and existing results."
)
existing_token_usage = existing_statistics["deepsearcher"]["token_usage"]
existing_error_num = existing_statistics["deepsearcher"].get("error_num", 0)
existing_sample_num = existing_statistics["deepsearcher"].get("sample_num", 0)
for sample_idx, sample in enumerate(data_with_gt[start_ind:end_ind]):
global_idx = sample_idx + start_ind
question = sample["question"]
retrieved_titles, consume_tokens, fail = _deepsearch_retrieve_titles(
question, max_iter=max_iter
)
retrieved_titles_naive = _naive_retrieve_titles(question)
if fail:
pipeline_error_num += 1
print(
f"Pipeline error, no retrieved results. Current pipeline_error_num = {pipeline_error_num}"
)
print(f"idx: {global_idx}: ")
recall = _calcu_recall(sample, retrieved_titles, dataset)
recall_naive = _calcu_recall(sample, retrieved_titles_naive, dataset)
current_result = [
{
"idx": global_idx,
"question": question,
"recall": recall,
"recall_naive": recall_naive,
"gold_titles": [item[0] for item in sample["supporting_facts"]],
"retrieved_titles": retrieved_titles,
"retrieved_titles_naive": retrieved_titles_naive,
}
]
current_df = pd.DataFrame(current_result)
existing_df = pd.concat([existing_df, current_df], ignore_index=True)
existing_df.to_csv(csv_file_path, index=False)
average_recall = dict()
average_recall_naive = dict()
for k in k_list:
average_recall[k] = sum(
[
ast.literal_eval(d).get(k) if isinstance(d, str) else d.get(k)
for d in existing_df["recall"]
]
) / len(existing_df)
average_recall_naive[k] = sum(
[
ast.literal_eval(d).get(k) if isinstance(d, str) else d.get(k)
for d in existing_df["recall_naive"]
]
) / len(existing_df)
_print_recall_line(average_recall, pre_str="Average recall of DeepSearcher: ")
_print_recall_line(average_recall_naive, pre_str="Average recall of naive RAG : ")
existing_token_usage += consume_tokens
existing_error_num += 1 if fail else 0
existing_sample_num += 1
existing_statistics["deepsearcher"]["average_recall"] = average_recall
existing_statistics["deepsearcher"]["token_usage"] = existing_token_usage
existing_statistics["deepsearcher"]["error_num"] = existing_error_num
existing_statistics["deepsearcher"]["sample_num"] = existing_sample_num
existing_statistics["deepsearcher"]["token_usage_per_sample"] = (
existing_token_usage / existing_sample_num
)
existing_statistics["naive_rag"]["average_recall"] = average_recall_naive
json.dump(existing_statistics, open(statistics_file_path, "w"), indent=4)
print("")
print("Finish results to save.")
def main_eval():
"""
Main function for running the evaluation from command line.
This function parses command line arguments and calls the evaluate function
with the appropriate parameters.
"""
parser = argparse.ArgumentParser(prog="evaluate", description="Deep Searcher evaluation.")
parser.add_argument(
"--dataset",
type=str,
default="2wikimultihopqa",
help="Dataset name, default is `2wikimultihopqa`. More datasets will be supported in the future.",
)
parser.add_argument(
"--config_yaml",
type=str,
default="./eval_config.yaml",
help="Configuration yaml file path, default is `./eval_config.yaml`",
)
parser.add_argument(
"--pre_num",
type=int,
default=30,
help="Number of samples to evaluate, default is 30",
)
parser.add_argument(
"--max_iter",
type=int,
default=3,
help="Max iterations of reflection. Default is 3. It will overwrite the one in config yaml file.",
)
parser.add_argument(
"--output_dir",
type=str,
default="./eval_output",
help="Output root directory, default is `./eval_output`",
)
parser.add_argument(
"--skip_load",
action="store_true",
help="Whether to skip loading the dataset. Default it don't skip loading. If you want to skip loading, please set this flag.",
)
parser.add_argument(
"--flag",
type=str,
default="result",
help="Flag for evaluation, default is `result`",
)
args = parser.parse_args()
config = Configuration(config_path=args.config_yaml)
init_config(config=config)
evaluate(
dataset=args.dataset,
output_root=args.output_dir,
pre_num=args.pre_num,
max_iter=args.max_iter,
skip_load=args.skip_load,
flag=args.flag,
)
if __name__ == "__main__":
main_eval()

BIN
evaluation/plot_results/max_iter_vs_avg_token_usage.png

Binary file not shown.


BIN
evaluation/plot_results/max_iter_vs_error_num.png

Binary file not shown.


BIN
evaluation/plot_results/max_iter_vs_recall.png

Binary file not shown.


35
examples/basic_example.py

@ -0,0 +1,35 @@
import logging
import os
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
config = Configuration() # Customize your config here
init_config(config=config)
# You should clone the milvus docs repo to your local machine first, execute:
# git clone https://github.com/milvus-io/milvus-docs.git
# Then replace the path below with the path to the milvus-docs repo on your local machine
# import glob
# all_md_files = glob.glob('xxx/milvus-docs/site/en/**/*.md', recursive=True)
# load_from_local_files(paths_or_directory=all_md_files, collection_name="milvus_docs", collection_description="All Milvus Documents")
# Hint: You can also load a single file; run this script from the root directory of the deep-searcher project
load_from_local_files(
    paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
    collection_name="milvus_docs",
    collection_description="All Milvus Documents",
    # force_new_collection=True,  # Set to True to drop the existing collection and create a new one on every run
)
question = "Write a report comparing Milvus with other vector databases."
_, _, consumed_token = query(question, max_iter=1)
print(f"Consumed tokens: {consumed_token}")

68
examples/basic_example_azuresearch.py

@ -0,0 +1,68 @@
import logging
import os
import time
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

logger.info("Initializing DeepSearcher configuration")
config = Configuration()

config.set_provider_config("llm", "AzureOpenAI", {
    "model": "gpt-4.1",
    "api_key": "<yourkey>",
    "base_url": "https://<youraifoundry>.openai.azure.com/openai/",
    "api_version": "2024-12-01-preview"
})
config.set_provider_config("embedding", "OpenAIEmbedding", {
    "model": "text-embedding-ada-002",
    "api_key": "<yourkey>",
    "azure_endpoint": "https://<youraifoundry>.openai.azure.com/",
    "api_version": "2023-05-15"
})
config.set_provider_config("vector_db", "AzureSearch", {
    "endpoint": "https://<yourazureaisearch>.search.windows.net",
    "index_name": "<yourindex>",
    "api_key": "<yourkey>",
    "vector_field": "content_vector"
})
logger.info("Configuration initialized successfully")

try:
    logger.info("Applying global configuration")
    init_config(config)
    logger.info("Configuration applied globally")

    # Example question
    question = "Create a detailed report about what Python is all about"
    logger.info(f"Processing query: '{question}'")

    start_time = time.time()
    result = query(question)
    query_time = time.time() - start_time

    logger.info(f"Query processed in {query_time:.2f} seconds")
    logger.info("Retrieved result successfully")
    print(result[0])  # Print the answer (first element of the returned tuple)

    # Check if there's a second element in the tuple that contains source documents
    if len(result) > 1 and hasattr(result[1], "__len__"):
        logger.info(f"Found {len(result[1])} source documents")
        for i, doc in enumerate(result[1]):
            if hasattr(doc, "metadata") and "source" in doc.metadata:
                logger.info(f"Source {i+1}: {doc.metadata['source']}")
except Exception as e:
    logger.error(f"Error executing query: {str(e)}")
    import traceback

    logger.error(traceback.format_exc())

40
examples/basic_example_oracle.py

@ -0,0 +1,40 @@
import sys, os
from pathlib import Path
script_directory = Path(__file__).resolve().parent.parent
sys.path.append(os.path.abspath(script_directory))
import logging
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
# Customize your config here
from deepsearcher.configuration import Configuration, init_config
config = Configuration()
init_config(config=config)
# Load your local data
# Hint: You can load from a directory or a single file; run this script from the root directory of the deep-searcher project
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(
    paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
    collection_name="milvus_docs",
    collection_description="All Milvus Documents",
    # force_new_collection=True,  # Set to True to drop the existing collection and create a new one on every run
)
# Query
from deepsearcher.online_query import query
question = 'Write a report comparing Milvus with other vector databases.'
answer, retrieved_results, consumed_token = query(question)
print(answer)
# Get consumed tokens: roughly 25k-30k tokens when using the OpenAI gpt-4o model
# print(f"Consumed tokens: {consumed_token}")

Some files were not shown because too many files changed in this diff
