@ -0,0 +1,32 @@ |
|||
--- |
|||
name: Bug report |
|||
about: Create a report to help us improve |
|||
title: '' |
|||
labels: '' |
|||
assignees: '' |
|||
|
|||
--- |
|||
|
|||
Please describe your issue **in English**.
|||
|
|||
*Note: Small LLMs struggle with prompt following and are prone to hallucinations. Please make sure your LLM is cutting-edge, preferably a reasoning model, e.g. OpenAI o-series, DeepSeek R1, Claude 3.7 Sonnet, etc.*
|||
|
|||
**Describe the bug** |
|||
A clear and concise description of what the bug is. |
|||
|
|||
**To Reproduce** |
|||
Steps to reproduce the behavior: |
|||
|
|||
**Expected behavior** |
|||
A clear and concise description of what you expected to happen. |
|||
|
|||
**Screenshots** |
|||
If applicable, add screenshots to help explain your problem. |
|||
|
|||
**Environment (please complete the following information):** |
|||
- OS: [e.g. macOS]
- pip dependencies (e.g. the output of `pip list`)
- DeepSearcher version [e.g. 0.0.1]
|||
|
|||
**Additional context** |
|||
Add any other context about the problem here. |
@ -0,0 +1,22 @@ |
|||
--- |
|||
name: Feature request |
|||
about: Suggest an idea for this project |
|||
title: '' |
|||
labels: '' |
|||
assignees: '' |
|||
|
|||
--- |
|||
|
|||
Please describe your suggestion **in English**. |
|||
|
|||
**Is your feature request related to a problem? Please describe.** |
|||
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] |
|||
|
|||
**Describe the solution you'd like** |
|||
A clear and concise description of what you want to happen. |
|||
|
|||
**Describe alternatives you've considered** |
|||
A clear and concise description of any alternative solutions or features you've considered. |
|||
|
|||
**Additional context** |
|||
Add any other context or screenshots about the feature request here. |
@ -0,0 +1,34 @@ |
|||
misc: |
|||
- branch: &BRANCHES |
|||
# In this pull request, the changes are based on the main branch |
|||
- &MASTER_BRANCH base=main |
|||
|
|||
- name: Label bug fix PRs |
|||
conditions: |
|||
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES |
|||
- or: *BRANCHES |
|||
- 'title~=^fix:' |
|||
actions: |
|||
label: |
|||
add: |
|||
- kind/bug |
|||
|
|||
- name: Label feature PRs |
|||
conditions: |
|||
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES |
|||
- or: *BRANCHES |
|||
- 'title~=^feat:' |
|||
actions: |
|||
label: |
|||
add: |
|||
- kind/feature |
|||
|
|||
- name: Label enhancement PRs |
|||
conditions: |
|||
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES |
|||
- or: *BRANCHES |
|||
- 'title~=^enhance:' |
|||
actions: |
|||
label: |
|||
add: |
|||
- kind/enhancement |
@ -0,0 +1,20 @@ |
|||
name: "Run Docs CD with UV" |
|||
|
|||
on: |
|||
push: |
|||
branches: |
|||
- "main" |
|||
- "master" |
|||
paths: |
|||
- 'docs/**' |
|||
- 'mkdocs.yml' |
|||
- '.github/workflows/docs.yml' |
|||
|
|||
jobs: |
|||
build-deploy-docs: |
|||
if: github.repository == 'zilliztech/deep-searcher' |
|||
uses: ./.github/workflows/docs.yml |
|||
with: |
|||
deploy: true |
|||
permissions: |
|||
contents: write |
@ -0,0 +1,24 @@ |
|||
name: "Run Docs CI with UV" |
|||
|
|||
on: |
|||
pull_request: |
|||
types: [opened, reopened, synchronize] |
|||
paths: |
|||
- 'docs/**' |
|||
- 'mkdocs.yml' |
|||
- '.github/workflows/docs.yml' |
|||
push: |
|||
branches: |
|||
- "**" |
|||
- "!gh-pages" |
|||
paths: |
|||
- 'docs/**' |
|||
- 'mkdocs.yml' |
|||
- '.github/workflows/docs.yml' |
|||
|
|||
jobs: |
|||
build-docs: |
|||
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'zilliztech/deep-searcher') }} |
|||
uses: ./.github/workflows/docs.yml |
|||
with: |
|||
deploy: false |
@ -0,0 +1,27 @@ |
|||
on: |
|||
workflow_call: |
|||
inputs: |
|||
deploy: |
|||
type: boolean |
|||
description: "If true, the docs will be deployed." |
|||
default: false |
|||
|
|||
jobs: |
|||
run-docs: |
|||
runs-on: ubuntu-latest |
|||
steps: |
|||
- uses: actions/checkout@v4 |
|||
|
|||
- name: Install uv |
|||
uses: astral-sh/setup-uv@v5 |
|||
- name: Install dependencies |
|||
run: | |
|||
uv sync --all-extras --dev |
|||
source .venv/bin/activate |
|||
|
|||
- name: Build docs |
|||
run: uv run mkdocs build --verbose --clean |
|||
|
|||
- name: Build and push docs |
|||
if: inputs.deploy |
|||
run: uv run mkdocs gh-deploy --force |
@ -0,0 +1,37 @@ |
|||
#git tag v0.x.x # Must be same as the version in pyproject.toml |
|||
#git push --tags |
|||
|
|||
name: Publish Python Package to PyPI |
|||
|
|||
on: |
|||
push: |
|||
tags: |
|||
- "v*" |
|||
|
|||
jobs: |
|||
publish: |
|||
name: Publish to PyPI |
|||
runs-on: ubuntu-latest |
|||
environment: pypi |
|||
|
|||
permissions: |
|||
id-token: write |
|||
contents: read |
|||
|
|||
steps: |
|||
- name: Checkout code |
|||
uses: actions/checkout@v4 |
|||
|
|||
- name: Set up Python |
|||
uses: actions/setup-python@v5 |
|||
with: |
|||
python-version: "3.10" |
|||
|
|||
- name: Install build tools |
|||
run: python -m pip install build |
|||
|
|||
- name: Build package |
|||
run: python -m build |
|||
|
|||
- name: Publish to PyPI |
|||
uses: pypa/gh-action-pypi-publish@release/v1 |
@ -0,0 +1,25 @@ |
|||
name: Ruff |
|||
on: |
|||
push: |
|||
branches: [ main, master ] |
|||
pull_request: |
|||
jobs: |
|||
build: |
|||
runs-on: ubuntu-latest |
|||
steps: |
|||
- uses: actions/checkout@v4 |
|||
|
|||
- name: Install uv |
|||
uses: astral-sh/setup-uv@v5 |
|||
- name: Install the project |
|||
run: | |
|||
uv sync --all-extras --dev |
|||
source .venv/bin/activate |
|||
|
|||
- name: Run Ruff |
|||
run: | |
|||
uv run ruff format --diff |
|||
uv run ruff check |
|||
|
|||
# - name: Run tests |
|||
# run: uv run pytest tests |
@ -0,0 +1,199 @@ |
|||
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode |
|||
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode |
|||
|
|||
### Python ### |
|||
# Byte-compiled / optimized / DLL files |
|||
__pycache__/ |
|||
*.py[cod] |
|||
*$py.class |
|||
|
|||
# C extensions |
|||
*.so |
|||
|
|||
# Distribution / packaging |
|||
.Python |
|||
build/ |
|||
develop-eggs/ |
|||
dist/ |
|||
downloads/ |
|||
eggs/ |
|||
.eggs/ |
|||
lib/ |
|||
lib64/ |
|||
parts/ |
|||
sdist/ |
|||
var/ |
|||
wheels/ |
|||
share/python-wheels/ |
|||
*.egg-info/ |
|||
.installed.cfg |
|||
*.egg |
|||
MANIFEST |
|||
|
|||
# PyInstaller |
|||
# Usually these files are written by a python script from a template |
|||
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
|||
*.manifest |
|||
*.spec |
|||
|
|||
# Installer logs |
|||
pip-log.txt |
|||
pip-delete-this-directory.txt |
|||
|
|||
# Unit test / coverage reports |
|||
htmlcov/ |
|||
.tox/ |
|||
.nox/ |
|||
.coverage |
|||
.coverage.* |
|||
.cache |
|||
nosetests.xml |
|||
coverage.xml |
|||
*.cover |
|||
*.py,cover |
|||
.hypothesis/ |
|||
.pytest_cache/ |
|||
cover/ |
|||
|
|||
# Translations |
|||
*.mo |
|||
*.pot |
|||
|
|||
# Django stuff: |
|||
*.log |
|||
local_settings.py |
|||
db.sqlite3 |
|||
db.sqlite3-journal |
|||
|
|||
# Flask stuff: |
|||
instance/ |
|||
.webassets-cache |
|||
|
|||
# Scrapy stuff: |
|||
.scrapy |
|||
|
|||
# Sphinx documentation |
|||
docs/_build/ |
|||
|
|||
# PyBuilder |
|||
.pybuilder/ |
|||
target/ |
|||
|
|||
# Jupyter Notebook |
|||
.ipynb_checkpoints |
|||
|
|||
# IPython |
|||
profile_default/ |
|||
ipython_config.py |
|||
|
|||
# pyenv |
|||
# For a library or package, you might want to ignore these files since the code is |
|||
# intended to run in multiple environments; otherwise, check them in: |
|||
# .python-version |
|||
|
|||
# pipenv |
|||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
|||
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
|||
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
|||
# install all needed dependencies. |
|||
#Pipfile.lock |
|||
|
|||
# poetry |
|||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. |
|||
# This is especially recommended for binary packages to ensure reproducibility, and is more |
|||
# commonly ignored for libraries. |
|||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control |
|||
#poetry.lock |
|||
|
|||
# pdm |
|||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. |
|||
#pdm.lock |
|||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it |
|||
# in version control. |
|||
# https://pdm.fming.dev/#use-with-ide |
|||
.pdm.toml |
|||
|
|||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm |
|||
__pypackages__/ |
|||
|
|||
# Celery stuff |
|||
celerybeat-schedule |
|||
celerybeat.pid |
|||
|
|||
# SageMath parsed files |
|||
*.sage.py |
|||
|
|||
# Environments |
|||
.env |
|||
.venv |
|||
env/ |
|||
venv/ |
|||
ENV/ |
|||
env.bak/ |
|||
venv.bak/ |
|||
|
|||
# Spyder project settings |
|||
.spyderproject |
|||
.spyproject |
|||
|
|||
# Rope project settings |
|||
.ropeproject |
|||
|
|||
# mkdocs documentation |
|||
/site |
|||
|
|||
# mypy |
|||
.mypy_cache/ |
|||
.dmypy.json |
|||
dmypy.json |
|||
|
|||
# Pyre type checker |
|||
.pyre/ |
|||
|
|||
# pytype static type analyzer |
|||
.pytype/ |
|||
|
|||
# Cython debug symbols |
|||
cython_debug/ |
|||
|
|||
# PyCharm |
|||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can |
|||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore |
|||
# and can be added to the global gitignore or merged into this file. For a more nuclear |
|||
# option (not recommended) you can uncomment the following to ignore the entire idea folder. |
|||
#.idea/ |
|||
|
|||
### Python Patch ### |
|||
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration |
|||
poetry.toml |
|||
|
|||
# ruff |
|||
.ruff_cache/ |
|||
|
|||
# LSP config files |
|||
pyrightconfig.json |
|||
|
|||
### VisualStudioCode ### |
|||
.vscode/* |
|||
!.vscode/settings.json |
|||
!.vscode/tasks.json |
|||
!.vscode/launch.json |
|||
!.vscode/extensions.json |
|||
!.vscode/*.code-snippets |
|||
|
|||
# Local History for Visual Studio Code |
|||
.history/ |
|||
|
|||
# Built Visual Studio Code Extensions |
|||
*.vsix |
|||
|
|||
### VisualStudioCode Patch ### |
|||
# Ignore all local history of files |
|||
.history |
|||
.ionide |
|||
|
|||
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode |
|||
|
|||
.DS_Store |
|||
|
|||
*.db |
@ -0,0 +1 @@ |
|||
3.10 |
@ -0,0 +1,11 @@ |
|||
{ |
|||
"python.testing.unittestArgs": [ |
|||
"-v", |
|||
"-s", |
|||
"./tests", |
|||
"-p", |
|||
"test_*.py" |
|||
], |
|||
"python.testing.pytestEnabled": false, |
|||
"python.testing.unittestEnabled": true |
|||
} |
@ -0,0 +1,19 @@ |
|||
FROM ghcr.io/astral-sh/uv:python3.10-bookworm-slim |
|||
|
|||
WORKDIR /app |
|||
|
|||
RUN mkdir -p /tmp/uv-cache /app/data /app/logs |
|||
|
|||
COPY pyproject.toml uv.lock LICENSE README.md ./ |
|||
COPY deepsearcher/ ./deepsearcher/ |
|||
|
|||
RUN uv sync |
|||
|
|||
COPY . . |
|||
|
|||
EXPOSE 8000 |
|||
|
|||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ |
|||
CMD curl -f http://localhost:8000/docs || exit 1 |
|||
|
|||
CMD ["uv", "run", "python", "main.py", "--enable-cors", "true"] |
@ -0,0 +1,201 @@ |
|||
Apache License |
|||
Version 2.0, January 2004 |
|||
http://www.apache.org/licenses/ |
|||
|
|||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
|||
|
|||
1. Definitions. |
|||
|
|||
"License" shall mean the terms and conditions for use, reproduction, |
|||
and distribution as defined by Sections 1 through 9 of this document. |
|||
|
|||
"Licensor" shall mean the copyright owner or entity authorized by |
|||
the copyright owner that is granting the License. |
|||
|
|||
"Legal Entity" shall mean the union of the acting entity and all |
|||
other entities that control, are controlled by, or are under common |
|||
control with that entity. For the purposes of this definition, |
|||
"control" means (i) the power, direct or indirect, to cause the |
|||
direction or management of such entity, whether by contract or |
|||
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
|||
outstanding shares, or (iii) beneficial ownership of such entity. |
|||
|
|||
"You" (or "Your") shall mean an individual or Legal Entity |
|||
exercising permissions granted by this License. |
|||
|
|||
"Source" form shall mean the preferred form for making modifications, |
|||
including but not limited to software source code, documentation |
|||
source, and configuration files. |
|||
|
|||
"Object" form shall mean any form resulting from mechanical |
|||
transformation or translation of a Source form, including but |
|||
not limited to compiled object code, generated documentation, |
|||
and conversions to other media types. |
|||
|
|||
"Work" shall mean the work of authorship, whether in Source or |
|||
Object form, made available under the License, as indicated by a |
|||
copyright notice that is included in or attached to the work |
|||
(an example is provided in the Appendix below). |
|||
|
|||
"Derivative Works" shall mean any work, whether in Source or Object |
|||
form, that is based on (or derived from) the Work and for which the |
|||
editorial revisions, annotations, elaborations, or other modifications |
|||
represent, as a whole, an original work of authorship. For the purposes |
|||
of this License, Derivative Works shall not include works that remain |
|||
separable from, or merely link (or bind by name) to the interfaces of, |
|||
the Work and Derivative Works thereof. |
|||
|
|||
"Contribution" shall mean any work of authorship, including |
|||
the original version of the Work and any modifications or additions |
|||
to that Work or Derivative Works thereof, that is intentionally |
|||
submitted to Licensor for inclusion in the Work by the copyright owner |
|||
or by an individual or Legal Entity authorized to submit on behalf of |
|||
the copyright owner. For the purposes of this definition, "submitted" |
|||
means any form of electronic, verbal, or written communication sent |
|||
to the Licensor or its representatives, including but not limited to |
|||
communication on electronic mailing lists, source code control systems, |
|||
and issue tracking systems that are managed by, or on behalf of, the |
|||
Licensor for the purpose of discussing and improving the Work, but |
|||
excluding communication that is conspicuously marked or otherwise |
|||
designated in writing by the copyright owner as "Not a Contribution." |
|||
|
|||
"Contributor" shall mean Licensor and any individual or Legal Entity |
|||
on behalf of whom a Contribution has been received by Licensor and |
|||
subsequently incorporated within the Work. |
|||
|
|||
2. Grant of Copyright License. Subject to the terms and conditions of |
|||
this License, each Contributor hereby grants to You a perpetual, |
|||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
|||
copyright license to reproduce, prepare Derivative Works of, |
|||
publicly display, publicly perform, sublicense, and distribute the |
|||
Work and such Derivative Works in Source or Object form. |
|||
|
|||
3. Grant of Patent License. Subject to the terms and conditions of |
|||
this License, each Contributor hereby grants to You a perpetual, |
|||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
|||
(except as stated in this section) patent license to make, have made, |
|||
use, offer to sell, sell, import, and otherwise transfer the Work, |
|||
where such license applies only to those patent claims licensable |
|||
by such Contributor that are necessarily infringed by their |
|||
Contribution(s) alone or by combination of their Contribution(s) |
|||
with the Work to which such Contribution(s) was submitted. If You |
|||
institute patent litigation against any entity (including a |
|||
cross-claim or counterclaim in a lawsuit) alleging that the Work |
|||
or a Contribution incorporated within the Work constitutes direct |
|||
or contributory patent infringement, then any patent licenses |
|||
granted to You under this License for that Work shall terminate |
|||
as of the date such litigation is filed. |
|||
|
|||
4. Redistribution. You may reproduce and distribute copies of the |
|||
Work or Derivative Works thereof in any medium, with or without |
|||
modifications, and in Source or Object form, provided that You |
|||
meet the following conditions: |
|||
|
|||
(a) You must give any other recipients of the Work or |
|||
Derivative Works a copy of this License; and |
|||
|
|||
(b) You must cause any modified files to carry prominent notices |
|||
stating that You changed the files; and |
|||
|
|||
(c) You must retain, in the Source form of any Derivative Works |
|||
that You distribute, all copyright, patent, trademark, and |
|||
attribution notices from the Source form of the Work, |
|||
excluding those notices that do not pertain to any part of |
|||
the Derivative Works; and |
|||
|
|||
(d) If the Work includes a "NOTICE" text file as part of its |
|||
distribution, then any Derivative Works that You distribute must |
|||
include a readable copy of the attribution notices contained |
|||
within such NOTICE file, excluding those notices that do not |
|||
pertain to any part of the Derivative Works, in at least one |
|||
of the following places: within a NOTICE text file distributed |
|||
as part of the Derivative Works; within the Source form or |
|||
documentation, if provided along with the Derivative Works; or, |
|||
within a display generated by the Derivative Works, if and |
|||
wherever such third-party notices normally appear. The contents |
|||
of the NOTICE file are for informational purposes only and |
|||
do not modify the License. You may add Your own attribution |
|||
notices within Derivative Works that You distribute, alongside |
|||
or as an addendum to the NOTICE text from the Work, provided |
|||
that such additional attribution notices cannot be construed |
|||
as modifying the License. |
|||
|
|||
You may add Your own copyright statement to Your modifications and |
|||
may provide additional or different license terms and conditions |
|||
for use, reproduction, or distribution of Your modifications, or |
|||
for any such Derivative Works as a whole, provided Your use, |
|||
reproduction, and distribution of the Work otherwise complies with |
|||
the conditions stated in this License. |
|||
|
|||
5. Submission of Contributions. Unless You explicitly state otherwise, |
|||
any Contribution intentionally submitted for inclusion in the Work |
|||
by You to the Licensor shall be under the terms and conditions of |
|||
this License, without any additional terms or conditions. |
|||
Notwithstanding the above, nothing herein shall supersede or modify |
|||
the terms of any separate license agreement you may have executed |
|||
with Licensor regarding such Contributions. |
|||
|
|||
6. Trademarks. This License does not grant permission to use the trade |
|||
names, trademarks, service marks, or product names of the Licensor, |
|||
except as required for reasonable and customary use in describing the |
|||
origin of the Work and reproducing the content of the NOTICE file. |
|||
|
|||
7. Disclaimer of Warranty. Unless required by applicable law or |
|||
agreed to in writing, Licensor provides the Work (and each |
|||
Contributor provides its Contributions) on an "AS IS" BASIS, |
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
|||
implied, including, without limitation, any warranties or conditions |
|||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
|||
PARTICULAR PURPOSE. You are solely responsible for determining the |
|||
appropriateness of using or redistributing the Work and assume any |
|||
risks associated with Your exercise of permissions under this License. |
|||
|
|||
8. Limitation of Liability. In no event and under no legal theory, |
|||
whether in tort (including negligence), contract, or otherwise, |
|||
unless required by applicable law (such as deliberate and grossly |
|||
negligent acts) or agreed to in writing, shall any Contributor be |
|||
liable to You for damages, including any direct, indirect, special, |
|||
incidental, or consequential damages of any character arising as a |
|||
result of this License or out of the use or inability to use the |
|||
Work (including but not limited to damages for loss of goodwill, |
|||
work stoppage, computer failure or malfunction, or any and all |
|||
other commercial damages or losses), even if such Contributor |
|||
has been advised of the possibility of such damages. |
|||
|
|||
9. Accepting Warranty or Additional Liability. While redistributing |
|||
the Work or Derivative Works thereof, You may choose to offer, |
|||
and charge a fee for, acceptance of support, warranty, indemnity, |
|||
or other liability obligations and/or rights consistent with this |
|||
License. However, in accepting such obligations, You may act only |
|||
on Your own behalf and on Your sole responsibility, not on behalf |
|||
of any other Contributor, and only if You agree to indemnify, |
|||
defend, and hold each Contributor harmless for any liability |
|||
incurred by, or claims asserted against, such Contributor by reason |
|||
of your accepting any such warranty or additional liability. |
|||
|
|||
END OF TERMS AND CONDITIONS |
|||
|
|||
APPENDIX: How to apply the Apache License to your work. |
|||
|
|||
To apply the Apache License to your work, attach the following |
|||
boilerplate notice, with the fields enclosed by brackets "[]" |
|||
replaced with your own identifying information. (Don't include |
|||
the brackets!) The text should be enclosed in the appropriate |
|||
comment syntax for the file format. We also recommend that a |
|||
file or class name and description of purpose be included on the |
|||
same "printed page" as the copyright notice for easier |
|||
identification within third-party archives. |
|||
|
|||
Copyright 2019 Zilliz |
|||
|
|||
Licensed under the Apache License, Version 2.0 (the "License"); |
|||
you may not use this file except in compliance with the License. |
|||
You may obtain a copy of the License at |
|||
|
|||
http://www.apache.org/licenses/LICENSE-2.0 |
|||
|
|||
Unless required by applicable law or agreed to in writing, software |
|||
distributed under the License is distributed on an "AS IS" BASIS, |
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
See the License for the specific language governing permissions and |
|||
limitations under the License. |
@ -0,0 +1,7 @@ |
|||
lint: |
|||
uv run ruff format --diff |
|||
uv run ruff check |
|||
|
|||
format: |
|||
uv run ruff format |
|||
uv run ruff check --fix |
@ -0,0 +1,590 @@ |
|||
 |
|||
|
|||
<div align="center"> |
|||
|
|||
[](https://opensource.org/licenses/Apache-2.0) |
|||
[](https://deepwiki.com/zilliztech/deep-searcher) |
|||
[](https://twitter.com/zilliz_universe) |
|||
<a href="https://discord.gg/mKc3R95yE5"><img height="20" src="https://img.shields.io/badge/Discord-%235865F2.svg?style=for-the-badge&logo=discord&logoColor=white" alt="discord"/></a> |
|||
|
|||
</div> |
|||
|
|||
--- |
|||
|
|||
DeepSearcher combines cutting-edge LLMs (OpenAI o3, Qwen3, DeepSeek, Grok 4, Claude 4 Sonnet, Llama 4, QwQ, etc.) and Vector Databases (Milvus, Zilliz Cloud etc.) to perform search, evaluation, and reasoning based on private data, providing highly accurate answers and comprehensive reports. This project is suitable for enterprise knowledge management, intelligent Q&A systems, and information retrieval scenarios.
|||
|
|||
 |
|||
|
|||
## 🚀 Features |
|||
|
|||
- **Private Data Search**: Maximizes the utilization of enterprise internal data while ensuring data security. When necessary, it can integrate online content for more accurate answers. |
|||
- **Vector Database Management**: Supports Milvus and other vector databases, allowing data partitioning for efficient retrieval. |
|||
- **Flexible Embedding Options**: Compatible with multiple embedding models for optimal selection. |
|||
- **Multiple LLM Support**: Supports DeepSeek, OpenAI, and other large models for intelligent Q&A and content generation. |
|||
- **Document Loader**: Supports local file loading, with web crawling capabilities under development. |
|||
|
|||
--- |
|||
|
|||
## 🎉 Demo |
|||
 |
|||
|
|||
|
|||
## 📖 Quick Start |
|||
|
|||
### Installation |
|||
Install DeepSearcher using one of the following methods: |
|||
|
|||
#### Option 1: Using pip |
|||
Create and activate a virtual environment (Python 3.10 is recommended).
|||
```bash |
|||
python -m venv .venv |
|||
source .venv/bin/activate |
|||
``` |
|||
Install DeepSearcher:
|||
```bash |
|||
pip install deepsearcher |
|||
``` |
|||
|
|||
For optional dependencies, e.g., ollama: |
|||
```bash |
|||
pip install "deepsearcher[ollama]" |
|||
``` |
|||
|
|||
#### Option 2: Install in Development Mode |
|||
We recommend using [uv](https://github.com/astral-sh/uv) for faster and more reliable installation. Follow the [official installation instructions](https://docs.astral.sh/uv/getting-started/installation/) to install it.
|||
|
|||
Clone the repository and navigate to the project directory: |
|||
```shell |
|||
git clone https://github.com/zilliztech/deep-searcher.git && cd deep-searcher |
|||
``` |
|||
Synchronize and install dependencies: |
|||
```shell |
|||
uv sync |
|||
source .venv/bin/activate |
|||
``` |
|||
|
|||
For more detailed development setup and optional dependency installation options, see [CONTRIBUTING.md](CONTRIBUTING.md#development-environment-setup-with-uv). |
|||
|
|||
### Quick start demo |
|||
|
|||
To run this quick start demo, please prepare your `OPENAI_API_KEY` in your environment variables. If you change the LLM in the configuration, make sure to prepare the corresponding API key. |
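Before running the demo, export the key in your shell (the value below is a placeholder):

```bash
export OPENAI_API_KEY="<your-openai-api-key>"
```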
|||
|
|||
```python |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
from deepsearcher.online_query import query |
|||
|
|||
config = Configuration() |
|||
|
|||
# Customize your config here;
# see the Configuration Details section below for more options.
|||
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"}) |
|||
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"}) |
|||
init_config(config=config)
|||
|
|||
# Load your local data |
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
load_from_local_files(paths_or_directory=your_local_path) |
|||
|
|||
# (Optional) Load from web crawling (`FIRECRAWL_API_KEY` env variable required) |
|||
from deepsearcher.offline_loading import load_from_website |
|||
load_from_website(urls=website_url) |
|||
|
|||
# Query |
|||
result = query("Write a report about xxx.") # Your question here |
|||
``` |
|||
### Configuration Details
|||
#### LLM Configuration |
|||
|
|||
<pre><code>config.set_provider_config("llm", "(LLMName)", "(Arguments dict)")</code></pre> |
|||
<p>The "LLMName" can be one of the following: ["DeepSeek", "OpenAI", "XAI", "SiliconFlow", "Aliyun", "PPIO", "TogetherAI", "Gemini", "Ollama", "Novita"]</p> |
|||
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the LLM class.</p> |
|||
|
|||
<details> |
|||
<summary>Example (OpenAI)</summary> |
|||
<p> Make sure you have prepared your OPENAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})</code></pre> |
|||
<p> More details about OpenAI models: https://platform.openai.com/docs/models </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Qwen3 from Aliyun Bailian)</summary> |
|||
<p> Make sure you have prepared your Bailian API KEY as an env variable <code>DASHSCOPE_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "Aliyun", {"model": "qwen-plus-latest"})</code></pre> |
|||
<p> More details about Aliyun Bailian models: https://bailian.console.aliyun.com </p> |
|||
</details> |
|||
|
|||
|
|||
<details> |
|||
<summary>Example (Qwen3 from OpenRouter)</summary> |
|||
<pre><code>config.set_provider_config("llm", "OpenAI", {"model": "qwen/qwen3-235b-a22b:free", "base_url": "https://openrouter.ai/api/v1", "api_key": "OPENROUTER_API_KEY"})</code></pre> |
|||
<p> More details about OpenRouter models: https://openrouter.ai/qwen/qwen3-235b-a22b:free </p> |
|||
</details> |
|||
|
|||
|
|||
<details> |
|||
<summary>Example (DeepSeek from official)</summary> |
|||
<p> Make sure you have prepared your DEEPSEEK API KEY as an env variable <code>DEEPSEEK_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})</code></pre> |
|||
<p> More details about DeepSeek: https://api-docs.deepseek.com/ </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (DeepSeek from SiliconFlow)</summary> |
|||
<p> Make sure you have prepared your SILICONFLOW API KEY as an env variable <code>SILICONFLOW_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "SiliconFlow", {"model": "deepseek-ai/DeepSeek-R1"})</code></pre> |
|||
<p> More details about SiliconFlow: https://docs.siliconflow.cn/quickstart </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (DeepSeek from TogetherAI)</summary> |
|||
<p> Make sure you have prepared your TOGETHER API KEY as an env variable <code>TOGETHER_API_KEY</code>.</p> |
|||
For DeepSeek R1:
|||
<pre><code>config.set_provider_config("llm", "TogetherAI", {"model": "deepseek-ai/DeepSeek-R1"})</code></pre> |
|||
For Llama 4: |
|||
<pre><code>config.set_provider_config("llm", "TogetherAI", {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"})</code></pre> |
|||
<p> You need to install together before running, execute: <code>pip install together</code>. More details about TogetherAI: https://www.together.ai/ </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (XAI Grok)</summary> |
|||
<p> Make sure you have prepared your XAI API KEY as an env variable <code>XAI_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "XAI", {"model": "grok-4-0709"})</code></pre> |
|||
<p> More details about XAI Grok: https://docs.x.ai/docs/overview#featured-models </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Claude)</summary> |
|||
<p> Make sure you have prepared your ANTHROPIC API KEY as an env variable <code>ANTHROPIC_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "Anthropic", {"model": "claude-sonnet-4-0"})</code></pre> |
|||
<p> More details about Anthropic Claude: https://docs.anthropic.com/en/home </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Google Gemini)</summary> |
|||
<p> Make sure you have prepared your GEMINI API KEY as an env variable <code>GEMINI_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config('llm', 'Gemini', { 'model': 'gemini-2.0-flash' })</code></pre> |
|||
<p> You need to install gemini before running, execute: <code>pip install google-genai</code>. More details about Gemini: https://ai.google.dev/gemini-api/docs </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (DeepSeek from PPIO)</summary> |
|||
<p> Make sure you have prepared your PPIO API KEY as an env variable <code>PPIO_API_KEY</code>. You can create an API Key <a href="https://ppinfra.com/settings/key-management?utm_source=github_deep-searcher">here</a>. </p> |
|||
<pre><code>config.set_provider_config("llm", "PPIO", {"model": "deepseek/deepseek-r1-turbo"})</code></pre> |
|||
<p> More details about PPIO: https://ppinfra.com/docs/get-started/quickstart.html?utm_source=github_deep-searcher </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Ollama)</summary> |
|||
<p> Follow <a href="https://github.com/jmorganca/ollama">these instructions</a> to set up and run a local Ollama instance:</p> |
|||
<p> <a href="https://ollama.ai/download">Download</a> and install Ollama onto the available supported platforms (including Windows Subsystem for Linux).</p> |
|||
<p> View a list of available models via the <a href="https://ollama.ai/library">model library</a>.</p> |
|||
<p> Fetch available LLM models via <code>ollama pull <name-of-model></code></p> |
|||
<p> Example: <code>ollama pull qwen3</code></p> |
|||
<p> To chat directly with a model from the command line, use <code>ollama run <name-of-model></code>.</p> |
|||
<p> By default, Ollama has a REST API for running and managing models on <a href="http://localhost:11434">http://localhost:11434</a>.</p> |
|||
<pre><code>config.set_provider_config("llm", "Ollama", {"model": "qwen3"})</code></pre> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Volcengine)</summary> |
|||
<p> Make sure you have prepared your Volcengine API KEY as an env variable <code>VOLCENGINE_API_KEY</code>. You can create an API Key <a href="https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey">here</a>. </p> |
|||
<pre><code>config.set_provider_config("llm", "Volcengine", {"model": "deepseek-r1-250120"})</code></pre> |
|||
<p> More details about Volcengine: https://www.volcengine.com/docs/82379/1099455?utm_source=github_deep-searcher </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (GLM)</summary> |
|||
<p> Make sure you have prepared your GLM API KEY as an env variable <code>GLM_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "GLM", {"model": "glm-4-plus"})</code></pre> |
|||
<p> You need to install zhipuai before running, execute: <code>pip install zhipuai</code>. More details about GLM: https://bigmodel.cn/dev/welcome </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Amazon Bedrock)</summary> |
|||
<p> Make sure you have prepared your Amazon Bedrock API KEY as an env variable <code>AWS_ACCESS_KEY_ID</code> and <code>AWS_SECRET_ACCESS_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "Bedrock", {"model": "us.deepseek.r1-v1:0"})</code></pre> |
|||
<p> You need to install boto3 before running, execute: <code>pip install boto3</code>. More details about Amazon Bedrock: https://docs.aws.amazon.com/bedrock/ </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (IBM watsonx.ai)</summary> |
|||
<p> Make sure you have prepared your watsonx.ai credentials as env variables <code>WATSONX_APIKEY</code>, <code>WATSONX_URL</code>, and <code>WATSONX_PROJECT_ID</code>.</p> |
|||
<pre><code>config.set_provider_config("llm", "watsonx", {"model": "us.deepseek.r1-v1:0"})</code></pre> |
|||
<p> You need to install ibm-watsonx-ai before running, execute: <code>pip install ibm-watsonx-ai</code>. More details about IBM watsonx.ai: https://www.ibm.com/products/watsonx-ai/foundation-models </p> |
|||
</details> |
|||
|
|||
|
|||
#### Embedding Model Configuration |
|||
<pre><code>config.set_provider_config("embedding", "(EmbeddingModelName)", "(Arguments dict)")</code></pre> |
|||
<p>The "EmbeddingModelName" can be one of the following: ["MilvusEmbedding", "OpenAIEmbedding", "VoyageEmbedding", "SiliconflowEmbedding", "PPIOEmbedding", "NovitaEmbedding"]</p> |
|||
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the embedding model class.</p> |
|||
|
|||
<details> |
|||
<summary>Example (OpenAI embedding)</summary> |
|||
<p> Make sure you have prepared your OpenAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})</code></pre> |
|||
<p> More details about OpenAI models: https://platform.openai.com/docs/guides/embeddings/use-cases </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (OpenAI embedding Azure)</summary> |
|||
<p> Make sure you have prepared your OpenAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "OpenAIEmbedding", { |
|||
"model": "text-embedding-ada-002", |
|||
"azure_endpoint": "https://<youraifoundry>.openai.azure.com/", |
|||
"api_version": "2023-05-15" |
|||
})</code></pre> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Pymilvus built-in embedding model)</summary> |
|||
<p> To use the built-in embedding model in Pymilvus, you can set the model name as <code>"default"</code>, <code>"BAAI/bge-base-en-v1.5"</code>, <code>"BAAI/bge-large-en-v1.5"</code>, <code>"jina-embeddings-v3"</code>, etc. <br/>
See [milvus_embedding.py](deepsearcher/embedding/milvus_embedding.py) for more details. </p>
|||
<pre><code>config.set_provider_config("embedding", "MilvusEmbedding", {"model": "BAAI/bge-base-en-v1.5"})</code></pre> |
|||
<pre><code>config.set_provider_config("embedding", "MilvusEmbedding", {"model": "jina-embeddings-v3"})</code></pre> |
|||
<p> For Jina's embedding model, you need <code>JINAAI_API_KEY</code>.</p>
|||
<p> You need to install pymilvus model before running, execute: <code>pip install pymilvus.model</code>. More details about Pymilvus: https://milvus.io/docs/embeddings.md </p> |
|||
|
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (VoyageAI embedding)</summary> |
|||
<p> Make sure you have prepared your VOYAGE API KEY as an env variable <code>VOYAGE_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "VoyageEmbedding", {"model": "voyage-3"})</code></pre> |
|||
<p> You need to install voyageai before running, execute: <code>pip install voyageai</code>. More details about VoyageAI: https://docs.voyageai.com/embeddings/ </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Amazon Bedrock embedding)</summary> |
|||
<pre><code>config.set_provider_config("embedding", "BedrockEmbedding", {"model": "amazon.titan-embed-text-v2:0"})</code></pre> |
|||
<p> You need to install boto3 before running, execute: <code>pip install boto3</code>. More details about Amazon Bedrock: https://docs.aws.amazon.com/bedrock/ </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Novita AI embedding)</summary> |
|||
<p> Make sure you have prepared your Novita AI API KEY as an env variable <code>NOVITA_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "NovitaEmbedding", {"model": "baai/bge-m3"})</code></pre> |
|||
<p> More details about Novita AI: https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Siliconflow embedding)</summary> |
|||
<p> Make sure you have prepared your Siliconflow API KEY as an env variable <code>SILICONFLOW_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "SiliconflowEmbedding", {"model": "BAAI/bge-m3"})</code></pre> |
|||
<p> More details about Siliconflow: https://docs.siliconflow.cn/en/api-reference/embeddings/create-embeddings </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Volcengine embedding)</summary> |
|||
<p> Make sure you have prepared your Volcengine API KEY as an env variable <code>VOLCENGINE_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "VolcengineEmbedding", {"model": "doubao-embedding-text-240515"})</code></pre> |
|||
<p> More details about Volcengine: https://www.volcengine.com/docs/82379/1302003 </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (GLM embedding)</summary> |
|||
<p> Make sure you have prepared your GLM API KEY as an env variable <code>GLM_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "GLMEmbedding", {"model": "embedding-3"})</code></pre> |
|||
<p> You need to install zhipuai before running, execute: <code>pip install zhipuai</code>. More details about GLM: https://bigmodel.cn/dev/welcome </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Google Gemini embedding)</summary> |
|||
<p> Make sure you have prepared your Gemini API KEY as an env variable <code>GEMINI_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "GeminiEmbedding", {"model": "text-embedding-004"})</code></pre> |
|||
<p> You need to install gemini before running, execute: <code>pip install google-genai</code>. More details about Gemini: https://ai.google.dev/gemini-api/docs </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Ollama embedding)</summary> |
|||
<pre><code>config.set_provider_config("embedding", "OllamaEmbedding", {"model": "bge-m3"})</code></pre> |
|||
<p> You need to install ollama before running, execute: <code>pip install ollama</code>. More details about Ollama Python SDK: https://github.com/ollama/ollama-python </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (PPIO embedding)</summary> |
|||
<p> Make sure you have prepared your PPIO API KEY as an env variable <code>PPIO_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "PPIOEmbedding", {"model": "baai/bge-m3"})</code></pre> |
|||
<p> More details about PPIO: https://ppinfra.com/docs/get-started/quickstart.html?utm_source=github_deep-searcher </p> |
|||
</details> |
|||
|
|||
|
|||
<details> |
|||
<summary>Example (FastEmbed embedding)</summary> |
|||
<pre><code>config.set_provider_config("embedding", "FastEmbedEmbedding", {"model": "intfloat/multilingual-e5-large"})</code></pre> |
|||
<p> You need to install fastembed before running, execute: <code>pip install fastembed</code>. More details about fastembed: https://github.com/qdrant/fastembed </p> |
|||
</details> |
|||
|
|||
|
|||
<details> |
|||
<summary>Example (IBM watsonx.ai embedding)</summary> |
|||
<p> Make sure you have prepared your WatsonX credentials as env variables <code>WATSONX_APIKEY</code>, <code>WATSONX_URL</code>, and <code>WATSONX_PROJECT_ID</code>.</p> |
|||
<pre><code>config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "ibm/slate-125m-english-rtrvr-v2"})</code></pre> |
|||
<pre><code>config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "sentence-transformers/all-minilm-l6-v2"})</code></pre> |
|||
<p> You need to install ibm-watsonx-ai before running, execute: <code>pip install ibm-watsonx-ai</code>. More details about IBM watsonx.ai: https://www.ibm.com/products/watsonx-ai/foundation-models </p> |
|||
</details> |
|||
|
|||
#### Vector Database Configuration |
|||
<pre><code>config.set_provider_config("vector_db", "(VectorDBName)", "(Arguments dict)")</code></pre> |
|||
<p>The "VectorDBName" can be one of the following: ["Milvus"] (Under development)</p> |
|||
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the Vector Database class.</p> |
|||
|
|||
<details> |
|||
<summary>Example (Milvus)</summary> |
|||
<pre><code>config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})</code></pre> |
|||
<p> More details about Milvus Config:</p> |
|||
<ul> |
|||
<li> |
|||
Setting the <code>uri</code> as a local file, e.g. <code>./milvus.db</code>, is the most convenient method, as it automatically utilizes <a href="https://milvus.io/docs/milvus_lite.md" target="_blank">Milvus Lite</a> to store all data in this file. |
|||
</li> |
|||
</ul> |
|||
<ul> |
|||
<li> |
|||
If you have a large-scale dataset, you can set up a more performant Milvus server using |
|||
<a href="https://milvus.io/docs/quickstart.md" target="_blank">Docker or Kubernetes</a>. |
|||
In this setup, use the server URI, e.g., <code>http://localhost:19530</code>, as your <code>uri</code>. |
|||
You can also use any other connection parameters supported by Milvus such as <code>host</code>, <code>user</code>, <code>password</code>, or <code>secure</code>. |
|||
</li> |
|||
</ul> |
|||
<ul> |
|||
<li> |
|||
If you want to use <a href="https://zilliz.com/cloud" target="_blank">Zilliz Cloud</a>, |
|||
the fully managed cloud service for Milvus, adjust the <code>uri</code> and <code>token</code> |
|||
according to the <a href="https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details" |
|||
target="_blank">Public Endpoint and API Key</a> in Zilliz Cloud. |
|||
</li> |
|||
</ul> |
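<p> For example, a connection to a standalone Milvus server could be configured as below. The <code>uri</code> and <code>token</code> values are illustrative; <code>root:Milvus</code> is assumed to be the default credential of a fresh installation, so replace both with the values of your own deployment.</p>
<pre><code>config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "token": "root:Milvus"})</code></pre>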
|||
|
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (AZURE AI Search)</summary> |
|||
<pre><code>config.set_provider_config("vector_db", "AzureSearch", { |
|||
"endpoint": "https://<yourazureaisearch>.search.windows.net", |
|||
"index_name": "<yourindex>", |
|||
"api_key": "<yourkey>", |
|||
"vector_field": "" |
|||
})</code></pre> |
|||
|||
|
|||
</details> |
|||
|
|||
#### File Loader Configuration |
|||
<pre><code>config.set_provider_config("file_loader", "(FileLoaderName)", "(Arguments dict)")</code></pre> |
|||
<p>The "FileLoaderName" can be one of the following: ["PDFLoader", "TextLoader", "UnstructuredLoader"]</p> |
|||
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the File Loader class.</p> |
|||
|
|||
<details> |
|||
<summary>Example (Unstructured)</summary> |
|||
<p>You can use Unstructured in two ways:</p> |
|||
<ul> |
|||
<li>With API: Set environment variables <code>UNSTRUCTURED_API_KEY</code> and <code>UNSTRUCTURED_API_URL</code></li> |
|||
<li>Without API: Use the local processing mode by simply not setting these environment variables</li> |
|||
</ul> |
|||
<pre><code>config.set_provider_config("file_loader", "UnstructuredLoader", {})</code></pre> |
|||
<ul> |
|||
<li>Currently supported file types: ["pdf"] (Under development)</li> |
|||
<li>Installation requirements: |
|||
<ul> |
|||
<li>Install ingest pipeline: <code>pip install unstructured-ingest</code></li> |
|||
<li>For all document formats: <code>pip install "unstructured[all-docs]"</code></li> |
|||
<li>For specific formats (e.g., PDF only): <code>pip install "unstructured[pdf]"</code></li> |
|||
</ul> |
|||
</li> |
|||
<li>More information: |
|||
<ul> |
|||
<li>Unstructured documentation: <a href="https://docs.unstructured.io/ingestion/overview">https://docs.unstructured.io/ingestion/overview</a></li> |
|||
<li>Installation guide: <a href="https://docs.unstructured.io/open-source/installation/full-installation">https://docs.unstructured.io/open-source/installation/full-installation</a></li> |
|||
</ul> |
|||
</li> |
|||
</ul> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Docling)</summary> |
|||
<pre><code>config.set_provider_config("file_loader", "DoclingLoader", {})</code></pre> |
|||
<p> Currently supported file types: please refer to the Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats </p> |
|||
<p> You need to install docling before running, execute: <code>pip install docling</code>. More details about Docling: https://docling-project.github.io/docling/ </p> |
|||
</details> |
|||
|
|||
#### Web Crawler Configuration |
|||
<pre><code>config.set_provider_config("web_crawler", "(WebCrawlerName)", "(Arguments dict)")</code></pre> |
|||
<p>The "WebCrawlerName" can be one of the following: ["FireCrawlCrawler", "Crawl4AICrawler", "JinaCrawler"]</p> |
|||
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the Web Crawler class.</p> |
|||
|
|||
<details> |
|||
<summary>Example (FireCrawl)</summary> |
|||
<p> Make sure you have prepared your FireCrawl API KEY as an env variable <code>FIRECRAWL_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("web_crawler", "FireCrawlCrawler", {})</code></pre> |
|||
<p> More details about FireCrawl: https://docs.firecrawl.dev/introduction </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Crawl4AI)</summary> |
|||
<p> Make sure you have run <code>crawl4ai-setup</code> in your environment.</p> |
|||
<pre><code>config.set_provider_config("web_crawler", "Crawl4AICrawler", {"browser_config": {"headless": True, "verbose": True}})</code></pre> |
|||
<p> You need to install crawl4ai before running, execute: <code>pip install crawl4ai</code>. More details about Crawl4AI: https://docs.crawl4ai.com/ </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Jina Reader)</summary> |
|||
<p> Make sure you have prepared your Jina Reader API KEY as an env variable <code>JINA_API_TOKEN</code> or <code>JINAAI_API_KEY</code>.</p> |
|||
<pre><code>config.set_provider_config("web_crawler", "JinaCrawler", {})</code></pre> |
|||
<p> More details about Jina Reader: https://jina.ai/reader/ </p> |
|||
</details> |
|||
|
|||
<details> |
|||
<summary>Example (Docling)</summary> |
|||
<pre><code>config.set_provider_config("web_crawler", "DoclingCrawler", {})</code></pre> |
|||
<p> Currently supported file types: please refer to the Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats </p> |
|||
<p> You need to install docling before running, execute: <code>pip install docling</code>. More details about Docling: https://docling-project.github.io/docling/ </p> |
|||
</details> |
|||
|
|||
|
|||
### Python CLI Mode |
|||
#### Load |
|||
```shell |
|||
deepsearcher load "your_local_path_or_url" |
|||
# load into a specific collection |
|||
deepsearcher load "your_local_path_or_url" --collection_name "your_collection_name" --collection_desc "your_collection_description" |
|||
``` |
|||
Example loading from a local file:
|||
```shell |
|||
deepsearcher load "/path/to/your/local/file.pdf" |
|||
# or more files at once |
|||
deepsearcher load "/path/to/your/local/file1.pdf" "/path/to/your/local/file2.md" |
|||
``` |
|||
Example loading from a URL (*Set `FIRECRAWL_API_KEY` in your environment variables; see [FireCrawl](https://docs.firecrawl.dev/introduction) for more details*):
|||
|
|||
```shell |
|||
deepsearcher load "https://www.wikiwand.com/en/articles/DeepSeek" |
|||
``` |
|||
|
|||
#### Query |
|||
```shell |
|||
deepsearcher query "Write a report about xxx." |
|||
``` |
|||
|
|||
More help information |
|||
```shell |
|||
deepsearcher --help |
|||
``` |
|||
For more help information about a specific subcommand, you can use `deepsearcher [subcommand] --help`. |
|||
```shell |
|||
deepsearcher load --help |
|||
deepsearcher query --help |
|||
``` |
|||
|
|||
### Deployment |
|||
|
|||
#### Configure modules |
|||
|
|||
You can configure all arguments by modifying [config.yaml](./config.yaml) to set up your system with default modules. |
|||
For example, set your `OPENAI_API_KEY` in the `llm` section of the YAML file. |
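The exact keys are defined by [config.yaml](./config.yaml) itself; the sketch below is purely illustrative (the key names are assumptions, not the project's confirmed schema) and only shows the general shape of an `llm` section:

```yaml
llm:                      # hypothetical layout, check config.yaml for the real keys
  provider: "OpenAI"
  config:
    model: "o1-mini"
    api_key: "sk-..."     # or rely on the OPENAI_API_KEY environment variable
```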
|||
|
|||
#### Start service |
|||
The main script runs a FastAPI service at the default address `localhost:8000`.
|||
|
|||
```shell |
|||
$ python main.py |
|||
``` |
|||
|
|||
#### Access via browser |
|||
|
|||
You can open the URL http://localhost:8000/docs in your browser to access the web service.
Click the "Try it out" button; it lets you fill in the parameters and interact with the API directly.
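Alternatively, you can check from the command line that the service is up, using the same probe as the Docker healthcheck in this repository:

```shell
curl -f http://localhost:8000/docs
```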
|||
|
|||
|
|||
--- |
|||
|
|||
## ❓ Q&A |
|||
|
|||
**Q1**: Why did parsing the LLM output format fail? / How should I select the LLM?
|||
|
|||
|
|||
**A1**: Small LLMs struggle to follow the prompt and generate the desired response, which usually causes the format-parsing problem. A better practice is to use large reasoning models, e.g. DeepSeek-R1 671B, OpenAI o-series, Claude 4 Sonnet, etc., as your LLM.
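For example, switching to DeepSeek's reasoning model only requires changing the LLM provider configuration. This mirrors the DeepSeek example in the configuration section above and assumes `DEEPSEEK_API_KEY` is set in your environment:

```python
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})
init_config(config=config)
```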
|||
|
|||
--- |
|||
|
|||
**Q2**: |
|||
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like GPTCache/paraphrase-albert-small-v2 is not the path to a directory containing a file named config.json. |
|||
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. |
|||
|
|||
**A2**: This is mainly caused by failing to reach Hugging Face, which may be a network or permission problem. You can try the following two methods:
1. If it is a network problem, set up a proxy by adding the following environment variable.
|||
```bash |
|||
export HF_ENDPOINT=https://hf-mirror.com |
|||
``` |
|||
2. If it is a permission problem, set up a personal access token and add the following environment variable.
|||
```bash |
|||
export HUGGING_FACE_HUB_TOKEN=xxxx |
|||
``` |
|||
|
|||
--- |
|||
|
|||
**Q3**: DeepSearcher doesn't run in a Jupyter notebook.
|||
|
|||
**A3**: Install `nest_asyncio` and then put this code block at the top of your Jupyter notebook.
|||
|
|||
```bash
|||
pip install nest_asyncio |
|||
``` |
|||
|
|||
```python
|||
import nest_asyncio |
|||
nest_asyncio.apply() |
|||
``` |
|||
|
|||
--- |
|||
|
|||
## 🔧 Module Support |
|||
|
|||
### 🔹 Embedding Models |
|||
- [Open-source embedding models](https://milvus.io/docs/embeddings.md) |
|||
- [OpenAI](https://platform.openai.com/docs/guides/embeddings/use-cases) (`OPENAI_API_KEY` env variable required) |
|||
- [VoyageAI](https://docs.voyageai.com/embeddings/) (`VOYAGE_API_KEY` env variable required) |
|||
- [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/) (`AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` env variable required) |
|||
- [FastEmbed](https://qdrant.github.io/fastembed/) |
|||
- [PPIO](https://ppinfra.com/model-api/product/llm-api?utm_source=github_deep-searcher) (`PPIO_API_KEY` env variable required) |
|||
- [Novita AI](https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link) (`NOVITA_API_KEY` env variable required) |
|||
- [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmembedding) (`WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` env variables required) |
|||
|
|||
### 🔹 LLM Support |
|||
- [OpenAI](https://platform.openai.com/docs/models) (`OPENAI_API_KEY` env variable required) |
|||
- [DeepSeek](https://api-docs.deepseek.com/) (`DEEPSEEK_API_KEY` env variable required) |
|||
- [XAI Grok](https://x.ai/api) (`XAI_API_KEY` env variable required) |
|||
- [Anthropic Claude](https://docs.anthropic.com/en/home) (`ANTHROPIC_API_KEY` env variable required) |
|||
- [SiliconFlow Inference Service](https://docs.siliconflow.cn/en/userguide/introduction) (`SILICONFLOW_API_KEY` env variable required) |
|||
- [PPIO](https://ppinfra.com/model-api/product/llm-api?utm_source=github_deep-searcher) (`PPIO_API_KEY` env variable required) |
|||
- [TogetherAI Inference Service](https://docs.together.ai/docs/introduction) (`TOGETHER_API_KEY` env variable required) |
|||
- [Google Gemini](https://ai.google.dev/gemini-api/docs) (`GEMINI_API_KEY` env variable required) |
|||
- [SambaNova Cloud Inference Service](https://docs.together.ai/docs/introduction) (`SAMBANOVA_API_KEY` env variable required) |
|||
- [Ollama](https://ollama.com/) |
|||
- [Novita AI](https://novita.ai/docs/guides/introduction?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link) (`NOVITA_API_KEY` env variable required) |
|||
- [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmfm) (`WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` env variable required) |
|||
|
|||
### 🔹 Document Loader |
|||
- Local File |
|||
- PDF(with txt/md) loader |
|||
- [Unstructured](https://unstructured.io/) (under development) (`UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_URL` env variables required) |
|||
- Web Crawler |
|||
- [FireCrawl](https://docs.firecrawl.dev/introduction) (`FIRECRAWL_API_KEY` env variable required) |
|||
- [Jina Reader](https://jina.ai/reader/) (`JINA_API_TOKEN` env variable required) |
|||
- [Crawl4AI](https://docs.crawl4ai.com/) (You should run command `crawl4ai-setup` for the first time) |
|||
|
|||
### 🔹 Vector Database Support |
|||
- [Milvus](https://milvus.io/) and [Zilliz Cloud](https://www.zilliz.com/) (fully managed Milvus) |
|||
- [Qdrant](https://qdrant.tech/) |
|||
|
|||
--- |
|||
## 📊 Evaluation |
|||
See the [Evaluation](./evaluation) directory for more details. |
|||
|
|||
--- |
|||
## 📌 Future Plans |
|||
- Enhance web crawling functionality |
|||
- Support more vector databases (e.g., FAISS...) |
|||
- Add support for additional large models |
|||
- Provide RESTful API interface (**DONE**) |
|||
|
|||
We welcome contributions! Star & Fork the project and help us build a more powerful DeepSearcher! 🎯 |
@ -0,0 +1,5 @@ |
|||
import os |
|||
|
|||
# ignore the warnings |
|||
# None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used. |
|||
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" |
@ -0,0 +1,12 @@ |
|||
from .base import BaseAgent, RAGAgent |
|||
from .chain_of_rag import ChainOfRAG |
|||
from .deep_search import DeepSearch |
|||
from .naive_rag import NaiveRAG |
|||
|
|||
__all__ = [ |
|||
"ChainOfRAG", |
|||
"DeepSearch", |
|||
"NaiveRAG", |
|||
"BaseAgent", |
|||
"RAGAgent", |
|||
] |
@ -0,0 +1,103 @@ |
|||
from abc import ABC |
|||
from typing import Any, List, Tuple |
|||
|
|||
from deepsearcher.vector_db import RetrievalResult |
|||
|
|||
|
|||
def describe_class(description): |
|||
""" |
|||
Decorator function to add a description to a class. |
|||
|
|||
This decorator adds a __description__ attribute to the decorated class, |
|||
which can be used for documentation or introspection. |
|||
|
|||
Args: |
|||
description: The description to add to the class. |
|||
|
|||
Returns: |
|||
A decorator function that adds the description to the class. |
|||
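Example (illustrative):

    @describe_class("Agent specialized in multi-hop factual questions.")
    class MyAgent(BaseAgent):
        ...

    MyAgent.__description__  # "Agent specialized in multi-hop factual questions."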
""" |
|||
|
|||
def decorator(cls): |
|||
cls.__description__ = description |
|||
return cls |
|||
|
|||
return decorator |
|||
|
|||
|
|||
class BaseAgent(ABC): |
|||
""" |
|||
Abstract base class for all agents in the DeepSearcher system. |
|||
|
|||
This class defines the basic interface for agents, including initialization |
|||
and invocation methods. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize a BaseAgent object. |
|||
|
|||
Args: |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
pass |
|||
|
|||
def invoke(self, query: str, **kwargs) -> Any: |
|||
""" |
|||
Invoke the agent and return the result. |
|||
|
|||
Args: |
|||
query: The query string. |
|||
**kwargs: Additional keyword arguments. |
|||
|
|||
Returns: |
|||
The result of invoking the agent. |
|||
""" |
|||
|
|||
|
|||
class RAGAgent(BaseAgent): |
|||
""" |
|||
Abstract base class for Retrieval-Augmented Generation (RAG) agents. |
|||
|
|||
This class extends BaseAgent with methods specific to RAG, including |
|||
retrieval and query methods. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize a RAGAgent object. |
|||
|
|||
Args: |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
pass |
|||
|
|||
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]: |
|||
""" |
|||
Retrieve document results from the knowledge base. |
|||
|
|||
Args: |
|||
query: The query string. |
|||
**kwargs: Additional keyword arguments. |
|||
|
|||
Returns: |
|||
A tuple containing: |
|||
- the retrieved results |
|||
- the total number of tokens used by the LLM |
|||
- any additional metadata, which can be an empty dictionary |
|||
""" |
|||
|
|||
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]: |
|||
""" |
|||
Query the agent and return the answer. |
|||
|
|||
Args: |
|||
query: The query string. |
|||
**kwargs: Additional keyword arguments. |
|||
|
|||
Returns: |
|||
A tuple containing: |
|||
- the answer generated by the LLM |
|||
- the retrieved document results |
|||
- the total number of tokens used by the LLM |
|||
""" |
@ -0,0 +1,326 @@ |
|||
from typing import List, Tuple |
|||
|
|||
from deepsearcher.agent.base import RAGAgent, describe_class |
|||
from deepsearcher.agent.collection_router import CollectionRouter |
|||
from deepsearcher.embedding.base import BaseEmbedding |
|||
from deepsearcher.llm.base import BaseLLM |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db import RetrievalResult |
|||
from deepsearcher.vector_db.base import BaseVectorDB, deduplicate_results |
|||
|
|||
FOLLOWUP_QUERY_PROMPT = """You are using a search tool to answer the main query by iteratively searching the database. Given the following intermediate queries and answers, generate a new simple follow-up question that can help answer the main query. You may rephrase or decompose the main query when previous answers are not helpful. Ask simple follow-up questions only as the search tool may not understand complex questions. |
|||
|
|||
## Previous intermediate queries and answers |
|||
{intermediate_context} |
|||
|
|||
## Main query to answer |
|||
{query} |
|||
|
|||
Respond with a simple follow-up question that will help answer the main query, do not explain yourself or output anything else. |
|||
""" |
|||
|
|||
INTERMEDIATE_ANSWER_PROMPT = """Given the following documents, generate an appropriate answer for the query. DO NOT hallucinate any information, only use the provided documents to generate the answer. Respond "No relevant information found" if the documents do not contain useful information. |
|||
|
|||
## Documents |
|||
{retrieved_documents} |
|||
|
|||
## Query |
|||
{sub_query} |
|||
|
|||
Respond with a concise answer only, do not explain yourself or output anything else. |
|||
""" |
|||
|
|||
FINAL_ANSWER_PROMPT = """Given the following intermediate queries and answers, generate a final answer for the main query by combining relevant information. Note that intermediate answers are generated by an LLM and may not always be accurate. |
|||
|
|||
## Documents |
|||
{retrieved_documents} |
|||
|
|||
## Intermediate queries and answers |
|||
{intermediate_context} |
|||
|
|||
## Main query |
|||
{query} |
|||
|
|||
Respond with an appropriate answer only, do not explain yourself or output anything else. |
|||
""" |
|||
|
|||
REFLECTION_PROMPT = """Given the following intermediate queries and answers, judge whether you have enough information to answer the main query. If you believe you have enough information, respond with "Yes", otherwise respond with "No". |
|||
|
|||
## Intermediate queries and answers |
|||
{intermediate_context} |
|||
|
|||
## Main query |
|||
{query} |
|||
|
|||
Respond with "Yes" or "No" only, do not explain yourself or output anything else. |
|||
""" |
|||
|
|||
GET_SUPPORTED_DOCS_PROMPT = """Given the following documents, select the ones that support the Q-A pair. |
|||
|
|||
## Documents |
|||
{retrieved_documents} |
|||
|
|||
## Q-A Pair |
|||
### Question |
|||
{query} |
|||
### Answer |
|||
{answer} |
|||
|
|||
Respond with a python list of indices of the selected documents. |
|||
""" |
|||
|
|||
|
|||
@describe_class( |
|||
"This agent can decompose complex queries and gradually find the fact information of sub-queries. " |
|||
"It is very suitable for handling concrete factual queries and multi-hop questions." |
|||
) |
|||
class ChainOfRAG(RAGAgent): |
|||
""" |
|||
Chain of Retrieval-Augmented Generation (RAG) agent implementation. |
|||
|
|||
This agent implements a multi-step RAG process where each step can refine |
|||
the query and retrieval process based on previous results, creating a chain |
|||
of increasingly focused and relevant information retrieval and generation. |
|||
Inspired by: https://arxiv.org/pdf/2501.14342 |
|||
|
|||
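Example (illustrative sketch; assumes already-configured llm, embedding_model and vector_db instances):

    agent = ChainOfRAG(llm=llm, embedding_model=embedding_model, vector_db=vector_db, max_iter=4)
    answer, retrieved_results, token_usage = agent.query("Which university did the first author of ResNet attend?")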
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
llm: BaseLLM, |
|||
embedding_model: BaseEmbedding, |
|||
vector_db: BaseVectorDB, |
|||
max_iter: int = 4, |
|||
early_stopping: bool = False, |
|||
route_collection: bool = True, |
|||
text_window_splitter: bool = True, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize the ChainOfRAG agent with configuration parameters. |
|||
|
|||
Args: |
|||
llm (BaseLLM): The language model to use for generating answers. |
|||
embedding_model (BaseEmbedding): The embedding model to use for embedding queries. |
|||
vector_db (BaseVectorDB): The vector database to search for relevant documents. |
|||
max_iter (int, optional): The maximum number of iterations for the RAG process. Defaults to 4. |
|||
early_stopping (bool, optional): Whether to use early stopping. Defaults to False. |
|||
route_collection (bool, optional): Whether to route the query to specific collections. Defaults to True. |
|||
text_window_splitter (bool, optional): Whether to use the text window splitter. Defaults to True. |
|||
""" |
|||
self.llm = llm |
|||
self.embedding_model = embedding_model |
|||
self.vector_db = vector_db |
|||
self.max_iter = max_iter |
|||
self.early_stopping = early_stopping |
|||
self.route_collection = route_collection |
|||
self.collection_router = CollectionRouter( |
|||
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension |
|||
) |
|||
self.text_window_splitter = text_window_splitter |
|||
|
|||
def _reflect_get_subquery(self, query: str, intermediate_context: List[str]) -> Tuple[str, int]: |
|||
chat_response = self.llm.chat( |
|||
[ |
|||
{ |
|||
"role": "user", |
|||
"content": FOLLOWUP_QUERY_PROMPT.format( |
|||
query=query, |
|||
intermediate_context="\n".join(intermediate_context), |
|||
), |
|||
} |
|||
] |
|||
) |
|||
return self.llm.remove_think(chat_response.content), chat_response.total_tokens |
|||
|
|||
def _retrieve_and_answer(self, query: str) -> Tuple[str, List[RetrievalResult], int]: |
|||
consume_tokens = 0 |
|||
if self.route_collection: |
|||
selected_collections, n_token_route = self.collection_router.invoke( |
|||
query=query, dim=self.embedding_model.dimension |
|||
) |
|||
else: |
|||
selected_collections = self.collection_router.all_collections |
|||
n_token_route = 0 |
|||
consume_tokens += n_token_route |
|||
all_retrieved_results = [] |
|||
for collection in selected_collections: |
|||
log.color_print(f"<search> Search [{query}] in [{collection}]... </search>\n") |
|||
query_vector = self.embedding_model.embed_query(query) |
|||
retrieved_results = self.vector_db.search_data( |
|||
collection=collection, vector=query_vector, query_text=query |
|||
) |
|||
all_retrieved_results.extend(retrieved_results) |
|||
all_retrieved_results = deduplicate_results(all_retrieved_results) |
|||
chat_response = self.llm.chat( |
|||
[ |
|||
{ |
|||
"role": "user", |
|||
"content": INTERMEDIATE_ANSWER_PROMPT.format( |
|||
retrieved_documents=self._format_retrieved_results(all_retrieved_results), |
|||
sub_query=query, |
|||
), |
|||
} |
|||
] |
|||
) |
|||
return ( |
|||
self.llm.remove_think(chat_response.content), |
|||
all_retrieved_results, |
|||
consume_tokens + chat_response.total_tokens, |
|||
) |
|||
|
|||
def _get_supported_docs( |
|||
self, |
|||
retrieved_results: List[RetrievalResult], |
|||
query: str, |
|||
intermediate_answer: str, |
|||
) -> Tuple[List[RetrievalResult], int]: |
|||
supported_retrieved_results = [] |
|||
token_usage = 0 |
|||
if "No relevant information found" not in intermediate_answer: |
|||
chat_response = self.llm.chat( |
|||
[ |
|||
{ |
|||
"role": "user", |
|||
"content": GET_SUPPORTED_DOCS_PROMPT.format( |
|||
retrieved_documents=self._format_retrieved_results(retrieved_results), |
|||
query=query, |
|||
answer=intermediate_answer, |
|||
), |
|||
} |
|||
] |
|||
) |
|||
supported_doc_indices = self.llm.literal_eval(chat_response.content) |
|||
supported_retrieved_results = [ |
|||
retrieved_results[int(i)] |
|||
for i in supported_doc_indices |
|||
if int(i) < len(retrieved_results) |
|||
] |
|||
token_usage = chat_response.total_tokens |
|||
return supported_retrieved_results, token_usage |
|||
|
|||
def _check_has_enough_info( |
|||
self, query: str, intermediate_contexts: List[str] |
|||
) -> Tuple[bool, int]: |
|||
if not intermediate_contexts: |
|||
return False, 0 |
|||
|
|||
chat_response = self.llm.chat( |
|||
[ |
|||
{ |
|||
"role": "user", |
|||
"content": REFLECTION_PROMPT.format( |
|||
query=query, |
|||
intermediate_context="\n".join(intermediate_contexts), |
|||
), |
|||
} |
|||
] |
|||
) |
|||
has_enough_info = self.llm.remove_think(chat_response.content).strip().lower() == "yes" |
|||
return has_enough_info, chat_response.total_tokens |
|||
|
|||
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]: |
|||
""" |
|||
Retrieves relevant documents based on the input query and iteratively refines the search. |
|||
|
|||
This method iteratively refines the search query based on intermediate results, retrieves documents, |
|||
and filters out supported documents. It keeps track of the intermediate contexts and token usage. |
|||
|
|||
Args: |
|||
query (str): The initial search query. |
|||
**kwargs: Additional keyword arguments. |
|||
- max_iter (int, optional): The maximum number of iterations for refinement. Defaults to self.max_iter. |
|||
|
|||
Returns: |
|||
Tuple[List[RetrievalResult], int, dict]: A tuple containing: |
|||
- List[RetrievalResult]: The list of all retrieved and deduplicated results. |
|||
- int: The total token usage across all iterations. |
|||
- dict: A dictionary containing additional information, including the intermediate contexts. |
|||
""" |
|||
max_iter = kwargs.pop("max_iter", self.max_iter) |
|||
intermediate_contexts = [] |
|||
all_retrieved_results = [] |
|||
token_usage = 0 |
|||
for iter in range(max_iter): |
|||
log.color_print(f">> Iteration: {iter + 1}\n") |
|||
followup_query, n_token0 = self._reflect_get_subquery(query, intermediate_contexts) |
|||
intermediate_answer, retrieved_results, n_token1 = self._retrieve_and_answer( |
|||
followup_query |
|||
) |
|||
supported_retrieved_results, n_token2 = self._get_supported_docs( |
|||
retrieved_results, followup_query, intermediate_answer |
|||
) |
|||
|
|||
all_retrieved_results.extend(supported_retrieved_results) |
|||
intermediate_idx = len(intermediate_contexts) + 1 |
|||
intermediate_contexts.append( |
|||
f"Intermediate query{intermediate_idx}: {followup_query}\nIntermediate answer{intermediate_idx}: {intermediate_answer}" |
|||
) |
|||
token_usage += n_token0 + n_token1 + n_token2 |
|||
|
|||
if self.early_stopping: |
|||
has_enough_info, n_token_check = self._check_has_enough_info( |
|||
query, intermediate_contexts |
|||
) |
|||
token_usage += n_token_check |
|||
|
|||
if has_enough_info: |
|||
log.color_print( |
|||
f"<think> Early stopping after iteration {iter + 1}: Have enough information to answer the main query. </think>\n" |
|||
) |
|||
break |
|||
|
|||
all_retrieved_results = deduplicate_results(all_retrieved_results) |
|||
additional_info = {"intermediate_context": intermediate_contexts} |
|||
return all_retrieved_results, token_usage, additional_info |
|||
|
|||
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]: |
|||
""" |
|||
Executes a query and returns the final answer along with all retrieved results and total token usage. |
|||
|
|||
This method initiates a query, retrieves relevant documents, and then summarizes the answer based on the retrieved documents and intermediate contexts. It logs the final answer and returns the answer content, all retrieved results, and the total token usage including the tokens used for the final answer. |
|||
|
|||
Args: |
|||
query (str): The initial query to execute. |
|||
**kwargs: Additional keyword arguments to pass to the `retrieve` method. |
|||
|
|||
Returns: |
|||
Tuple[str, List[RetrievalResult], int]: A tuple containing: |
|||
- str: The final answer content. |
|||
- List[RetrievalResult]: The list of all retrieved and deduplicated results. |
|||
- int: The total token usage across all iterations, including the final answer. |
|||
""" |
|||
all_retrieved_results, n_token_retrieval, additional_info = self.retrieve(query, **kwargs) |
|||
intermediate_context = additional_info["intermediate_context"] |
|||
log.color_print( |
|||
f"<think> Summarize answer from all {len(all_retrieved_results)} retrieved chunks... </think>\n" |
|||
) |
|||
chat_response = self.llm.chat( |
|||
[ |
|||
{ |
|||
"role": "user", |
|||
"content": FINAL_ANSWER_PROMPT.format( |
|||
retrieved_documents=self._format_retrieved_results(all_retrieved_results), |
|||
intermediate_context="\n".join(intermediate_context), |
|||
query=query, |
|||
), |
|||
} |
|||
] |
|||
) |
|||
log.color_print("\n==== FINAL ANSWER ====\n") |
|||
log.color_print(self.llm.remove_think(chat_response.content)) |
|||
return ( |
|||
self.llm.remove_think(chat_response.content), |
|||
all_retrieved_results, |
|||
n_token_retrieval + chat_response.total_tokens, |
|||
) |
|||
|
|||
def _format_retrieved_results(self, retrieved_results: List[RetrievalResult]) -> str: |
|||
formatted_documents = [] |
|||
for i, result in enumerate(retrieved_results): |
|||
if self.text_window_splitter and "wider_text" in result.metadata: |
|||
text = result.metadata["wider_text"] |
|||
else: |
|||
text = result.text |
|||
formatted_documents.append(f"<Document {i}>\n{text}\n</Document {i}>") |
|||
return "\n".join(formatted_documents) |
@ -0,0 +1,98 @@ |
|||
from typing import List, Tuple |
|||
|
|||
from deepsearcher.agent.base import BaseAgent |
|||
from deepsearcher.llm.base import BaseLLM |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db.base import BaseVectorDB |
|||
|
|||
COLLECTION_ROUTE_PROMPT = """ |
|||
I provide you with collection_name(s) and corresponding collection_description(s). Please select the collection names that may be related to the question and return a python list of str. If there is no collection related to the question, you can return an empty list. |
|||
|
|||
"QUESTION": {question} |
|||
"COLLECTION_INFO": {collection_info} |
|||
|
|||
When you return, you can ONLY return a JSON-convertible python list of str, WITHOUT any other additional content. Your selected collection name list is: |
|||
""" |
|||
|
|||
|
|||
class CollectionRouter(BaseAgent): |
|||
""" |
|||
Routes queries to appropriate collections in the vector database. |
|||
|
|||
This class analyzes the content of a query and determines which collections |
|||
in the vector database are most likely to contain relevant information. |
|||
""" |
|||
|
|||
def __init__(self, llm: BaseLLM, vector_db: BaseVectorDB, dim: int, **kwargs): |
|||
""" |
|||
Initialize the CollectionRouter. |
|||
|
|||
Args: |
|||
llm: The language model to use for analyzing queries. |
|||
vector_db: The vector database containing the collections. |
|||
dim: The dimension of the vector space to search in. |
|||
""" |
|||
self.llm = llm |
|||
self.vector_db = vector_db |
|||
self.all_collections = [ |
|||
collection_info.collection_name |
|||
for collection_info in self.vector_db.list_collections(dim=dim) |
|||
] |
|||
|
|||
def invoke(self, query: str, dim: int, **kwargs) -> Tuple[List[str], int]: |
|||
""" |
|||
Determine which collections are relevant for the given query. |
|||
|
|||
This method analyzes the query content and selects collections that are |
|||
most likely to contain information relevant to answering the query. |
|||
|
|||
Args: |
|||
query (str): The query to analyze. |
|||
dim (int): The dimension of the vector space to search in. |
|||
|
|||
Returns: |
|||
Tuple[List[str], int]: A tuple containing: |
|||
- A list of selected collection names |
|||
- The token usage for the routing operation |
|||
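Example (illustrative; `router` is an already-constructed CollectionRouter):

    selected_collections, n_tokens = router.invoke(query="What is Milvus?", dim=embedding_model.dimension)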
""" |
|||
consume_tokens = 0 |
|||
collection_infos = self.vector_db.list_collections(dim=dim) |
|||
if len(collection_infos) == 0: |
|||
log.color_print( |
|||
"No collections found in the vector database. Please check the database connection." |
|||
) |
|||
return [], 0 |
|||
if len(collection_infos) == 1: |
|||
the_only_collection = collection_infos[0].collection_name |
|||
log.color_print( |
|||
f"<think> Perform search [{query}] on the vector DB collection: {the_only_collection} </think>\n" |
|||
) |
|||
return [the_only_collection], 0 |
|||
vector_db_search_prompt = COLLECTION_ROUTE_PROMPT.format( |
|||
question=query, |
|||
collection_info=[ |
|||
{ |
|||
"collection_name": collection_info.collection_name, |
|||
"collection_description": collection_info.description, |
|||
} |
|||
for collection_info in collection_infos |
|||
], |
|||
) |
|||
chat_response = self.llm.chat( |
|||
messages=[{"role": "user", "content": vector_db_search_prompt}] |
|||
) |
|||
selected_collections = self.llm.literal_eval(chat_response.content) |
|||
consume_tokens += chat_response.total_tokens |
|||
|
|||
for collection_info in collection_infos: |
|||
# A collection without a description cannot be judged by the LLM, so always include it |
|||
if not collection_info.description: |
|||
selected_collections.append(collection_info.collection_name) |
|||
# Always include the default collection, if one is configured |
|||
if self.vector_db.default_collection == collection_info.collection_name: |
|||
selected_collections.append(collection_info.collection_name) |
|||
selected_collections = list(set(selected_collections)) |
|||
log.color_print( |
|||
f"<think> Perform search [{query}] on the vector DB collections: {selected_collections} </think>\n" |
|||
) |
|||
return selected_collections, consume_tokens |
@ -0,0 +1,319 @@ |
|||
import asyncio |
|||
from typing import List, Tuple |
|||
|
|||
from deepsearcher.agent.base import RAGAgent, describe_class |
|||
from deepsearcher.agent.collection_router import CollectionRouter |
|||
from deepsearcher.embedding.base import BaseEmbedding |
|||
from deepsearcher.llm.base import BaseLLM |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db import RetrievalResult |
|||
from deepsearcher.vector_db.base import BaseVectorDB, deduplicate_results |
|||
|
|||
SUB_QUERY_PROMPT = """To answer this question more comprehensively, please break down the original question into up to four sub-questions. Return them as a python list of str. |
|||
If this is a very simple question and no decomposition is necessary, then keep only the original question in the python code list. |
|||
|
|||
Original Question: {original_query} |
|||
|
|||
|
|||
<EXAMPLE> |
|||
Example input: |
|||
"Explain deep learning" |
|||
|
|||
Example output: |
|||
[ |
|||
"What is deep learning?", |
|||
"What is the difference between deep learning and machine learning?", |
|||
"What is the history of deep learning?" |
|||
] |
|||
</EXAMPLE> |
|||
|
|||
Provide your response in a python code list of str format: |
|||
""" |
|||
|
|||
RERANK_PROMPT = """Based on the query questions and the retrieved chunk, determine whether the chunk is helpful in answering any of the query questions. You can only return "YES" or "NO", without any other information. |
|||
|
|||
Query Questions: {query} |
|||
Retrieved Chunk: {retrieved_chunk} |
|||
|
|||
Is the chunk helpful in answering any of the questions? |
|||
""" |
|||
|
|||
|
|||
REFLECT_PROMPT = """Determine whether additional search queries are needed based on the original query, previous sub queries, and all retrieved document chunks. If further research is required, provide a Python list of up to 3 search queries. If no further research is required, return an empty list. |
|||
|
|||
If the original query is to write a report, then prefer to generate further queries rather than returning an empty list. |
|||
|
|||
Original Query: {question} |
|||
|
|||
Previous Sub Queries: {mini_questions} |
|||
|
|||
Related Chunks: |
|||
{mini_chunk_str} |
|||
|
|||
Respond exclusively in valid List of str format without any other text.""" |
|||
|
|||
|
|||
SUMMARY_PROMPT = """You are an AI content analysis expert, good at summarizing content. Please summarize a specific and detailed answer or report based on the previous queries and the retrieved document chunks. |
|||
|
|||
Original Query: {question} |
|||
|
|||
Previous Sub Queries: {mini_questions} |
|||
|
|||
Related Chunks: |
|||
{mini_chunk_str} |
|||
|
|||
""" |
|||
|
|||
|
|||
@describe_class( |
|||
"This agent is suitable for handling general and simple queries, such as given a topic and then writing a report, survey, or article." |
|||
) |
|||
class DeepSearch(RAGAgent): |
|||
""" |
|||
Deep Search agent implementation for comprehensive information retrieval. |
|||
|
|||
This agent performs a thorough search through the knowledge base, analyzing |
|||
multiple aspects of the query to provide comprehensive and detailed answers. |
|||
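Example (illustrative sketch; assumes already-configured llm, embedding_model and vector_db instances):

    agent = DeepSearch(llm=llm, embedding_model=embedding_model, vector_db=vector_db, max_iter=3)
    report, retrieved_results, token_usage = agent.query("Write a report about the history of vector databases.")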
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
llm: BaseLLM, |
|||
embedding_model: BaseEmbedding, |
|||
vector_db: BaseVectorDB, |
|||
max_iter: int = 3, |
|||
route_collection: bool = True, |
|||
text_window_splitter: bool = True, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize the DeepSearch agent. |
|||
|
|||
Args: |
|||
llm: The language model to use for generating answers. |
|||
embedding_model: The embedding model to use for query embedding. |
|||
vector_db: The vector database to search for relevant documents. |
|||
max_iter: The maximum number of iterations for the search process. |
|||
route_collection: Whether to use a collection router for search. |
|||
text_window_splitter: Whether to use text_window splitter. |
|||
**kwargs: Additional keyword arguments for customization. |
|||
""" |
|||
self.llm = llm |
|||
self.embedding_model = embedding_model |
|||
self.vector_db = vector_db |
|||
self.max_iter = max_iter |
|||
self.route_collection = route_collection |
|||
self.collection_router = CollectionRouter( |
|||
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension |
|||
) |
|||
self.text_window_splitter = text_window_splitter |
|||
|
|||
def _generate_sub_queries(self, original_query: str) -> Tuple[List[str], int]: |
|||
chat_response = self.llm.chat( |
|||
messages=[ |
|||
{"role": "user", "content": SUB_QUERY_PROMPT.format(original_query=original_query)} |
|||
] |
|||
) |
|||
response_content = self.llm.remove_think(chat_response.content) |
|||
return self.llm.literal_eval(response_content), chat_response.total_tokens |
|||
|
|||
async def _search_chunks_from_vectordb(self, query: str, sub_queries: List[str]): |
|||
consume_tokens = 0 |
|||
if self.route_collection: |
|||
selected_collections, n_token_route = self.collection_router.invoke( |
|||
query=query, dim=self.embedding_model.dimension |
|||
) |
|||
else: |
|||
selected_collections = self.collection_router.all_collections |
|||
n_token_route = 0 |
|||
consume_tokens += n_token_route |
|||
|
|||
all_retrieved_results = [] |
|||
query_vector = self.embedding_model.embed_query(query) |
|||
for collection in selected_collections: |
|||
log.color_print(f"<search> Search [{query}] in [{collection}]... </search>\n") |
|||
retrieved_results = self.vector_db.search_data( |
|||
collection=collection, vector=query_vector, query_text=query |
|||
) |
|||
if not retrieved_results or len(retrieved_results) == 0: |
|||
log.color_print( |
|||
f"<search> No relevant document chunks found in '{collection}'! </search>\n" |
|||
) |
|||
continue |
|||
accepted_chunk_num = 0 |
|||
references = set() |
|||
for retrieved_result in retrieved_results: |
|||
chat_response = self.llm.chat( |
|||
messages=[ |
|||
{ |
|||
"role": "user", |
|||
"content": RERANK_PROMPT.format( |
|||
query=[query] + sub_queries, |
|||
retrieved_chunk=f"<chunk>{retrieved_result.text}</chunk>", |
|||
), |
|||
} |
|||
] |
|||
) |
|||
consume_tokens += chat_response.total_tokens |
|||
response_content = self.llm.remove_think(chat_response.content).strip() |
|||
if "YES" in response_content and "NO" not in response_content: |
|||
all_retrieved_results.append(retrieved_result) |
|||
accepted_chunk_num += 1 |
|||
references.add(retrieved_result.reference) |
|||
if accepted_chunk_num > 0: |
|||
log.color_print( |
|||
f"<search> Accept {accepted_chunk_num} document chunk(s) from references: {list(references)} </search>\n" |
|||
) |
|||
else: |
|||
log.color_print( |
|||
f"<search> No document chunk accepted from '{collection}'! </search>\n" |
|||
) |
|||
return all_retrieved_results, consume_tokens |
|||
|
|||
def _generate_gap_queries( |
|||
self, original_query: str, all_sub_queries: List[str], all_chunks: List[RetrievalResult] |
|||
) -> Tuple[List[str], int]: |
|||
reflect_prompt = REFLECT_PROMPT.format( |
|||
question=original_query, |
|||
mini_questions=all_sub_queries, |
|||
mini_chunk_str=self._format_chunk_texts([chunk.text for chunk in all_chunks]) |
|||
if len(all_chunks) > 0 |
|||
else "NO RELATED CHUNKS FOUND.", |
|||
) |
|||
chat_response = self.llm.chat([{"role": "user", "content": reflect_prompt}]) |
|||
response_content = self.llm.remove_think(chat_response.content) |
|||
return self.llm.literal_eval(response_content), chat_response.total_tokens |
|||
|
|||
def retrieve(self, original_query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]: |
|||
""" |
|||
Retrieve relevant documents from the knowledge base for the given query. |
|||
|
|||
This method performs a deep search through the vector database to find |
|||
the most relevant documents for answering the query. |
|||
|
|||
Args: |
|||
original_query (str): The query to search for. |
|||
**kwargs: Additional keyword arguments for customizing the retrieval. |
|||
|
|||
Returns: |
|||
Tuple[List[RetrievalResult], int, dict]: A tuple containing: |
|||
- A list of retrieved document results |
|||
- The token usage for the retrieval operation |
|||
- Additional information about the retrieval process |
|||
""" |
|||
return asyncio.run(self.async_retrieve(original_query, **kwargs)) |
|||
|
|||
async def async_retrieve( |
|||
self, original_query: str, **kwargs |
|||
) -> Tuple[List[RetrievalResult], int, dict]: |
|||
max_iter = kwargs.pop("max_iter", self.max_iter) |
|||
### SUB QUERIES ### |
|||
log.color_print(f"<query> {original_query} </query>\n") |
|||
all_search_res = [] |
|||
all_sub_queries = [] |
|||
total_tokens = 0 |
|||
|
|||
sub_queries, used_token = self._generate_sub_queries(original_query) |
|||
total_tokens += used_token |
|||
if not sub_queries: |
|||
log.color_print("No sub queries were generated by the LLM. Exiting.") |
|||
return [], total_tokens, {} |
|||
else: |
|||
log.color_print( |
|||
f"<think> Break down the original query into new sub queries: {sub_queries}</think>\n" |
|||
) |
|||
all_sub_queries.extend(sub_queries) |
|||
sub_gap_queries = sub_queries |
|||
|
|||
for iter in range(max_iter): |
|||
log.color_print(f">> Iteration: {iter + 1}\n") |
|||
search_res_from_vectordb = [] |
|||
search_res_from_internet = [] # TODO |
|||
|
|||
# Create all search tasks |
|||
search_tasks = [ |
|||
self._search_chunks_from_vectordb(query, sub_gap_queries) |
|||
for query in sub_gap_queries |
|||
] |
|||
# Execute all tasks in parallel and wait for results |
|||
search_results = await asyncio.gather(*search_tasks) |
|||
# Merge all results |
|||
for result in search_results: |
|||
search_res, consumed_token = result |
|||
total_tokens += consumed_token |
|||
search_res_from_vectordb.extend(search_res) |
|||
|
|||
search_res_from_vectordb = deduplicate_results(search_res_from_vectordb) |
|||
# search_res_from_internet = deduplicate_results(search_res_from_internet) |
|||
all_search_res.extend(search_res_from_vectordb + search_res_from_internet) |
|||
if iter == max_iter - 1: |
|||
log.color_print("<think> Exceeded maximum iterations. Exiting. </think>\n") |
|||
break |
|||
### REFLECTION & GET GAP QUERIES ### |
|||
log.color_print("<think> Reflecting on the search results... </think>\n") |
|||
sub_gap_queries, consumed_token = self._generate_gap_queries( |
|||
original_query, all_sub_queries, all_search_res |
|||
) |
|||
total_tokens += consumed_token |
|||
if not sub_gap_queries or len(sub_gap_queries) == 0: |
|||
log.color_print("<think> No new search queries were generated. Exiting. </think>\n") |
|||
break |
|||
else: |
|||
log.color_print( |
|||
f"<think> New search queries for next iteration: {sub_gap_queries} </think>\n" |
|||
) |
|||
all_sub_queries.extend(sub_gap_queries) |
|||
|
|||
all_search_res = deduplicate_results(all_search_res) |
|||
additional_info = {"all_sub_queries": all_sub_queries} |
|||
return all_search_res, total_tokens, additional_info |
|||
|
|||
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]: |
|||
""" |
|||
Query the agent and generate an answer based on retrieved documents. |
|||
|
|||
This method retrieves relevant documents and uses the language model |
|||
to generate a comprehensive answer to the query. |
|||
|
|||
Args: |
|||
query (str): The query to answer. |
|||
**kwargs: Additional keyword arguments for customizing the query process. |
|||
|
|||
Returns: |
|||
Tuple[str, List[RetrievalResult], int]: A tuple containing: |
|||
- The generated answer |
|||
- A list of retrieved document results |
|||
- The total token usage |
|||
""" |
|||
all_retrieved_results, n_token_retrieval, additional_info = self.retrieve(query, **kwargs) |
|||
if not all_retrieved_results or len(all_retrieved_results) == 0: |
|||
return f"No relevant information found for query '{query}'.", [], n_token_retrieval |
|||
all_sub_queries = additional_info["all_sub_queries"] |
|||
chunk_texts = [] |
|||
for chunk in all_retrieved_results: |
|||
if self.text_window_splitter and "wider_text" in chunk.metadata: |
|||
chunk_texts.append(chunk.metadata["wider_text"]) |
|||
else: |
|||
chunk_texts.append(chunk.text) |
|||
log.color_print( |
|||
f"<think> Summarize answer from all {len(all_retrieved_results)} retrieved chunks... </think>\n" |
|||
) |
|||
summary_prompt = SUMMARY_PROMPT.format( |
|||
question=query, |
|||
mini_questions=all_sub_queries, |
|||
mini_chunk_str=self._format_chunk_texts(chunk_texts), |
|||
) |
|||
chat_response = self.llm.chat([{"role": "user", "content": summary_prompt}]) |
|||
log.color_print("\n==== FINAL ANSWER ====\n") |
|||
log.color_print(self.llm.remove_think(chat_response.content)) |
|||
return ( |
|||
self.llm.remove_think(chat_response.content), |
|||
all_retrieved_results, |
|||
n_token_retrieval + chat_response.total_tokens, |
|||
) |
|||
|
|||
def _format_chunk_texts(self, chunk_texts: List[str]) -> str: |
|||
chunk_str = "" |
|||
for i, chunk in enumerate(chunk_texts): |
|||
chunk_str += f"""<chunk_{i}>\n{chunk}\n</chunk_{i}>\n""" |
|||
return chunk_str |
@ -0,0 +1,128 @@ |
|||
from typing import List, Tuple |
|||
|
|||
from deepsearcher.agent.base import RAGAgent |
|||
from deepsearcher.agent.collection_router import CollectionRouter |
|||
from deepsearcher.embedding.base import BaseEmbedding |
|||
from deepsearcher.llm.base import BaseLLM |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db.base import BaseVectorDB, RetrievalResult, deduplicate_results |
|||
|
|||
SUMMARY_PROMPT = """You are an AI content analysis expert, good at summarizing content. Please summarize a specific and detailed answer or report based on the previous queries and the retrieved document chunks. |
|||
|
|||
Original Query: {query} |
|||
|
|||
Related Chunks: |
|||
{mini_chunk_str} |
|||
""" |
|||
|
|||
|
|||
class NaiveRAG(RAGAgent): |
|||
""" |
|||
Naive Retrieval-Augmented Generation agent implementation. |
|||
|
|||
This agent implements a straightforward RAG approach, retrieving relevant |
|||
documents and generating answers without complex processing or refinement steps. |
|||
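Example (illustrative sketch; assumes already-configured llm, embedding_model and vector_db instances):

    agent = NaiveRAG(llm=llm, embedding_model=embedding_model, vector_db=vector_db, top_k=10)
    answer, retrieved_results, token_usage = agent.query("What is DeepSearcher?")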
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
llm: BaseLLM, |
|||
embedding_model: BaseEmbedding, |
|||
vector_db: BaseVectorDB, |
|||
top_k: int = 10, |
|||
route_collection: bool = True, |
|||
text_window_splitter: bool = True, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize the NaiveRAG agent. |
|||
|
|||
Args: |
|||
llm: The language model to use for generating answers. |
|||
embedding_model: The embedding model to use for query embedding. |
|||
vector_db: The vector database to search for relevant documents. |
|||
top_k: The number of document chunks to retrieve per query. Defaults to 10. |
|||
route_collection: Whether to route the query to specific collections. Defaults to True. |
|||
text_window_splitter: Whether to use the text window splitter. Defaults to True. |
|||
**kwargs: Additional keyword arguments for customization. |
|||
""" |
|||
self.llm = llm |
|||
self.embedding_model = embedding_model |
|||
self.vector_db = vector_db |
|||
self.top_k = top_k |
|||
self.route_collection = route_collection |
|||
# Always create the router so that `retrieve` can fall back to |
|||
# `self.collection_router.all_collections` when route_collection is False. |
|||
self.collection_router = CollectionRouter( |
|||
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension |
|||
) |
|||
self.text_window_splitter = text_window_splitter |
|||
|
|||
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]: |
|||
""" |
|||
Retrieve relevant documents from the knowledge base for the given query. |
|||
|
|||
This method performs a basic search through the vector database to find |
|||
documents relevant to the query. |
|||
|
|||
Args: |
|||
query (str): The query to search for. |
|||
**kwargs: Additional keyword arguments for customizing the retrieval. |
|||
|
|||
Returns: |
|||
Tuple[List[RetrievalResult], int, dict]: A tuple containing: |
|||
- A list of retrieved document results |
|||
- The token usage for the retrieval operation |
|||
- Additional information about the retrieval process |
|||
""" |
|||
consume_tokens = 0 |
|||
if self.route_collection: |
|||
selected_collections, n_token_route = self.collection_router.invoke( |
|||
query=query, dim=self.embedding_model.dimension |
|||
) |
|||
else: |
|||
selected_collections = self.collection_router.all_collections |
|||
n_token_route = 0 |
|||
consume_tokens += n_token_route |
|||
all_retrieved_results = [] |
|||
for collection in selected_collections: |
|||
retrieval_res = self.vector_db.search_data( |
|||
collection=collection, |
|||
vector=self.embedding_model.embed_query(query), |
|||
top_k=max(self.top_k // len(selected_collections), 1), |
|||
query_text=query, |
|||
) |
|||
all_retrieved_results.extend(retrieval_res) |
|||
all_retrieved_results = deduplicate_results(all_retrieved_results) |
|||
return all_retrieved_results, consume_tokens, {} |
|||
|
|||
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]: |
|||
""" |
|||
Query the agent and generate an answer based on retrieved documents. |
|||
|
|||
This method retrieves relevant documents and uses the language model |
|||
to generate a simple answer to the query. |
|||
|
|||
Args: |
|||
query (str): The query to answer. |
|||
**kwargs: Additional keyword arguments for customizing the query process. |
|||
|
|||
Returns: |
|||
Tuple[str, List[RetrievalResult], int]: A tuple containing: |
|||
- The generated answer |
|||
- A list of retrieved document results |
|||
- The total token usage |
|||
""" |
|||
all_retrieved_results, n_token_retrieval, _ = self.retrieve(query, **kwargs) |
|||
chunk_texts = [] |
|||
for chunk in all_retrieved_results: |
|||
if self.text_window_splitter and "wider_text" in chunk.metadata: |
|||
chunk_texts.append(chunk.metadata["wider_text"]) |
|||
else: |
|||
chunk_texts.append(chunk.text) |
|||
mini_chunk_str = "" |
|||
for i, chunk in enumerate(chunk_texts): |
|||
mini_chunk_str += f"""<chunk_{i}>\n{chunk}\n</chunk_{i}>\n""" |
|||
|
|||
summary_prompt = SUMMARY_PROMPT.format(query=query, mini_chunk_str=mini_chunk_str) |
|||
chat_response = self.llm.chat([{"role": "user", "content": summary_prompt}]) |
|||
final_answer = chat_response.content |
|||
log.color_print("\n==== FINAL ANSWER ====\n") |
|||
log.color_print(final_answer) |
|||
return final_answer, all_retrieved_results, n_token_retrieval + chat_response.total_tokens |
@ -0,0 +1,93 @@ |
|||
from typing import List, Optional, Tuple |
|||
|
|||
from deepsearcher.agent import RAGAgent |
|||
from deepsearcher.llm.base import BaseLLM |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db import RetrievalResult |
|||
|
|||
RAG_ROUTER_PROMPT = """You are given a list of agent indexes and corresponding descriptions, where each agent has a specific function. |
|||
Given a query, select the single agent that is best suited to handle the query, and return only its index without any other information. |
|||
|
|||
## Question |
|||
{query} |
|||
|
|||
## Agent Indexes and Descriptions |
|||
{description_str} |
|||
|
|||
Only return one agent index number that best matches the agent handling the query: |
|||
""" |
|||
|
|||
|
|||
class RAGRouter(RAGAgent): |
|||
""" |
|||
Routes queries to the most appropriate RAG agent implementation. |
|||
|
|||
This class analyzes the content and requirements of a query and determines |
|||
which RAG agent implementation is best suited to handle it. |
|||
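Example (illustrative sketch, mirroring how init_config wires the default searcher; the agent variable names are placeholders):

    router = RAGRouter(llm=llm, rag_agents=[deep_search_agent, chain_of_rag_agent])
    answer, retrieved_results, token_usage = router.query("Write a survey about vector databases.")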
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
llm: BaseLLM, |
|||
rag_agents: List[RAGAgent], |
|||
agent_descriptions: Optional[List[str]] = None, |
|||
): |
|||
""" |
|||
Initialize the RAGRouter. |
|||
|
|||
Args: |
|||
llm: The language model to use for analyzing queries. |
|||
rag_agents: A list of RAGAgent instances. |
|||
agent_descriptions (list, optional): A list of descriptions for each agent. |
|||
""" |
|||
self.llm = llm |
|||
self.rag_agents = rag_agents |
|||
self.agent_descriptions = agent_descriptions |
|||
if not self.agent_descriptions: |
|||
try: |
|||
self.agent_descriptions = [ |
|||
agent.__class__.__description__ for agent in self.rag_agents |
|||
] |
|||
except Exception: |
|||
raise AttributeError( |
|||
"Please provide agent descriptions or set __description__ attribute for each agent class." |
|||
) |
|||
|
|||
def _route(self, query: str) -> Tuple[RAGAgent, int]: |
|||
description_str = "\n".join( |
|||
[f"[{i + 1}]: {description}" for i, description in enumerate(self.agent_descriptions)] |
|||
) |
|||
prompt = RAG_ROUTER_PROMPT.format(query=query, description_str=description_str) |
|||
chat_response = self.llm.chat(messages=[{"role": "user", "content": prompt}]) |
|||
try: |
|||
selected_agent_index = int(self.llm.remove_think(chat_response.content)) - 1 |
|||
except ValueError: |
|||
# Some reasoning LLMs do not output a bare number, but an explanation string with a number at the end. |
|||
log.warning( |
|||
"Parse int failed in RAGRouter, but will try to find the last digit as fallback." |
|||
) |
|||
selected_agent_index = ( |
|||
int(self.find_last_digit(self.llm.remove_think(chat_response.content))) - 1 |
|||
) |
|||
|
|||
selected_agent = self.rag_agents[selected_agent_index] |
|||
log.color_print( |
|||
f"<think> Select agent [{selected_agent.__class__.__name__}] to answer the query [{query}] </think>\n" |
|||
) |
|||
return selected_agent, chat_response.total_tokens |
|||
|
|||
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]: |
|||
agent, n_token_router = self._route(query) |
|||
retrieved_results, n_token_retrieval, metadata = agent.retrieve(query, **kwargs) |
|||
return retrieved_results, n_token_router + n_token_retrieval, metadata |
|||
|
|||
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]: |
|||
agent, n_token_router = self._route(query) |
|||
answer, retrieved_results, n_token_retrieval = agent.query(query, **kwargs) |
|||
return answer, retrieved_results, n_token_router + n_token_retrieval |
|||
|
|||
def find_last_digit(self, string): |
|||
for char in reversed(string): |
|||
if char.isdigit(): |
|||
return char |
|||
raise ValueError("No digit found in the string") |
@ -0,0 +1,118 @@ |
|||
import argparse |
|||
import logging |
|||
import sys |
|||
import warnings |
|||
|
|||
from deepsearcher.configuration import Configuration, init_config |
|||
from deepsearcher.offline_loading import load_from_local_files, load_from_website |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.utils import log |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
|
|||
warnings.simplefilter(action="ignore", category=FutureWarning) # disable warning output |
|||
|
|||
|
|||
def main(): |
|||
""" |
|||
Main entry point for the DeepSearcher CLI. |
|||
|
|||
This function parses command line arguments and executes the appropriate action |
|||
based on the subcommand provided (query or load). It handles the deprecated |
|||
command line format and provides helpful error messages. |
|||
|
|||
Returns: |
|||
None |
|||
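Example invocations (matching the subcommands defined below; placeholders in angle brackets):

    deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name>
    deepsearcher query "<your_query>" --max_iter 3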
""" |
|||
if "--query" in sys.argv or "--load" in sys.argv: |
|||
print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.") |
|||
print("Please use:") |
|||
print(" deepsearcher query <your_query> --max_iter 3") |
|||
print( |
|||
" deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>" |
|||
) |
|||
sys.exit(1) |
|||
|
|||
config = Configuration() # Customize your config here |
|||
init_config(config=config) |
|||
|
|||
parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.") |
|||
subparsers = parser.add_subparsers(dest="subcommand", title="subcommands") |
|||
|
|||
## Arguments of query |
|||
query_parser = subparsers.add_parser("query", help="Query a question or search topic.") |
|||
query_parser.add_argument("query", type=str, default="", help="query question or search topic.") |
|||
query_parser.add_argument( |
|||
"--max_iter", |
|||
type=int, |
|||
default=3, |
|||
help="Max iterations of reflection. Default is 3.", |
|||
) |
|||
|
|||
## Arguments of loading |
|||
load_parser = subparsers.add_parser( |
|||
"load", help="Load knowledge from local files or from URLs." |
|||
) |
|||
load_parser.add_argument( |
|||
"load_path", |
|||
type=str, |
|||
nargs="+", # 1 or more files or urls |
|||
help="Load knowledge from local files or from URLs.", |
|||
) |
|||
load_parser.add_argument( |
|||
"--batch_size", |
|||
type=int, |
|||
default=256, |
|||
help="Batch size for loading knowledge.", |
|||
) |
|||
load_parser.add_argument( |
|||
"--collection_name", |
|||
type=str, |
|||
default=None, |
|||
help="Destination collection name of loaded knowledge.", |
|||
) |
|||
load_parser.add_argument( |
|||
"--collection_desc", |
|||
type=str, |
|||
default=None, |
|||
help="Description of the collection.", |
|||
) |
|||
load_parser.add_argument( |
|||
"--force_new_collection", |
|||
action="store_true", |
|||
default=False, |
|||
help="If set, drop the original collection and create a new collection on every load.", |
|||
) |
|||
|
|||
args = parser.parse_args() |
|||
if args.subcommand == "query": |
|||
final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter) |
|||
log.color_print("\n==== FINAL ANSWER ====\n") |
|||
log.color_print(final_answer) |
|||
log.color_print("\n### References\n") |
|||
for i, ref in enumerate(refs): |
|||
log.color_print(f"{i + 1}. {ref.text[:60]}… {ref.reference}") |
|||
elif args.subcommand == "load": |
|||
urls = [url for url in args.load_path if url.startswith("http")] |
|||
local_files = [file for file in args.load_path if not file.startswith("http")] |
|||
kwargs = {} |
|||
if args.collection_name: |
|||
kwargs["collection_name"] = args.collection_name |
|||
if args.collection_desc: |
|||
kwargs["collection_description"] = args.collection_desc |
|||
if args.force_new_collection: |
|||
kwargs["force_new_collection"] = args.force_new_collection |
|||
if args.batch_size: |
|||
kwargs["batch_size"] = args.batch_size |
|||
if len(urls) > 0: |
|||
load_from_website(urls, **kwargs) |
|||
if len(local_files) > 0: |
|||
load_from_local_files(local_files, **kwargs) |
|||
else: |
|||
print("Please provide a query or a load argument.") |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main() |
@ -0,0 +1,87 @@ |
|||
provide_settings: |
|||
llm: |
|||
provider: "OpenAILLM" |
|||
config: |
|||
model: "Qwen/Qwen3-8B-FP8" |
|||
api_key: "empty" |
|||
base_url: "http://localhost:8000/v1" |
|||
|
|||
embedding: |
|||
provider: "OpenAIEmbedding" |
|||
config: |
|||
model: "Qwen/Qwen3-Embedding-0.6B" |
|||
api_key: "empty" |
|||
base_url: "http://localhost:8001/v1" |
|||
dimension: 1024 |
|||
dim_change: false |
|||
|
|||
file_loader: |
|||
provider: "PDFLoader" |
|||
config: {} |
|||
|
|||
# provider: "JsonFileLoader" |
|||
# config: |
|||
# text_key: "" |
|||
|
|||
# provider: "TextLoader" |
|||
# config: {} |
|||
|
|||
# provider: "UnstructuredLoader" |
|||
# config: {} |
|||
|
|||
# provider: "DoclingLoader" |
|||
# config: {} |
|||
|
|||
|
|||
web_crawler: |
|||
provider: "FireCrawlCrawler" |
|||
config: {} |
|||
|
|||
# provider: "Crawl4AICrawler" |
|||
# config: # Uncomment to customize the browser configuration for Crawl4AI |
|||
# browser_config: |
|||
# headless: false |
|||
# proxy: "http://127.0.0.1:7890" |
|||
# chrome_channel: "chrome" |
|||
# verbose: true |
|||
# viewport_width: 800 |
|||
# viewport_height: 600 |
|||
|
|||
# provider: "JinaCrawler" |
|||
# config: {} |
|||
|
|||
# provider: "DoclingCrawler" |
|||
# config: {} |
|||
|
|||
vector_db: |
|||
provider: "Milvus" |
|||
config: |
|||
default_collection: "deepsearcher" |
|||
uri: "http://localhost:19530" |
|||
token: "root:Milvus" |
|||
db: "default" |
|||
|
|||
# vector_db: |
|||
# provider: "OracleDB" |
|||
# config: |
|||
# default_collection: "deepsearcher" |
|||
# user: "" |
|||
# password: "" |
|||
# dsn: "" |
|||
# config_dir: "" |
|||
# wallet_location: "" |
|||
# wallet_password: "" |
|||
|
|||
# vector_db: |
|||
# provider: "Qdrant" |
|||
# config: |
|||
# default_collection: "deepsearcher" |
|||
# host: "localhost" |
|||
# port: 6333 |
|||
|
|||
query_settings: |
|||
max_iter: 2 |
|||
|
|||
load_settings: |
|||
chunk_size: 1024 |
|||
chunk_overlap: 128 |
@ -0,0 +1,240 @@ |
|||
import os |
|||
from typing import Literal |
|||
|
|||
import yaml |
|||
|
|||
from deepsearcher.agent import ChainOfRAG, DeepSearch, NaiveRAG |
|||
from deepsearcher.agent.rag_router import RAGRouter |
|||
from deepsearcher.embedding.base import BaseEmbedding |
|||
from deepsearcher.llm.base import BaseLLM |
|||
from deepsearcher.loader.file_loader.base import BaseLoader |
|||
from deepsearcher.loader.web_crawler.base import BaseCrawler |
|||
from deepsearcher.vector_db.base import BaseVectorDB |
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
DEFAULT_CONFIG_YAML_PATH = os.path.join(current_dir, "config.yaml") |
|||
|
|||
FeatureType = Literal["llm", "embedding", "file_loader", "web_crawler", "vector_db"] |
|||
|
|||
|
|||
class Configuration: |
|||
""" |
|||
Configuration class for DeepSearcher. |
|||
|
|||
This class manages the configuration settings for various components of the DeepSearcher system, |
|||
including LLM providers, embedding models, file loaders, web crawlers, and vector databases. |
|||
It loads configurations from a YAML file and provides methods to get and set provider configurations. |
|||
""" |
|||
|
|||
def __init__(self, config_path: str = DEFAULT_CONFIG_YAML_PATH): |
|||
""" |
|||
Initialize the Configuration object. |
|||
|
|||
Args: |
|||
config_path: Path to the configuration YAML file. Defaults to the config.yaml bundled with this module. |
|||
""" |
|||
# Initialize default configurations |
|||
config_data = self.load_config_from_yaml(config_path) |
|||
self.provide_settings = config_data["provide_settings"] |
|||
self.query_settings = config_data["query_settings"] |
|||
self.load_settings = config_data["load_settings"] |
|||
|
|||
def load_config_from_yaml(self, config_path: str): |
|||
""" |
|||
Load configuration from a YAML file. |
|||
|
|||
Args: |
|||
config_path: Path to the configuration YAML file. |
|||
|
|||
Returns: |
|||
The loaded configuration data as a dictionary. |
|||
""" |
|||
with open(config_path, "r") as file: |
|||
return yaml.safe_load(file) |
|||
|
|||
def set_provider_config(self, feature: FeatureType, provider: str, provider_configs: dict): |
|||
""" |
|||
Set the provider and its configurations for a given feature. |
|||
|
|||
Args: |
|||
feature: The feature to configure (e.g., 'llm', 'file_loader', 'web_crawler'). |
|||
provider: The provider class name (e.g., 'OpenAILLM', 'PDFLoader'). |
|||
provider_configs: A dictionary with configurations specific to the provider. |
|||
|
|||
Raises: |
|||
ValueError: If the feature is not supported. |
|||
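Example (provider name and config values taken from the bundled config.yaml):

    config.set_provider_config(
        "llm",
        "OpenAILLM",
        {"model": "Qwen/Qwen3-8B-FP8", "api_key": "empty", "base_url": "http://localhost:8000/v1"},
    )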
""" |
|||
if feature not in self.provide_settings: |
|||
raise ValueError(f"Unsupported feature: {feature}") |
|||
|
|||
self.provide_settings[feature]["provider"] = provider |
|||
self.provide_settings[feature]["config"] = provider_configs |
|||
|
|||
def get_provider_config(self, feature: FeatureType): |
|||
""" |
|||
Get the current provider and configuration for a given feature. |
|||
|
|||
Args: |
|||
feature: The feature to retrieve (e.g., 'llm', 'file_loader', 'web_crawler'). |
|||
|
|||
Returns: |
|||
A dictionary with provider and its configurations. |
|||
|
|||
Raises: |
|||
ValueError: If the feature is not supported. |
|||
""" |
|||
if feature not in self.provide_settings: |
|||
raise ValueError(f"Unsupported feature: {feature}") |
|||
|
|||
return self.provide_settings[feature] |
|||
|
|||
|
|||
class ModuleFactory: |
|||
""" |
|||
Factory class for creating instances of various modules in the DeepSearcher system. |
|||
|
|||
This class creates instances of LLMs, embedding models, file loaders, web crawlers, |
|||
and vector databases based on the configuration settings. |
|||
""" |
|||
|
|||
def __init__(self, config: Configuration): |
|||
""" |
|||
Initialize the ModuleFactory. |
|||
|
|||
Args: |
|||
config: The Configuration object containing provider settings. |
|||
""" |
|||
self.config = config |
|||
|
|||
def _create_module_instance(self, feature: FeatureType, module_name: str): |
|||
""" |
|||
Create an instance of a module based on the feature and module name. |
|||
|
|||
Args: |
|||
feature: The feature type (e.g., 'llm', 'embedding'). |
|||
module_name: The module name to import from. |
|||
|
|||
Returns: |
|||
An instance of the specified module. |
|||
""" |
|||
# e.g. |
|||
# feature = "file_loader" |
|||
# module_name = "deepsearcher.loader.file_loader" |
|||
class_name = self.config.provide_settings[feature]["provider"] |
|||
module = __import__(module_name, fromlist=[class_name]) |
|||
class_ = getattr(module, class_name) |
|||
return class_(**self.config.provide_settings[feature]["config"]) |
|||
|
|||
def create_llm(self) -> BaseLLM: |
|||
""" |
|||
Create an instance of a language model. |
|||
|
|||
Returns: |
|||
An instance of a BaseLLM implementation. |
|||
""" |
|||
return self._create_module_instance("llm", "deepsearcher.llm") |
|||
|
|||
def create_embedding(self) -> BaseEmbedding: |
|||
""" |
|||
Create an instance of an embedding model. |
|||
|
|||
Returns: |
|||
An instance of a BaseEmbedding implementation. |
|||
""" |
|||
return self._create_module_instance("embedding", "deepsearcher.embedding") |
|||
|
|||
def create_file_loader(self) -> BaseLoader: |
|||
""" |
|||
Create an instance of a file loader. |
|||
|
|||
Returns: |
|||
An instance of a BaseLoader implementation. |
|||
""" |
|||
return self._create_module_instance("file_loader", "deepsearcher.loader.file_loader") |
|||
|
|||
def create_web_crawler(self) -> BaseCrawler: |
|||
""" |
|||
Create an instance of a web crawler. |
|||
|
|||
Returns: |
|||
An instance of a BaseCrawler implementation. |
|||
""" |
|||
return self._create_module_instance("web_crawler", "deepsearcher.loader.web_crawler") |
|||
|
|||
def create_vector_db(self) -> BaseVectorDB: |
|||
""" |
|||
Create an instance of a vector database. |
|||
|
|||
Returns: |
|||
An instance of a BaseVectorDB implementation. |
|||
""" |
|||
return self._create_module_instance("vector_db", "deepsearcher.vector_db") |
|||
|
|||
|
|||
config = Configuration() |
|||
|
|||
module_factory: ModuleFactory = None |
|||
llm: BaseLLM = None |
|||
embedding_model: BaseEmbedding = None |
|||
file_loader: BaseLoader = None |
|||
vector_db: BaseVectorDB = None |
|||
web_crawler: BaseCrawler = None |
|||
default_searcher: RAGRouter = None |
|||
naive_rag: NaiveRAG = None |
|||
|
|||
|
|||
def init_config(config: Configuration): |
|||
""" |
|||
Initialize the global configuration and create instances of all required modules. |
|||
|
|||
This function initializes the global variables for the LLM, embedding model, |
|||
file loader, web crawler, vector database, and RAG agents. |
|||
|
|||
Args: |
|||
config: The Configuration object to use for initialization. |
|||
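Example (illustrative; mirrors the CLI entry point):

    config = Configuration()
    init_config(config=config)
    answer, retrieved_results, token_usage = default_searcher.query("your question here")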
""" |
|||
global \ |
|||
module_factory, \ |
|||
llm, \ |
|||
embedding_model, \ |
|||
file_loader, \ |
|||
vector_db, \ |
|||
web_crawler, \ |
|||
default_searcher, \ |
|||
naive_rag |
|||
module_factory = ModuleFactory(config) |
|||
llm = module_factory.create_llm() |
|||
embedding_model = module_factory.create_embedding() |
|||
file_loader = module_factory.create_file_loader() |
|||
web_crawler = module_factory.create_web_crawler() |
|||
vector_db = module_factory.create_vector_db() |
|||
|
|||
default_searcher = RAGRouter( |
|||
llm=llm, |
|||
rag_agents=[ |
|||
DeepSearch( |
|||
llm=llm, |
|||
embedding_model=embedding_model, |
|||
vector_db=vector_db, |
|||
max_iter=config.query_settings["max_iter"], |
|||
route_collection=True, |
|||
text_window_splitter=True, |
|||
), |
|||
ChainOfRAG( |
|||
llm=llm, |
|||
embedding_model=embedding_model, |
|||
vector_db=vector_db, |
|||
max_iter=config.query_settings["max_iter"], |
|||
route_collection=True, |
|||
text_window_splitter=True, |
|||
), |
|||
], |
|||
) |
|||
naive_rag = NaiveRAG( |
|||
llm=llm, |
|||
embedding_model=embedding_model, |
|||
vector_db=vector_db, |
|||
top_k=10, |
|||
route_collection=True, |
|||
text_window_splitter=True, |
|||
) |
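A minimal sketch of how this wiring is typically driven (illustrative only, assuming an OPENAI_API_KEY is available; mutating provide_settings directly and the provider/model values below are assumptions, not a prescribed API):

# Illustrative only: configure providers, then build the shared module instances.
from deepsearcher.configuration import Configuration, init_config

cfg = Configuration()
# provide_settings maps a feature ("llm", "embedding", ...) to a provider class name and its kwargs.
cfg.provide_settings["llm"]["provider"] = "OpenAILLM"
cfg.provide_settings["llm"]["config"] = {"model": "o1-mini"}

init_config(cfg)  # populates the module-level llm, embedding_model, default_searcher, naive_rag, ...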
@ -0,0 +1,5 @@ |
|||
from .openai_embedding import OpenAIEmbedding |
|||
|
|||
__all__ = [ |
|||
"OpenAIEmbedding", |
|||
] |
@ -0,0 +1,76 @@ |
|||
from typing import List |
|||
|
|||
from tqdm import tqdm |
|||
|
|||
from deepsearcher.loader.splitter import Chunk |
|||
|
|||
|
|||
class BaseEmbedding: |
|||
""" |
|||
Abstract base class for embedding model implementations. |
|||
|
|||
This class defines the interface for embedding model implementations, |
|||
including methods for embedding queries and documents, and a property |
|||
for the dimensionality of the embeddings. |
|||
""" |
|||
|
|||
def embed_query(self, text: str) -> List[float]: |
|||
""" |
|||
Embed a single query text. |
|||
|
|||
Args: |
|||
text: The query text to embed. |
|||
|
|||
Returns: |
|||
A list of floats representing the embedding vector. |
|||
""" |
|||
pass |
|||
|
|||
def embed_documents(self, texts: List[str]) -> List[List[float]]: |
|||
""" |
|||
Embed a list of document texts. |
|||
|
|||
This default implementation calls embed_query for each text, |
|||
but implementations may override this with a more efficient batch method. |
|||
|
|||
Args: |
|||
texts: A list of document texts to embed. |
|||
|
|||
Returns: |
|||
A list of embedding vectors, one for each input text. |
|||
""" |
|||
return [self.embed_query(text) for text in texts] |
|||
|
|||
def embed_chunks(self, chunks: List[Chunk], batch_size: int = 256) -> List[Chunk]: |
|||
""" |
|||
Embed a list of Chunk objects. |
|||
|
|||
This method extracts the text from each chunk, embeds it in batches, |
|||
and updates the chunks with their embeddings. |
|||
|
|||
Args: |
|||
chunks: A list of Chunk objects to embed. |
|||
batch_size: The number of chunks to process in each batch. |
|||
|
|||
Returns: |
|||
The input list of Chunk objects, updated with embeddings. |
|||
""" |
|||
texts = [chunk.text for chunk in chunks] |
|||
batch_texts = [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)] |
|||
embeddings = [] |
|||
for batch_text in tqdm(batch_texts, desc="Embedding chunks"): |
|||
batch_embeddings = self.embed_documents(batch_text) |
|||
embeddings.extend(batch_embeddings) |
|||
for chunk, embedding in zip(chunks, embeddings): |
|||
chunk.embedding = embedding |
|||
return chunks |
|||
|
|||
@property |
|||
def dimension(self) -> int: |
|||
""" |
|||
Get the dimensionality of the embeddings. |
|||
|
|||
Returns: |
|||
The number of dimensions in the embedding vectors. |
|||
""" |
|||
pass |
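A toy subclass sketch to illustrate the interface above; the hash-based vectors are meaningless as embeddings and exist only to show which members a concrete implementation must provide:

import hashlib
from typing import List

from deepsearcher.embedding.base import BaseEmbedding


class ToyHashEmbedding(BaseEmbedding):
    """Illustrative-only embedding that maps text to a fixed-size pseudo-vector."""

    def embed_query(self, text: str) -> List[float]:
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        # 32 bytes -> 32 floats in [0, 1); embed_documents and embed_chunks are inherited from the base class.
        return [b / 255.0 for b in digest]

    @property
    def dimension(self) -> int:
        return 32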
@ -0,0 +1,103 @@ |
|||
import os |
|||
from typing import List |
|||
|
|||
from openai import OpenAI |
|||
from openai._types import NOT_GIVEN |
|||
|
|||
from deepsearcher.embedding.base import BaseEmbedding |
|||
|
|||
|
|||
class OpenAIEmbedding(BaseEmbedding): |
|||
""" |
|||
OpenAI embedding model implementation. |
|||
|
|||
This class provides an interface to the OpenAI embedding API, which offers |
|||
various embedding models for text processing. |
|||
|
|||
For more information, see: |
|||
https://platform.openai.com/docs/guides/embeddings/use-cases |
|||
""" |
|||
|
|||
def __init__(self, model: str, **kwargs): |
|||
""" |
|||
Initialize the OpenAI embedding model. |
|||
|
|||
Args: |
|||
model (str): The model identifier to use for embeddings. |
|||
**kwargs: Additional keyword arguments. |
|||
                - api_key (str): The API key. Falls back to the OPENAI_API_KEY environment variable if not provided.
                - base_url (str): The base URL. Falls back to the OPENAI_BASE_URL environment variable if not provided.
                - model_name (str): Alternative way to specify the model.
                - dimension (int): The dimension of the embedding vectors.
                - dim_change (bool): Whether the model supports changing the dimension of the generated embeddings. Defaults to False.
|||
|
|||
""" |
|||
# Extract standard parameters (keep original behavior) |
|||
if "api_key" in kwargs: |
|||
api_key = kwargs.pop("api_key") |
|||
|
|||
if "base_url" in kwargs: |
|||
base_url = kwargs.pop("base_url") |
|||
else: |
|||
base_url = os.getenv("OPENAI_BASE_URL") |
|||
|
|||
if "model_name" in kwargs: |
|||
model = kwargs.pop("model_name") |
|||
|
|||
if "dimension" in kwargs: |
|||
dimension = kwargs.pop("dimension") |
|||
else: |
|||
dimension = NOT_GIVEN |
|||
|
|||
if "dim_change" in kwargs: |
|||
dim_change = kwargs.pop("dim_change") |
|||
|
|||
self.dim = dimension |
|||
self.dim_change = dim_change |
|||
self.model = model |
|||
|
|||
self.client = OpenAI(api_key=api_key, base_url=base_url, **kwargs) |
|||
|
|||
def embed_query(self, text: str) -> List[float]: |
|||
""" |
|||
Embed a single query text. |
|||
|
|||
Args: |
|||
text (str): The query text to embed. |
|||
|
|||
Returns: |
|||
List[float]: A list of floats representing the embedding vector. |
|||
""" |
|||
|
|||
response = self.client.embeddings.create( |
|||
            input=[text],
            model=self.model,
            dimensions=self.dim if self.dim_change else NOT_GIVEN,
|||
) |
|||
|
|||
return response.data[0].embedding |
|||
|
|||
def embed_documents(self, texts: List[str]) -> List[List[float]]: |
|||
""" |
|||
Embed a list of document texts. |
|||
|
|||
Args: |
|||
texts (List[str]): A list of document texts to embed. |
|||
|
|||
Returns: |
|||
List[List[float]]: A list of embedding vectors, one for each input text. |
|||
""" |
|||
|
|||
response = self.client.embeddings.create( |
|||
            input=texts,
            model=self.model,
            dimensions=self.dim if self.dim_change else NOT_GIVEN,
|||
) |
|||
|
|||
return [r.embedding for r in response.data] |
|||
|
|||
@property |
|||
def dimension(self) -> int: |
|||
""" |
|||
Get the dimensionality of the embeddings for the current model. |
|||
|
|||
Returns: |
|||
int: The number of dimensions in the embedding vectors. |
|||
""" |
|||
return self.dim |
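A hedged usage sketch of the class above; the model name and dimension are assumptions, and OPENAI_API_KEY must be set in the environment:

# Illustrative only; "text-embedding-3-small" and dimension=512 are assumed values.
from deepsearcher.embedding import OpenAIEmbedding

embedder = OpenAIEmbedding(model="text-embedding-3-small", dimension=512, dim_change=True)
vector = embedder.embed_query("What is a vector database?")
print(len(vector), embedder.dimension)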
@ -0,0 +1,5 @@ |
|||
from .openai_llm import OpenAILLM |
|||
|
|||
__all__ = [ |
|||
"OpenAILLM", |
|||
] |
@ -0,0 +1,120 @@ |
|||
import ast |
|||
import re |
|||
from abc import ABC |
|||
from typing import Dict, List |
|||
|
|||
|
|||
class ChatResponse(ABC): |
|||
""" |
|||
Represents a response from a chat model. |
|||
|
|||
This class encapsulates the content of a response from a chat model |
|||
along with information about token usage. |
|||
|
|||
Attributes: |
|||
content: The text content of the response. |
|||
total_tokens: The total number of tokens used in the request and response. |
|||
""" |
|||
|
|||
def __init__(self, content: str, total_tokens: int) -> None: |
|||
""" |
|||
Initialize a ChatResponse object. |
|||
|
|||
Args: |
|||
content: The text content of the response. |
|||
total_tokens: The total number of tokens used in the request and response. |
|||
""" |
|||
self.content = content |
|||
self.total_tokens = total_tokens |
|||
|
|||
def __repr__(self) -> str: |
|||
""" |
|||
Return a string representation of the ChatResponse. |
|||
|
|||
Returns: |
|||
A string representation of the ChatResponse object. |
|||
""" |
|||
return f"ChatResponse(content={self.content}, total_tokens={self.total_tokens})" |
|||
|
|||
|
|||
class BaseLLM(ABC): |
|||
""" |
|||
Abstract base class for language model implementations. |
|||
|
|||
This class defines the interface for language model implementations, |
|||
including methods for chat-based interactions and parsing responses. |
|||
""" |
|||
|
|||
def __init__(self): |
|||
""" |
|||
Initialize a BaseLLM object. |
|||
""" |
|||
pass |
|||
|
|||
def chat(self, messages: List[Dict]) -> ChatResponse: |
|||
""" |
|||
Send a chat message to the language model and get a response. |
|||
|
|||
Args: |
|||
messages: A list of message dictionaries, typically in the format |
|||
[{"role": "system", "content": "..."}, {"role": "user", "content": "..."}] |
|||
|
|||
Returns: |
|||
A ChatResponse object containing the model's response. |
|||
""" |
|||
pass |
|||
|
|||
@staticmethod |
|||
def literal_eval(response_content: str): |
|||
""" |
|||
Parse a string response into a Python object using ast.literal_eval. |
|||
|
|||
This method attempts to extract and parse JSON or Python literals from the response content, |
|||
handling various formats like code blocks and special tags. |
|||
|
|||
Args: |
|||
response_content: The string content to parse. |
|||
|
|||
Returns: |
|||
The parsed Python object. |
|||
|
|||
Raises: |
|||
ValueError: If the response content cannot be parsed. |
|||
""" |
|||
response_content = response_content.strip() |
|||
|
|||
response_content = BaseLLM.remove_think(response_content) |
|||
|
|||
try: |
|||
if response_content.startswith("```") and response_content.endswith("```"): |
|||
if response_content.startswith("```python"): |
|||
response_content = response_content[9:-3] |
|||
elif response_content.startswith("```json"): |
|||
response_content = response_content[7:-3] |
|||
elif response_content.startswith("```str"): |
|||
response_content = response_content[6:-3] |
|||
elif response_content.startswith("```\n"): |
|||
response_content = response_content[4:-3] |
|||
else: |
|||
raise ValueError("Invalid code block format") |
|||
result = ast.literal_eval(response_content.strip()) |
|||
except Exception: |
|||
matches = re.findall(r"(\[.*?\]|\{.*?\})", response_content, re.DOTALL) |
|||
|
|||
if len(matches) != 1: |
|||
raise ValueError( |
|||
f"Invalid JSON/List format for response content:\n{response_content}" |
|||
) |
|||
|
|||
json_part = matches[0] |
|||
return ast.literal_eval(json_part) |
|||
|
|||
return result |
|||
|
|||
@staticmethod |
|||
def remove_think(response_content: str) -> str: |
|||
        # Strip everything up to and including the closing </think> tag, which reasoning models emit before the final answer
|||
if "<think>" in response_content and "</think>" in response_content: |
|||
end_of_think = response_content.find("</think>") + len("</think>") |
|||
response_content = response_content[end_of_think:] |
|||
return response_content.strip() |
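A short sketch of how the two static helpers above behave on typical model output; the strings are illustrative:

# Illustrative only: reasoning-model output with a <think> block and a fenced JSON list.
from deepsearcher.llm.base import BaseLLM

raw = '<think>internal reasoning...</think>```json\n["milvus_docs", "faq"]\n```'
print(BaseLLM.remove_think(raw))   # the fenced JSON block, with the <think> section stripped
print(BaseLLM.literal_eval(raw))   # ['milvus_docs', 'faq'] as a Python list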
@ -0,0 +1,61 @@ |
|||
import os |
|||
from typing import Dict, List |
|||
|
|||
from deepsearcher.llm.base import BaseLLM, ChatResponse |
|||
|
|||
|
|||
class OpenAILLM(BaseLLM): |
|||
""" |
|||
OpenAI language model implementation. |
|||
|
|||
This class provides an interface to interact with OpenAI's language models |
|||
through their API. |
|||
|
|||
Attributes: |
|||
model (str): The OpenAI model identifier to use. |
|||
client: The OpenAI client instance. |
|||
""" |
|||
|
|||
def __init__(self, model: str = "o1-mini", **kwargs): |
|||
""" |
|||
Initialize an OpenAI language model client. |
|||
|
|||
Args: |
|||
model (str, optional): The model identifier to use. Defaults to "o1-mini". |
|||
**kwargs: Additional keyword arguments to pass to the OpenAI client. |
|||
- api_key: OpenAI API key. If not provided, uses OPENAI_API_KEY environment variable. |
|||
- base_url: OpenAI API base URL. If not provided, uses OPENAI_BASE_URL environment variable. |
|||
""" |
|||
from openai import OpenAI |
|||
|
|||
self.model = model |
|||
if "api_key" in kwargs: |
|||
api_key = kwargs.pop("api_key") |
|||
else: |
|||
api_key = os.getenv("OPENAI_API_KEY") |
|||
if "base_url" in kwargs: |
|||
base_url = kwargs.pop("base_url") |
|||
else: |
|||
base_url = os.getenv("OPENAI_BASE_URL") |
|||
self.client = OpenAI(api_key=api_key, base_url=base_url, **kwargs) |
|||
|
|||
def chat(self, messages: List[Dict]) -> ChatResponse: |
|||
""" |
|||
Send a chat message to the OpenAI model and get a response. |
|||
|
|||
Args: |
|||
messages (List[Dict]): A list of message dictionaries, typically in the format |
|||
[{"role": "system", "content": "..."}, |
|||
{"role": "user", "content": "..."}] |
|||
|
|||
Returns: |
|||
ChatResponse: An object containing the model's response and token usage information. |
|||
""" |
|||
completion = self.client.chat.completions.create( |
|||
model=self.model, |
|||
messages=messages, |
|||
) |
|||
return ChatResponse( |
|||
content=completion.choices[0].message.content, |
|||
total_tokens=completion.usage.total_tokens, |
|||
) |
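A hedged usage sketch; requires OPENAI_API_KEY (and optionally OPENAI_BASE_URL) in the environment, and the prompt is illustrative:

# Illustrative only.
from deepsearcher.llm import OpenAILLM

llm = OpenAILLM(model="o1-mini")
response = llm.chat([{"role": "user", "content": "Explain RAG in one sentence."}])
print(response.content, response.total_tokens)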
@ -0,0 +1,7 @@ |
|||
from deepsearcher.loader.file_loader.docling_loader import DoclingLoader |
|||
from deepsearcher.loader.file_loader.json_loader import JsonFileLoader |
|||
from deepsearcher.loader.file_loader.pdf_loader import PDFLoader |
|||
from deepsearcher.loader.file_loader.text_loader import TextLoader |
|||
from deepsearcher.loader.file_loader.unstructured_loader import UnstructuredLoader |
|||
|
|||
__all__ = ["PDFLoader", "TextLoader", "UnstructuredLoader", "JsonFileLoader", "DoclingLoader"] |
@ -0,0 +1,70 @@ |
|||
import os |
|||
from abc import ABC |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
|
|||
class BaseLoader(ABC): |
|||
""" |
|||
Abstract base class for file loaders. |
|||
|
|||
This class defines the interface for loading documents from files and directories. |
|||
All specific file loaders should inherit from this class and implement the required methods. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize the loader with optional keyword arguments. |
|||
|
|||
Args: |
|||
**kwargs: Optional keyword arguments for specific loader implementations. |
|||
""" |
|||
pass |
|||
|
|||
def load_file(self, file_path: str) -> List[Document]: |
|||
""" |
|||
Load a single file and convert it to Document objects. |
|||
|
|||
Args: |
|||
file_path: Path to the file to be loaded. |
|||
|
|||
Returns: |
|||
A list of Document objects containing the text and metadata. |
|||
|
|||
Note: |
|||
Return a list of Document objects which contain the text and metadata. |
|||
In the metadata, it's recommended to include the reference to the file. |
|||
e.g. return [Document(page_content=..., metadata={"reference": file_path})] |
|||
""" |
|||
pass |
|||
|
|||
def load_directory(self, directory: str) -> List[Document]: |
|||
""" |
|||
Load all supported files from a directory and its subdirectories recursively. |
|||
|
|||
Args: |
|||
directory: Path to the directory containing files to be loaded. |
|||
|
|||
Returns: |
|||
A list of Document objects from all supported files in the directory and subdirectories. |
|||
""" |
|||
documents = [] |
|||
for root, _, files in os.walk(directory): |
|||
for file in files: |
|||
for suffix in self.supported_file_types: |
|||
if file.endswith(suffix): |
|||
full_path = os.path.join(root, file) |
|||
documents.extend(self.load_file(full_path)) |
|||
break |
|||
return documents |
|||
|
|||
@property |
|||
def supported_file_types(self) -> List[str]: |
|||
""" |
|||
Get the list of file extensions supported by this loader. |
|||
|
|||
Returns: |
|||
A list of supported file extensions (without the dot). |
|||
""" |
|||
pass |
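A minimal illustrative subclass of BaseLoader, showing that only load_file and supported_file_types need to be provided; the ".log" extension is an arbitrary example:

from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.file_loader.base import BaseLoader


class LogFileLoader(BaseLoader):
    """Illustrative-only loader for plain .log files."""

    def load_file(self, file_path: str) -> List[Document]:
        with open(file_path, "r", encoding="utf-8") as f:
            return [Document(page_content=f.read(), metadata={"reference": file_path})]

    @property
    def supported_file_types(self) -> List[str]:
        # load_directory (inherited from BaseLoader) filters files by these suffixes.
        return ["log"]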
@ -0,0 +1,117 @@ |
|||
import os |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.file_loader.base import BaseLoader |
|||
from deepsearcher.utils import log |
|||
|
|||
|
|||
class DoclingLoader(BaseLoader): |
|||
""" |
|||
Loader that utilizes Docling's DocumentConverter and HierarchicalChunker |
|||
to convert and chunk files (e.g. Markdown or HTML) into Document objects. |
|||
""" |
|||
|
|||
def __init__(self): |
|||
""" |
|||
Initialize the DoclingLoader with DocumentConverter and HierarchicalChunker instances. |
|||
""" |
|||
from docling.document_converter import DocumentConverter |
|||
from docling_core.transforms.chunker import HierarchicalChunker |
|||
|
|||
self.converter = DocumentConverter() |
|||
self.chunker = HierarchicalChunker() |
|||
|
|||
def load_file(self, file_path: str) -> List[Document]: |
|||
""" |
|||
Load a local file (or URL) using docling's conversion and perform hierarchical chunking. |
|||
|
|||
Args: |
|||
file_path: Path or URL of the file to be loaded. |
|||
|
|||
Returns: |
|||
A list of Document objects, each representing a chunk. |
|||
|
|||
Raises: |
|||
FileNotFoundError: If the file does not exist. |
|||
ValueError: If the file type is not supported. |
|||
IOError: If there is an error reading the file. |
|||
""" |
|||
if not os.path.exists(file_path): |
|||
raise FileNotFoundError(f"Error: File '{file_path}' does not exist.") |
|||
|
|||
# Check if the file has a supported extension |
|||
file_extension = os.path.splitext(file_path)[1].lower().lstrip(".") |
|||
if file_extension not in self.supported_file_types: |
|||
supported_formats = ", ".join(self.supported_file_types) |
|||
raise ValueError( |
|||
f"Unsupported file type: '{file_extension}'. " |
|||
f"Supported file types are: {supported_formats}" |
|||
) |
|||
|
|||
try: |
|||
conversion_result = self.converter.convert(file_path) |
|||
docling_document = conversion_result.document |
|||
|
|||
chunks = list(self.chunker.chunk(docling_document)) |
|||
|
|||
documents = [] |
|||
for chunk in chunks: |
|||
metadata = {"reference": file_path, "text": chunk.text} |
|||
documents.append(Document(page_content=chunk.text, metadata=metadata)) |
|||
return documents |
|||
except Exception as e: |
|||
log.color_print(f"Error processing file {file_path}: {str(e)}") |
|||
raise IOError(f"Failed to process file {file_path}: {str(e)}") |
|||
|
|||
def load_directory(self, directory: str) -> List[Document]: |
|||
""" |
|||
Load all supported files from a directory. |
|||
|
|||
Args: |
|||
directory: Path to the directory containing files to be loaded. |
|||
|
|||
Returns: |
|||
A list of Document objects from all supported files in the directory. |
|||
|
|||
Raises: |
|||
NotADirectoryError: If the specified path is not a directory. |
|||
""" |
|||
if not os.path.isdir(directory): |
|||
raise NotADirectoryError(f"Error: '{directory}' is not a directory.") |
|||
|
|||
return super().load_directory(directory) |
|||
|
|||
@property |
|||
def supported_file_types(self) -> List[str]: |
|||
""" |
|||
Return the list of file extensions supported by this loader. |
|||
|
|||
Supported formats (refer to the official website: https://docling-project.github.io/docling/usage/supported_formats/): |
|||
- PDF |
|||
- Office formats: DOCX, XLSX, PPTX |
|||
- Markdown |
|||
- AsciiDoc |
|||
- HTML, XHTML |
|||
- CSV |
|||
- Images: PNG, JPEG, TIFF, BMP |
|||
""" |
|||
return [ |
|||
"pdf", |
|||
"docx", |
|||
"xlsx", |
|||
"pptx", |
|||
"md", |
|||
"adoc", |
|||
"asciidoc", |
|||
"html", |
|||
"xhtml", |
|||
"csv", |
|||
"png", |
|||
"jpg", |
|||
"jpeg", |
|||
"tif", |
|||
"tiff", |
|||
"bmp", |
|||
] |
@ -0,0 +1,94 @@ |
|||
import json |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.file_loader.base import BaseLoader |
|||
|
|||
|
|||
class JsonFileLoader(BaseLoader): |
|||
""" |
|||
Loader for JSON and JSONL files. |
|||
|
|||
This loader handles JSON and JSONL files, extracting text content from a specified key |
|||
and converting each entry into Document objects for further processing. |
|||
""" |
|||
|
|||
def __init__(self, text_key: str): |
|||
""" |
|||
Initialize the JsonFileLoader. |
|||
|
|||
Args: |
|||
text_key: The key in the JSON data that contains the text content to be extracted. |
|||
""" |
|||
self.text_key = text_key |
|||
|
|||
def load_file(self, file_path: str) -> List[Document]: |
|||
""" |
|||
Load a JSON or JSONL file and convert it to Document objects. |
|||
|
|||
Args: |
|||
file_path: Path to the JSON or JSONL file to be loaded. |
|||
|
|||
Returns: |
|||
A list of Document objects, one for each entry in the JSON/JSONL file. |
|||
""" |
|||
if file_path.endswith(".jsonl"): |
|||
data_list: list[dict] = self._read_jsonl_file(file_path) |
|||
else: |
|||
data_list: list[dict] = self._read_json_file(file_path) |
|||
documents = [] |
|||
for data_dict in data_list: |
|||
page_content = data_dict.pop(self.text_key) |
|||
data_dict.update({"reference": file_path}) |
|||
document = Document(page_content=page_content, metadata=data_dict) |
|||
documents.append(document) |
|||
return documents |
|||
|
|||
def _read_json_file(self, file_path: str) -> list[dict]: |
|||
""" |
|||
Read and parse a JSON file. |
|||
|
|||
Args: |
|||
file_path: Path to the JSON file. |
|||
|
|||
Returns: |
|||
A list of dictionaries parsed from the JSON file. |
|||
|
|||
Raises: |
|||
ValueError: If the JSON file does not contain a list of dictionaries. |
|||
""" |
|||
        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)
|||
if not isinstance(json_data, list): |
|||
raise ValueError("JSON file must contain a list of dictionaries.") |
|||
return json_data |
|||
|
|||
def _read_jsonl_file(self, file_path: str) -> List[dict]: |
|||
""" |
|||
Read and parse a JSONL file (JSON Lines format). |
|||
|
|||
Args: |
|||
file_path: Path to the JSONL file. |
|||
|
|||
Returns: |
|||
A list of dictionaries parsed from the JSONL file. |
|||
""" |
|||
data_list = [] |
|||
with open(file_path, "r", encoding="utf-8") as file: |
|||
for line in file: |
|||
try: |
|||
json_data = json.loads(line) |
|||
data_list.append(json_data) |
|||
except json.JSONDecodeError: |
|||
print(f"Failed to decode line: {line}") |
|||
return data_list |
|||
|
|||
@property |
|||
def supported_file_types(self) -> List[str]: |
|||
""" |
|||
Get the list of file extensions supported by this loader. |
|||
|
|||
Returns: |
|||
            A list of supported file extensions: ["json", "jsonl"].
        """
        return ["json", "jsonl"]
@ -0,0 +1,54 @@ |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.file_loader.base import BaseLoader |
|||
|
|||
|
|||
class PDFLoader(BaseLoader): |
|||
""" |
|||
Loader for PDF files. |
|||
|
|||
This loader handles PDF files and also supports text files with extensions like .txt and .md, |
|||
converting them into Document objects for further processing. |
|||
""" |
|||
|
|||
def __init__(self): |
|||
""" |
|||
Initialize the PDFLoader. |
|||
""" |
|||
pass |
|||
|
|||
def load_file(self, file_path: str) -> List[Document]: |
|||
""" |
|||
Load a PDF file and convert it to a Document object. |
|||
|
|||
Args: |
|||
file_path: Path to the PDF file to be loaded. |
|||
|
|||
Returns: |
|||
A list containing a single Document object with the file content and reference. |
|||
|
|||
Note: |
|||
This loader also supports .txt and .md files for convenience. |
|||
""" |
|||
import pdfplumber |
|||
|
|||
if file_path.endswith(".pdf"): |
|||
with pdfplumber.open(file_path) as file: |
|||
page_content = "\n\n".join([page.extract_text() for page in file.pages]) |
|||
return [Document(page_content=page_content, metadata={"reference": file_path})] |
|||
elif file_path.endswith(".txt") or file_path.endswith(".md"): |
|||
with open(file_path, "r", encoding="utf-8") as file: |
|||
page_content = file.read() |
|||
return [Document(page_content=page_content, metadata={"reference": file_path})] |
|||
|
|||
@property |
|||
def supported_file_types(self) -> List[str]: |
|||
""" |
|||
Get the list of file extensions supported by this loader. |
|||
|
|||
Returns: |
|||
A list of supported file extensions: ["pdf", "md", "txt"]. |
|||
""" |
|||
return ["pdf", "md", "txt"] |
@ -0,0 +1,43 @@ |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.file_loader.base import BaseLoader |
|||
|
|||
|
|||
class TextLoader(BaseLoader): |
|||
""" |
|||
Loader for plain text files. |
|||
|
|||
This loader handles text files with extensions like .txt and .md, |
|||
converting them into Document objects for further processing. |
|||
""" |
|||
|
|||
def __init__(self): |
|||
""" |
|||
Initialize the TextLoader. |
|||
""" |
|||
pass |
|||
|
|||
def load_file(self, file_path: str) -> List[Document]: |
|||
""" |
|||
Load a text file and convert it to a Document object. |
|||
|
|||
Args: |
|||
file_path: Path to the text file to be loaded. |
|||
|
|||
Returns: |
|||
A list containing a single Document object with the file content and reference. |
|||
""" |
|||
with open(file_path, "r", encoding="utf-8") as f: |
|||
return [Document(page_content=f.read(), metadata={"reference": file_path})] |
|||
|
|||
@property |
|||
def supported_file_types(self) -> List[str]: |
|||
""" |
|||
Get the list of file extensions supported by this loader. |
|||
|
|||
Returns: |
|||
A list of supported file extensions: ["txt", "md"]. |
|||
""" |
|||
return ["txt", "md"] |
@ -0,0 +1,201 @@ |
|||
import os |
|||
import shutil |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.file_loader.base import BaseLoader |
|||
from deepsearcher.utils import log |
|||
|
|||
|
|||
class UnstructuredLoader(BaseLoader): |
|||
""" |
|||
Loader for unstructured documents using the unstructured-io library. |
|||
|
|||
This loader processes various document formats using the unstructured-io library's |
|||
processing pipeline, extracting text and metadata from complex document formats. |
|||
""" |
|||
|
|||
def __init__(self): |
|||
""" |
|||
Initialize the UnstructuredLoader. |
|||
|
|||
Creates a temporary directory for processed outputs and cleans up any existing ones. |
|||
""" |
|||
self.directory_with_results = "./pdf_processed_outputs" |
|||
if os.path.exists(self.directory_with_results): |
|||
shutil.rmtree(self.directory_with_results) |
|||
os.makedirs(self.directory_with_results) |
|||
|
|||
def load_pipeline(self, input_path: str) -> List[Document]: |
|||
""" |
|||
Process documents using the unstructured-io pipeline. |
|||
|
|||
Args: |
|||
input_path: Path to the file or directory to be processed. |
|||
|
|||
Returns: |
|||
A list of Document objects extracted from the processed files. |
|||
|
|||
Note: |
|||
If UNSTRUCTURED_API_KEY and UNSTRUCTURED_API_URL environment variables are set, |
|||
the API-based partitioning will be used. Otherwise, local partitioning will be used. |
|||
""" |
|||
from unstructured_ingest.interfaces import ProcessorConfig |
|||
from unstructured_ingest.pipeline.pipeline import Pipeline |
|||
from unstructured_ingest.processes.connectors.local import ( |
|||
LocalConnectionConfig, |
|||
LocalDownloaderConfig, |
|||
LocalIndexerConfig, |
|||
LocalUploaderConfig, |
|||
) |
|||
from unstructured_ingest.processes.partitioner import PartitionerConfig |
|||
|
|||
# Check if API environment variables are set |
|||
api_key = os.getenv("UNSTRUCTURED_API_KEY") |
|||
api_url = os.getenv("UNSTRUCTURED_API_URL") |
|||
use_api = api_key is not None and api_url is not None |
|||
|
|||
if use_api: |
|||
log.color_print("Using Unstructured API for document processing") |
|||
else: |
|||
log.color_print( |
|||
"Using local processing for documents (UNSTRUCTURED_API_KEY or UNSTRUCTURED_API_URL not set)" |
|||
) |
|||
|
|||
Pipeline.from_configs( |
|||
context=ProcessorConfig(), |
|||
indexer_config=LocalIndexerConfig(input_path=input_path), |
|||
downloader_config=LocalDownloaderConfig(), |
|||
source_connection_config=LocalConnectionConfig(), |
|||
partitioner_config=PartitionerConfig( |
|||
partition_by_api=use_api, |
|||
api_key=api_key, |
|||
partition_endpoint=api_url, |
|||
strategy="hi_res", |
|||
), |
|||
uploader_config=LocalUploaderConfig(output_dir=self.directory_with_results), |
|||
).run() |
|||
|
|||
from unstructured.staging.base import elements_from_json |
|||
|
|||
elements = [] |
|||
for filename in os.listdir(self.directory_with_results): |
|||
if filename.endswith(".json"): |
|||
file_path = os.path.join(self.directory_with_results, filename) |
|||
try: |
|||
elements.extend(elements_from_json(filename=file_path)) |
|||
except IOError: |
|||
log.color_print(f"Error: Could not read file {filename}.") |
|||
|
|||
documents = [] |
|||
for element in elements: |
|||
metadata = element.metadata.to_dict() |
|||
metadata["reference"] = input_path # TODO test it |
|||
documents.append( |
|||
Document( |
|||
page_content=element.text, |
|||
metadata=metadata, |
|||
) |
|||
) |
|||
return documents |
|||
|
|||
def load_file(self, file_path: str) -> List[Document]: |
|||
""" |
|||
Load a single file using the unstructured-io pipeline. |
|||
|
|||
Args: |
|||
file_path: Path to the file to be processed. |
|||
|
|||
Returns: |
|||
A list of Document objects extracted from the processed file. |
|||
""" |
|||
return self.load_pipeline(file_path) |
|||
|
|||
def load_directory(self, directory: str) -> List[Document]: |
|||
""" |
|||
Load all supported files from a directory using the unstructured-io pipeline. |
|||
|
|||
Args: |
|||
directory: Path to the directory containing files to be processed. |
|||
|
|||
Returns: |
|||
A list of Document objects extracted from all processed files. |
|||
""" |
|||
return self.load_pipeline(directory) |
|||
|
|||
@property |
|||
def supported_file_types(self) -> List[str]: |
|||
""" |
|||
Get the list of file extensions supported by the unstructured-io library. Please refer to the Unstructured documentation for more details: https://docs.unstructured.io/ui/supported-file-types. |
|||
|
|||
Returns: |
|||
A comprehensive list of supported file extensions. |
|||
|
|||
Note: |
|||
The unstructured-io library supports a wide range of document formats |
|||
including office documents, images, emails, and more. |
|||
""" |
|||
return [ |
|||
"abw", |
|||
"bmp", |
|||
"csv", |
|||
"cwk", |
|||
"dbf", |
|||
"dif", |
|||
"doc", |
|||
"docm", |
|||
"docx", |
|||
"dot", |
|||
"dotm", |
|||
"eml", |
|||
"epub", |
|||
"et", |
|||
"eth", |
|||
"fods", |
|||
"gif", |
|||
"heic", |
|||
"htm", |
|||
"html", |
|||
"hwp", |
|||
"jpeg", |
|||
"jpg", |
|||
"md", |
|||
"mcw", |
|||
"mw", |
|||
"odt", |
|||
"org", |
|||
"p7s", |
|||
"pages", |
|||
"pbd", |
|||
"pdf", |
|||
"png", |
|||
"pot", |
|||
"potm", |
|||
"ppt", |
|||
"pptm", |
|||
"pptx", |
|||
"prn", |
|||
"rst", |
|||
"rtf", |
|||
"sdp", |
|||
"sgl", |
|||
"svg", |
|||
"sxg", |
|||
"tiff", |
|||
"txt", |
|||
"tsv", |
|||
"uof", |
|||
"uos1", |
|||
"uos2", |
|||
"web", |
|||
"webp", |
|||
"wk2", |
|||
"xls", |
|||
"xlsb", |
|||
"xlsm", |
|||
"xlsx", |
|||
"xlw", |
|||
"xml", |
|||
"zabw", |
|||
] |
@ -0,0 +1,105 @@ |
|||
## Sentence Window splitting strategy, ref: |
|||
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb |
|||
|
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|||
|
|||
|
|||
class Chunk: |
|||
""" |
|||
Represents a chunk of text with associated metadata and embedding. |
|||
|
|||
A chunk is a segment of text extracted from a document, along with its reference |
|||
information, metadata, and optional embedding vector. |
|||
|
|||
Attributes: |
|||
text: The text content of the chunk. |
|||
reference: A reference to the source of the chunk (e.g., file path, URL). |
|||
metadata: Additional metadata associated with the chunk. |
|||
embedding: The vector embedding of the chunk, if available. |
|||
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
text: str, |
|||
reference: str, |
|||
metadata: dict = None, |
|||
embedding: List[float] = None, |
|||
): |
|||
""" |
|||
Initialize a Chunk object. |
|||
|
|||
Args: |
|||
text: The text content of the chunk. |
|||
reference: A reference to the source of the chunk. |
|||
metadata: Additional metadata associated with the chunk. Defaults to an empty dict. |
|||
embedding: The vector embedding of the chunk. Defaults to None. |
|||
""" |
|||
self.text = text |
|||
self.reference = reference |
|||
self.metadata = metadata or {} |
|||
self.embedding = embedding or None |
|||
|
|||
|
|||
def _sentence_window_split( |
|||
split_docs: List[Document], original_document: Document, offset: int = 200 |
|||
) -> List[Chunk]: |
|||
""" |
|||
Create chunks with context windows from split documents. |
|||
|
|||
This function takes documents that have been split into smaller pieces and |
|||
adds context from the original document by including text before and after |
|||
each split piece, up to the specified offset. |
|||
|
|||
Args: |
|||
split_docs: List of documents that have been split. |
|||
original_document: The original document before splitting. |
|||
offset: Number of characters to include before and after each split piece. |
|||
|
|||
Returns: |
|||
A list of Chunk objects with context windows. |
|||
""" |
|||
chunks = [] |
|||
original_text = original_document.page_content |
|||
for doc in split_docs: |
|||
doc_text = doc.page_content |
|||
start_index = original_text.index(doc_text) |
|||
end_index = start_index + len(doc_text) - 1 |
|||
wider_text = original_text[ |
|||
max(0, start_index - offset) : min(len(original_text), end_index + offset) |
|||
] |
|||
reference = doc.metadata.pop("reference", "") |
|||
doc.metadata["wider_text"] = wider_text |
|||
chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata) |
|||
chunks.append(chunk) |
|||
return chunks |
|||
|
|||
|
|||
def split_docs_to_chunks( |
|||
    documents: List[Document], chunk_size: int = 1500, chunk_overlap: int = 100
|||
) -> List[Chunk]: |
|||
""" |
|||
Split documents into chunks with context windows. |
|||
|
|||
This function splits a list of documents into smaller chunks with overlapping text, |
|||
and adds context windows to each chunk by including text before and after the chunk. |
|||
|
|||
Args: |
|||
documents: List of documents to split. |
|||
chunk_size: Size of each chunk in characters. |
|||
chunk_overlap: Number of characters to overlap between chunks. |
|||
|
|||
Returns: |
|||
A list of Chunk objects with context windows. |
|||
""" |
|||
text_splitter = RecursiveCharacterTextSplitter( |
|||
chunk_size=chunk_size, chunk_overlap=chunk_overlap |
|||
) |
|||
all_chunks = [] |
|||
for doc in documents: |
|||
split_docs = text_splitter.split_documents([doc]) |
|||
split_chunks = _sentence_window_split(split_docs, doc, offset=300) |
|||
all_chunks.extend(split_chunks) |
|||
return all_chunks |
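A quick sketch of the splitting flow above; the document text is synthetic:

# Illustrative only.
from langchain_core.documents import Document

from deepsearcher.loader.splitter import split_docs_to_chunks

doc = Document(page_content="Milvus is a vector database. " * 200, metadata={"reference": "example.txt"})
chunks = split_docs_to_chunks([doc], chunk_size=500, chunk_overlap=50)
print(len(chunks), chunks[0].reference, len(chunks[0].metadata["wider_text"]))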
@ -0,0 +1,11 @@ |
|||
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler |
|||
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler |
|||
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler |
|||
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler |
|||
|
|||
__all__ = [ |
|||
"FireCrawlCrawler", |
|||
"JinaCrawler", |
|||
"Crawl4AICrawler", |
|||
"DoclingCrawler", |
|||
] |
@ -0,0 +1,55 @@ |
|||
from abc import ABC |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
|
|||
class BaseCrawler(ABC): |
|||
""" |
|||
Abstract base class for web crawlers. |
|||
|
|||
This class defines the interface for crawling web pages and converting them |
|||
into Document objects for further processing. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize the crawler with optional keyword arguments. |
|||
|
|||
Args: |
|||
**kwargs: Optional keyword arguments for specific crawler implementations. |
|||
""" |
|||
pass |
|||
|
|||
def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]: |
|||
""" |
|||
Crawl a single URL and convert it to Document objects. |
|||
|
|||
Args: |
|||
url: The URL to crawl. |
|||
**crawl_kwargs: Optional keyword arguments for the crawling process. |
|||
|
|||
Returns: |
|||
A list of Document objects containing the content and metadata from the URL. |
|||
|
|||
Note: |
|||
Implementations should include the URL reference in the metadata. |
|||
e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})] |
|||
""" |
|||
pass |
|||
|
|||
def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]: |
|||
""" |
|||
Crawl multiple URLs and return a list of Document objects. |
|||
|
|||
Args: |
|||
urls: A list of URLs to crawl. |
|||
**crawl_kwargs: Optional keyword arguments for the crawling process. |
|||
|
|||
Returns: |
|||
A list of Document objects containing the content and metadata from all URLs. |
|||
""" |
|||
documents = [] |
|||
for url in urls: |
|||
documents.extend(self.crawl_url(url, **crawl_kwargs)) |
|||
return documents |
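A minimal illustrative crawler built on the interface above, fetching raw page text with requests; it is a sketch, not one of the bundled crawlers:

from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class PlainTextCrawler(BaseCrawler):
    """Illustrative-only crawler that stores the raw response body."""

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # crawl_urls (inherited from BaseCrawler) calls this method once per URL.
        return [Document(page_content=response.text, metadata={"reference": url})]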
@ -0,0 +1,140 @@ |
|||
import asyncio |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.web_crawler.base import BaseCrawler |
|||
from deepsearcher.utils import log |
|||
|
|||
|
|||
class Crawl4AICrawler(BaseCrawler): |
|||
""" |
|||
Web crawler using the Crawl4AI library. |
|||
|
|||
This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them |
|||
into markdown format for further processing. It supports both single-page crawling |
|||
and batch crawling of multiple pages. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize the Crawl4AICrawler. |
|||
|
|||
Args: |
|||
**kwargs: Optional keyword arguments. |
|||
browser_config: Configuration for the browser used by Crawl4AI. |
|||
""" |
|||
super().__init__(**kwargs) |
|||
self.crawler = None # Lazy init |
|||
self.browser_config = kwargs.get("browser_config", None) |
|||
|
|||
def _lazy_init(self): |
|||
""" |
|||
Initialize the crawler lazily when needed. |
|||
|
|||
This method creates the AsyncWebCrawler instance with the provided browser configuration |
|||
only when it's first needed, to avoid unnecessary initialization. |
|||
""" |
|||
from crawl4ai import AsyncWebCrawler, BrowserConfig |
|||
|
|||
if self.crawler is None: |
|||
config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None |
|||
self.crawler = AsyncWebCrawler(config=config) |
|||
|
|||
async def _async_crawl(self, url: str) -> Document: |
|||
""" |
|||
Asynchronously crawl a single URL. |
|||
|
|||
Args: |
|||
url: The URL to crawl. |
|||
|
|||
Returns: |
|||
A Document object with the markdown content and metadata from the URL. |
|||
""" |
|||
if self.crawler is None: |
|||
self._lazy_init() |
|||
|
|||
async with self.crawler as crawler: |
|||
result = await crawler.arun(url) |
|||
|
|||
markdown_content = result.markdown or "" |
|||
|
|||
metadata = { |
|||
"reference": url, |
|||
"success": result.success, |
|||
"status_code": result.status_code, |
|||
"media": result.media, |
|||
"links": result.links, |
|||
} |
|||
|
|||
if hasattr(result, "metadata") and result.metadata: |
|||
metadata["title"] = result.metadata.get("title", "") |
|||
metadata["author"] = result.metadata.get("author", "") |
|||
|
|||
return Document(page_content=markdown_content, metadata=metadata) |
|||
|
|||
def crawl_url(self, url: str) -> List[Document]: |
|||
""" |
|||
Crawl a single URL. |
|||
|
|||
Args: |
|||
url: The URL to crawl. |
|||
|
|||
Returns: |
|||
A list containing a single Document object with the markdown content and metadata, |
|||
or an empty list if an error occurs. |
|||
""" |
|||
try: |
|||
document = asyncio.run(self._async_crawl(url)) |
|||
return [document] |
|||
except Exception as e: |
|||
log.error(f"Error during crawling {url}: {e}") |
|||
return [] |
|||
|
|||
async def _async_crawl_many(self, urls: List[str]) -> List[Document]: |
|||
""" |
|||
Asynchronously crawl multiple URLs. |
|||
|
|||
Args: |
|||
urls: A list of URLs to crawl. |
|||
|
|||
Returns: |
|||
A list of Document objects with the markdown content and metadata from all URLs. |
|||
""" |
|||
if self.crawler is None: |
|||
self._lazy_init() |
|||
async with self.crawler as crawler: |
|||
results = await crawler.arun_many(urls) |
|||
documents = [] |
|||
for result in results: |
|||
markdown_content = result.markdown or "" |
|||
metadata = { |
|||
"reference": result.url, |
|||
"success": result.success, |
|||
"status_code": result.status_code, |
|||
"media": result.media, |
|||
"links": result.links, |
|||
} |
|||
if hasattr(result, "metadata") and result.metadata: |
|||
metadata["title"] = result.metadata.get("title", "") |
|||
metadata["author"] = result.metadata.get("author", "") |
|||
documents.append(Document(page_content=markdown_content, metadata=metadata)) |
|||
return documents |
|||
|
|||
def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]: |
|||
""" |
|||
Crawl multiple URLs. |
|||
|
|||
Args: |
|||
urls: A list of URLs to crawl. |
|||
**crawl_kwargs: Optional keyword arguments for the crawling process. |
|||
|
|||
Returns: |
|||
A list of Document objects with the markdown content and metadata from all URLs, |
|||
or an empty list if an error occurs. |
|||
""" |
|||
try: |
|||
return asyncio.run(self._async_crawl_many(urls)) |
|||
except Exception as e: |
|||
log.error(f"Error during crawling {urls}: {e}") |
|||
return [] |
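A hedged usage sketch; crawl4ai and a browser runtime must be installed, and the browser_config values and URLs are assumptions:

# Illustrative only.
from deepsearcher.loader.web_crawler import Crawl4AICrawler

crawler = Crawl4AICrawler(browser_config={"headless": True})
docs = crawler.crawl_urls(["https://milvus.io/docs", "https://zilliz.com"])
for doc in docs:
    print(doc.metadata["reference"], doc.metadata["success"])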
@ -0,0 +1,98 @@ |
|||
from typing import List |
|||
|
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.web_crawler.base import BaseCrawler |
|||
from deepsearcher.utils import log |
|||
|
|||
|
|||
class DoclingCrawler(BaseCrawler): |
|||
""" |
|||
Web crawler using Docling's DocumentConverter and HierarchicalChunker. |
|||
|
|||
This crawler leverages Docling's capabilities to convert web pages into structured |
|||
documents and chunk them appropriately for further processing. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances. |
|||
|
|||
Args: |
|||
**kwargs: Optional keyword arguments. |
|||
""" |
|||
super().__init__(**kwargs) |
|||
from docling.document_converter import DocumentConverter |
|||
from docling_core.transforms.chunker import HierarchicalChunker |
|||
|
|||
self.converter = DocumentConverter() |
|||
self.chunker = HierarchicalChunker() |
|||
|
|||
def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]: |
|||
""" |
|||
Crawl a single URL using Docling's conversion and perform hierarchical chunking. |
|||
|
|||
Args: |
|||
url: The URL to crawl. |
|||
**crawl_kwargs: Optional keyword arguments for the crawling process. |
|||
|
|||
Returns: |
|||
A list of Document objects, each representing a chunk from the crawled URL. |
|||
|
|||
Raises: |
|||
IOError: If there is an error processing the URL. |
|||
""" |
|||
try: |
|||
# Use Docling to convert the URL to a document |
|||
conversion_result = self.converter.convert(url) |
|||
docling_document = conversion_result.document |
|||
|
|||
# Chunk the document using hierarchical chunking |
|||
chunks = list(self.chunker.chunk(docling_document)) |
|||
|
|||
documents = [] |
|||
for chunk in chunks: |
|||
metadata = {"reference": url, "text": chunk.text} |
|||
documents.append(Document(page_content=chunk.text, metadata=metadata)) |
|||
|
|||
return documents |
|||
|
|||
except Exception as e: |
|||
log.color_print(f"Error processing URL {url}: {str(e)}") |
|||
raise IOError(f"Failed to process URL {url}: {str(e)}") |
|||
|
|||
@property |
|||
def supported_file_types(self) -> List[str]: |
|||
""" |
|||
Return the list of file types and formats supported by Docling. |
|||
|
|||
Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/): |
|||
- PDF |
|||
- Office formats: DOCX, XLSX, PPTX |
|||
- Markdown |
|||
- AsciiDoc |
|||
- HTML, XHTML |
|||
- CSV |
|||
- Images: PNG, JPEG, TIFF, BMP |
|||
|
|||
Returns: |
|||
A list of file extensions supported by this crawler. |
|||
""" |
|||
return [ |
|||
"pdf", |
|||
"docx", |
|||
"xlsx", |
|||
"pptx", |
|||
"md", |
|||
"adoc", |
|||
"asciidoc", |
|||
"html", |
|||
"xhtml", |
|||
"csv", |
|||
"png", |
|||
"jpg", |
|||
"jpeg", |
|||
"tif", |
|||
"tiff", |
|||
"bmp", |
|||
] |
@ -0,0 +1,88 @@ |
|||
import os |
|||
from typing import List, Optional |
|||
|
|||
from firecrawl import FirecrawlApp, ScrapeOptions |
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.web_crawler.base import BaseCrawler |
|||
|
|||
|
|||
class FireCrawlCrawler(BaseCrawler): |
|||
""" |
|||
Web crawler using the FireCrawl service. |
|||
|
|||
This crawler uses the FireCrawl service to crawl web pages and convert them |
|||
into markdown format for further processing. It supports both single-page scraping |
|||
and recursive crawling of multiple pages. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize the FireCrawlCrawler. |
|||
|
|||
Args: |
|||
**kwargs: Optional keyword arguments. |
|||
""" |
|||
super().__init__(**kwargs) |
|||
self.app = None |
|||
|
|||
def crawl_url( |
|||
self, |
|||
url: str, |
|||
max_depth: Optional[int] = None, |
|||
limit: Optional[int] = None, |
|||
allow_backward_links: Optional[bool] = None, |
|||
) -> List[Document]: |
|||
""" |
|||
Dynamically crawls a URL using either scrape_url or crawl_url: |
|||
|
|||
- Uses scrape_url for single-page extraction if no params are provided. |
|||
- Uses crawl_url to recursively gather pages when any param is provided. |
|||
|
|||
Args: |
|||
url (str): The starting URL to crawl. |
|||
max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2). |
|||
limit (Optional[int]): Maximum number of pages to crawl (default: 20). |
|||
allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False). |
|||
|
|||
Returns: |
|||
List[Document]: List of Document objects with page content and metadata. |
|||
""" |
|||
        # Lazy init: create the Firecrawl client only on first use
        if self.app is None:
            self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
|||
|
|||
# if user just inputs a single url as param |
|||
# scrape single page |
|||
if max_depth is None and limit is None and allow_backward_links is None: |
|||
# Call the new Firecrawl API, passing formats directly |
|||
scrape_response = self.app.scrape_url(url=url, formats=["markdown"]) |
|||
data = scrape_response.model_dump() |
|||
return [ |
|||
Document( |
|||
page_content=data.get("markdown", ""), |
|||
metadata={"reference": url, **data.get("metadata", {})}, |
|||
) |
|||
] |
|||
|
|||
# else, crawl multiple pages based on users' input params |
|||
# set default values if not provided |
|||
crawl_response = self.app.crawl_url( |
|||
url=url, |
|||
limit=limit or 20, |
|||
max_depth=max_depth or 2, |
|||
allow_backward_links=allow_backward_links or False, |
|||
scrape_options=ScrapeOptions(formats=["markdown"]), |
|||
poll_interval=5, |
|||
) |
|||
items = crawl_response.model_dump().get("data", []) |
|||
|
|||
documents: List[Document] = [] |
|||
for item in items: |
|||
# Support items that are either dicts or Pydantic sub-models |
|||
item_dict = item.model_dump() if hasattr(item, "model_dump") else item |
|||
md = item_dict.get("markdown", "") |
|||
meta = item_dict.get("metadata", {}) |
|||
meta["reference"] = meta.get("url", url) |
|||
documents.append(Document(page_content=md, metadata=meta)) |
|||
|
|||
return documents |
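A hedged usage sketch; requires FIRECRAWL_API_KEY, and the URL and limits are assumptions. A bare call scrapes a single page, while passing limit or max_depth triggers a recursive crawl:

# Illustrative only.
from deepsearcher.loader.web_crawler import FireCrawlCrawler

crawler = FireCrawlCrawler()
single_page = crawler.crawl_url("https://docs.firecrawl.dev")
site_pages = crawler.crawl_url("https://docs.firecrawl.dev", max_depth=2, limit=10)
print(len(single_page), len(site_pages))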
@ -0,0 +1,62 @@ |
|||
import os |
|||
from typing import List |
|||
|
|||
import requests |
|||
from langchain_core.documents import Document |
|||
|
|||
from deepsearcher.loader.web_crawler.base import BaseCrawler |
|||
|
|||
|
|||
class JinaCrawler(BaseCrawler): |
|||
""" |
|||
Web crawler using Jina AI's rendering service. |
|||
|
|||
This crawler uses Jina AI's rendering service to crawl web pages and convert them |
|||
into markdown format for further processing. |
|||
""" |
|||
|
|||
def __init__(self, **kwargs): |
|||
""" |
|||
Initialize the JinaCrawler. |
|||
|
|||
Args: |
|||
**kwargs: Optional keyword arguments. |
|||
|
|||
Raises: |
|||
ValueError: If the JINA_API_TOKEN environment variable is not set. |
|||
""" |
|||
super().__init__(**kwargs) |
|||
self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY") |
|||
if not self.jina_api_token: |
|||
raise ValueError("Missing JINA_API_TOKEN environment variable") |
|||
|
|||
def crawl_url(self, url: str) -> List[Document]: |
|||
""" |
|||
Crawl a single URL using Jina AI's rendering service. |
|||
|
|||
Args: |
|||
url: The URL to crawl. |
|||
|
|||
Returns: |
|||
A list containing a single Document object with the markdown content and metadata. |
|||
|
|||
Raises: |
|||
HTTPError: If the request to Jina AI's service fails. |
|||
""" |
|||
jina_url = f"https://r.jina.ai/{url}" |
|||
headers = { |
|||
"Authorization": f"Bearer {self.jina_api_token}", |
|||
"X-Return-Format": "markdown", |
|||
} |
|||
|
|||
response = requests.get(jina_url, headers=headers) |
|||
response.raise_for_status() |
|||
|
|||
markdown_content = response.text |
|||
metadata = { |
|||
"reference": url, |
|||
"status_code": response.status_code, |
|||
"headers": dict(response.headers), |
|||
} |
|||
|
|||
return [Document(page_content=markdown_content, metadata=metadata)] |
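A hedged usage sketch; JINA_API_TOKEN (or JINAAI_API_KEY) must be set, and the URL is an assumption:

# Illustrative only.
from deepsearcher.loader.web_crawler import JinaCrawler

crawler = JinaCrawler()
docs = crawler.crawl_url("https://milvus.io/docs")
print(docs[0].metadata["status_code"], docs[0].page_content[:200])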
@ -0,0 +1,119 @@ |
|||
import os |
|||
from typing import List, Union |
|||
|
|||
from tqdm import tqdm |
|||
|
|||
# from deepsearcher.configuration import embedding_model, vector_db, file_loader |
|||
from deepsearcher import configuration |
|||
from deepsearcher.loader.splitter import split_docs_to_chunks |
|||
|
|||
|
|||
def load_from_local_files( |
|||
paths_or_directory: Union[str, List[str]], |
|||
collection_name: str = None, |
|||
collection_description: str = None, |
|||
force_new_collection: bool = False, |
|||
chunk_size: int = 1500, |
|||
chunk_overlap: int = 100, |
|||
batch_size: int = 256, |
|||
): |
|||
""" |
|||
Load knowledge from local files or directories into the vector database. |
|||
|
|||
This function processes files from the specified paths or directories, |
|||
splits them into chunks, embeds the chunks, and stores them in the vector database. |
|||
|
|||
Args: |
|||
paths_or_directory: A single path or a list of paths to files or directories to load. |
|||
collection_name: Name of the collection to store the data in. If None, uses the default collection. |
|||
collection_description: Description of the collection. If None, no description is set. |
|||
force_new_collection: If True, drops the existing collection and creates a new one. |
|||
chunk_size: Size of each chunk in characters. |
|||
chunk_overlap: Number of characters to overlap between chunks. |
|||
batch_size: Number of chunks to process at once during embedding. |
|||
|
|||
Raises: |
|||
FileNotFoundError: If any of the specified paths do not exist. |
|||
""" |
|||
vector_db = configuration.vector_db |
|||
if collection_name is None: |
|||
collection_name = vector_db.default_collection |
|||
collection_name = collection_name.replace(" ", "_").replace("-", "_") |
|||
embedding_model = configuration.embedding_model |
|||
file_loader = configuration.file_loader |
|||
vector_db.init_collection( |
|||
dim=embedding_model.dimension, |
|||
collection=collection_name, |
|||
description=collection_description, |
|||
force_new_collection=force_new_collection, |
|||
) |
|||
if isinstance(paths_or_directory, str): |
|||
paths_or_directory = [paths_or_directory] |
|||
all_docs = [] |
|||
for path in tqdm(paths_or_directory, desc="Loading files"): |
|||
if not os.path.exists(path): |
|||
raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.") |
|||
if os.path.isdir(path): |
|||
docs = file_loader.load_directory(path) |
|||
else: |
|||
docs = file_loader.load_file(path) |
|||
all_docs.extend(docs) |
|||
# print("Splitting docs to chunks...") |
|||
chunks = split_docs_to_chunks( |
|||
all_docs, |
|||
chunk_size=chunk_size, |
|||
chunk_overlap=chunk_overlap, |
|||
) |
|||
|
|||
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size) |
|||
vector_db.insert_data(collection=collection_name, chunks=chunks) |
|||
|
|||
|
|||
def load_from_website( |
|||
urls: Union[str, List[str]], |
|||
collection_name: str = None, |
|||
collection_description: str = None, |
|||
force_new_collection: bool = False, |
|||
chunk_size: int = 1500, |
|||
chunk_overlap: int = 100, |
|||
batch_size: int = 256, |
|||
**crawl_kwargs, |
|||
): |
|||
""" |
|||
Load knowledge from websites into the vector database. |
|||
|
|||
This function crawls the specified URLs, processes the content, |
|||
splits it into chunks, embeds the chunks, and stores them in the vector database. |
|||
|
|||
Args: |
|||
urls: A single URL or a list of URLs to crawl. |
|||
collection_name: Name of the collection to store the data in. If None, uses the default collection. |
|||
collection_description: Description of the collection. If None, no description is set. |
|||
force_new_collection: If True, drops the existing collection and creates a new one. |
|||
chunk_size: Size of each chunk in characters. |
|||
chunk_overlap: Number of characters to overlap between chunks. |
|||
batch_size: Number of chunks to process at once during embedding. |
|||
**crawl_kwargs: Additional keyword arguments to pass to the web crawler. |
|||
""" |
|||
if isinstance(urls, str): |
|||
urls = [urls] |
|||
vector_db = configuration.vector_db |
|||
embedding_model = configuration.embedding_model |
|||
web_crawler = configuration.web_crawler |
|||
|
|||
vector_db.init_collection( |
|||
dim=embedding_model.dimension, |
|||
collection=collection_name, |
|||
description=collection_description, |
|||
force_new_collection=force_new_collection, |
|||
) |
|||
|
|||
all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs) |
|||
|
|||
chunks = split_docs_to_chunks( |
|||
all_docs, |
|||
chunk_size=chunk_size, |
|||
chunk_overlap=chunk_overlap, |
|||
) |
|||
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size) |
|||
vector_db.insert_data(collection=collection_name, chunks=chunks) |
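A hedged end-to-end loading sketch: init_config must run first so the module-level vector_db, embedding_model, file_loader and web_crawler exist; a reachable vector database is assumed, the module path deepsearcher.offline_loading is an assumption, and the paths, URL and collection names are illustrative:

# Illustrative only.
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website

init_config(Configuration())
load_from_local_files("./my_docs", collection_name="my_docs", chunk_size=1500, chunk_overlap=100)
load_from_website("https://milvus.io/docs/overview.md", collection_name="milvus_docs")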
@ -0,0 +1,96 @@ |
|||
from typing import List, Tuple |
|||
|
|||
# from deepsearcher.configuration import vector_db, embedding_model, llm |
|||
from deepsearcher import configuration |
|||
from deepsearcher.vector_db.base import RetrievalResult |
|||
|
|||
|
|||
def query(original_query: str, max_iter: int = 3) -> Tuple[str, List[RetrievalResult], int]: |
|||
""" |
|||
Query the knowledge base with a question and get an answer. |
|||
|
|||
This function uses the default searcher to query the knowledge base and generate |
|||
an answer based on the retrieved information. |
|||
|
|||
Args: |
|||
original_query: The question or query to search for. |
|||
max_iter: Maximum number of iterations for the search process. |
|||
|
|||
Returns: |
|||
A tuple containing: |
|||
- The generated answer as a string |
|||
- A list of retrieval results that were used to generate the answer |
|||
- The number of tokens consumed during the process |
|||
""" |
|||
default_searcher = configuration.default_searcher |
|||
return default_searcher.query(original_query, max_iter=max_iter) |
|||
|
|||
|
|||
def retrieve( |
|||
original_query: str, max_iter: int = 3 |
|||
) -> Tuple[List[RetrievalResult], List[str], int]: |
|||
""" |
|||
Retrieve relevant information from the knowledge base without generating an answer. |
|||
|
|||
This function uses the default searcher to retrieve information from the knowledge base |
|||
that is relevant to the query. |
|||
|
|||
Args: |
|||
original_query: The question or query to search for. |
|||
max_iter: Maximum number of iterations for the search process. |
|||
|
|||
Returns: |
|||
A tuple containing: |
|||
- A list of retrieval results |
|||
- An empty list (placeholder for future use) |
|||
- The number of tokens consumed during the process |
|||
""" |
|||
default_searcher = configuration.default_searcher |
|||
retrieved_results, consume_tokens, metadata = default_searcher.retrieve( |
|||
original_query, max_iter=max_iter |
|||
) |
|||
return retrieved_results, [], consume_tokens |
|||
|
|||
|
|||
def naive_retrieve(query: str, collection: str = None, top_k=10) -> List[RetrievalResult]: |
|||
""" |
|||
Perform a simple retrieval from the knowledge base using the naive RAG approach. |
|||
|
|||
This function uses the naive RAG agent to retrieve information from the knowledge base |
|||
without any advanced techniques like iterative refinement. |
|||
|
|||
Args: |
|||
query: The question or query to search for. |
|||
collection: The name of the collection to search in. If None, searches in all collections. |
|||
top_k: The maximum number of results to return. |
|||
|
|||
Returns: |
|||
A list of retrieval results. |
|||
""" |
|||
naive_rag = configuration.naive_rag |
|||
all_retrieved_results, consume_tokens, _ = naive_rag.retrieve(query) |
|||
return all_retrieved_results |
|||
|
|||
|
|||
def naive_rag_query( |
|||
query: str, collection: str = None, top_k=10 |
|||
) -> Tuple[str, List[RetrievalResult]]: |
|||
""" |
|||
Query the knowledge base using the naive RAG approach and get an answer. |
|||
|
|||
This function uses the naive RAG agent to query the knowledge base and generate |
|||
an answer based on the retrieved information, without any advanced techniques. |
|||
|
|||
Args: |
|||
query: The question or query to search for. |
|||
collection: The name of the collection to search in. If None, searches in all collections. |
|||
top_k: The maximum number of results to consider. |
|||
|
|||
Returns: |
|||
A tuple containing: |
|||
- The generated answer as a string |
|||
- A list of retrieval results that were used to generate the answer |
|||
""" |
|||
naive_rag = configuration.naive_rag |
|||
answer, retrieved_results, consume_tokens = naive_rag.query(query) |
|||
return answer, retrieved_results |
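# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the public module): it assumes
# DeepSearcher's configuration has already been initialized elsewhere so that
# `configuration.default_searcher` and `configuration.naive_rag` are set, and
# that this module is importable as `deepsearcher.online_query` (an assumption
# inferred from the import paths above).
#
#   from deepsearcher.online_query import query, retrieve, naive_rag_query
#
#   answer, results, tokens = query("How does hybrid search work?", max_iter=2)
#   print(f"{answer}\n({tokens} tokens, {len(results)} supporting chunks)")
#
#   results, _, tokens = retrieve("vector database comparison")
#   answer, results = naive_rag_query("What is Milvus?")
# ---------------------------------------------------------------------------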
@ -0,0 +1,160 @@ |
|||
import logging |
|||
|
|||
from termcolor import colored |
|||
|
|||
|
|||
class ColoredFormatter(logging.Formatter): |
|||
""" |
|||
A custom formatter for logging that adds colors to log messages. |
|||
|
|||
This formatter adds colors to log messages based on their level, |
|||
making it easier to distinguish between different types of logs. |
|||
|
|||
Attributes: |
|||
COLORS: A dictionary mapping log levels to colors. |
|||
""" |
|||
|
|||
COLORS = { |
|||
"DEBUG": "cyan", |
|||
"INFO": "green", |
|||
"WARNING": "yellow", |
|||
"ERROR": "red", |
|||
"CRITICAL": "magenta", |
|||
} |
|||
|
|||
def format(self, record): |
|||
""" |
|||
Format a log record with colors. |
|||
|
|||
Args: |
|||
record: The log record to format. |
|||
|
|||
Returns: |
|||
The formatted log message with colors. |
|||
""" |
|||
# the entire log line will be colored |
|||
log_message = super().format(record) |
|||
return colored(log_message, self.COLORS.get(record.levelname, "white")) |
|||
|
|||
# only the log level will be colored |
|||
# levelname_colored = colored(record.levelname, self.COLORS.get(record.levelname, 'white')) |
|||
# record.levelname = levelname_colored |
|||
# return super().format(record) |
|||
|
|||
# only keywords will be colored (requires a KEYWORDS dict mapping words to colors) |
|||
# message = record.msg |
|||
# for word, color in self.KEYWORDS.items(): |
|||
# if word in message: |
|||
# message = message.replace(word, colored(word, color)) |
|||
# record.msg = message |
|||
# return super().format(record) |
|||
|
|||
|
|||
# config log |
|||
dev_logger = logging.getLogger("dev") |
|||
dev_formatter = ColoredFormatter("%(asctime)s - %(levelname)s - %(message)s") |
|||
dev_handler = logging.StreamHandler() |
|||
dev_handler.setFormatter(dev_formatter) |
|||
dev_logger.addHandler(dev_handler) |
|||
dev_logger.setLevel(logging.INFO) |
|||
|
|||
progress_logger = logging.getLogger("progress") |
|||
progress_handler = logging.StreamHandler() |
|||
progress_handler.setFormatter(ColoredFormatter("%(message)s")) |
|||
progress_logger.addHandler(progress_handler) |
|||
progress_logger.setLevel(logging.INFO) |
|||
|
|||
dev_mode = False |
|||
|
|||
|
|||
def set_dev_mode(mode: bool): |
|||
""" |
|||
Set the development mode. |
|||
|
|||
When in development mode, debug, info, and warning logs are displayed. |
|||
When not in development mode, only error and critical logs are displayed. |
|||
|
|||
Args: |
|||
mode: True to enable development mode, False to disable it. |
|||
""" |
|||
global dev_mode |
|||
dev_mode = mode |
|||
|
|||
|
|||
def set_level(level): |
|||
""" |
|||
Set the logging level for the development logger. |
|||
|
|||
Args: |
|||
level: The logging level to set (e.g., logging.DEBUG, logging.INFO). |
|||
""" |
|||
dev_logger.setLevel(level) |
|||
|
|||
|
|||
def debug(message): |
|||
""" |
|||
Log a debug message. |
|||
|
|||
Args: |
|||
message: The message to log. |
|||
""" |
|||
if dev_mode: |
|||
dev_logger.debug(message) |
|||
|
|||
|
|||
def info(message): |
|||
""" |
|||
Log an info message. |
|||
|
|||
Args: |
|||
message: The message to log. |
|||
""" |
|||
if dev_mode: |
|||
dev_logger.info(message) |
|||
|
|||
|
|||
def warning(message): |
|||
""" |
|||
Log a warning message. |
|||
|
|||
Args: |
|||
message: The message to log. |
|||
""" |
|||
if dev_mode: |
|||
dev_logger.warning(message) |
|||
|
|||
|
|||
def error(message): |
|||
""" |
|||
Log an error message. |
|||
|
|||
Args: |
|||
message: The message to log. |
|||
""" |
|||
if dev_mode: |
|||
dev_logger.error(message) |
|||
|
|||
|
|||
def critical(message): |
|||
""" |
|||
Log a critical message and raise a RuntimeError. |
|||
|
|||
Args: |
|||
message: The message to log. |
|||
|
|||
Raises: |
|||
RuntimeError: Always raised with the provided message. |
|||
""" |
|||
dev_logger.critical(message) |
|||
raise RuntimeError(message) |
|||
|
|||
|
|||
def color_print(message, **kwargs): |
|||
""" |
|||
Print a colored message to the progress logger. |
|||
|
|||
Args: |
|||
message: The message to print. |
|||
**kwargs: Additional keyword arguments to pass to the logger. |
|||
""" |
|||
progress_logger.info(message) |
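# Minimal demonstration of the helpers above (illustrative; runs only when the
# module is executed directly, so importing it elsewhere is unaffected).
if __name__ == "__main__":
    set_dev_mode(True)          # enable debug/info/warning output
    set_level(logging.DEBUG)
    debug("resolving sub-queries ...")
    info("retrieved 12 chunks from the vector store")
    warning("falling back to the default collection")
    color_print("==== Iteration 1/3 ====")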
@ -0,0 +1,6 @@ |
|||
from .azure_search import AzureSearch |
|||
from .milvus import Milvus, RetrievalResult |
|||
from .oracle import OracleDB |
|||
from .qdrant import Qdrant |
|||
|
|||
__all__ = ["Milvus", "RetrievalResult", "OracleDB", "Qdrant", "AzureSearch"] |
@ -0,0 +1,279 @@ |
|||
import uuid |
|||
from typing import Any, Dict, List, Optional |
|||
|
|||
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult |
|||
|
|||
|
|||
class AzureSearch(BaseVectorDB): |
|||
def __init__(self, endpoint, index_name, api_key, vector_field): |
|||
super().__init__(default_collection=index_name) |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.search.documents import SearchClient |
|||
|
|||
self.client = SearchClient( |
|||
endpoint=endpoint, |
|||
index_name=index_name, |
|||
credential=AzureKeyCredential(api_key), |
|||
) |
|||
self.vector_field = vector_field |
|||
self.endpoint = endpoint |
|||
self.index_name = index_name |
|||
self.api_key = api_key |
|||
|
|||
def init_collection(self): |
|||
"""Initialize Azure Search index with proper schema""" |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.core.exceptions import ResourceNotFoundError |
|||
from azure.search.documents.indexes import SearchIndexClient |
|||
from azure.search.documents.indexes.models import ( |
|||
SearchableField, |
|||
SearchField, |
|||
SearchIndex, |
|||
SimpleField, |
|||
) |
|||
|
|||
index_client = SearchIndexClient( |
|||
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key) |
|||
) |
|||
|
|||
# Create the index (simplified for compatibility with older SDK versions) |
|||
fields = [ |
|||
SimpleField(name="id", type="Edm.String", key=True), |
|||
SearchableField(name="content", type="Edm.String"), |
|||
SearchField( |
|||
name="content_vector", |
|||
type="Collection(Edm.Single)", |
|||
searchable=True, |
|||
vector_search_dimensions=1536, |
|||
), |
|||
] |
|||
|
|||
# Create index with fields |
|||
index = SearchIndex(name=self.index_name, fields=fields) |
|||
|
|||
try: |
|||
# Try to delete existing index |
|||
try: |
|||
index_client.delete_index(self.index_name) |
|||
except ResourceNotFoundError: |
|||
pass |
|||
|
|||
# Create the index |
|||
index_client.create_index(index) |
|||
except Exception as e: |
|||
print(f"Error creating index: {str(e)}") |
|||
|
|||
def insert_data(self, documents: List[dict]): |
|||
"""Batch insert documents with vector embeddings""" |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.search.documents import SearchClient |
|||
|
|||
search_client = SearchClient( |
|||
endpoint=self.endpoint, |
|||
index_name=self.index_name, |
|||
credential=AzureKeyCredential(self.api_key), |
|||
) |
|||
|
|||
actions = [ |
|||
{ |
|||
"@search.action": "mergeOrUpload" if doc.get("id") else "upload",  # "merge" fails when the id is not already in the index |
|||
"id": doc.get("id", str(uuid.uuid4())), |
|||
"content": doc["text"], |
|||
"content_vector": doc["vector"], |
|||
} |
|||
for doc in documents |
|||
] |
|||
|
|||
result = search_client.upload_documents(actions) |
|||
return [x.succeeded for x in result] |
|||
|
|||
def search_data( |
|||
self, collection: Optional[str], vector: List[float], top_k: int = 50 |
|||
) -> List[RetrievalResult]: |
|||
"""Azure Cognitive Search implementation with compatibility for older SDK versions""" |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.search.documents import SearchClient |
|||
|
|||
search_client = SearchClient( |
|||
endpoint=self.endpoint, |
|||
index_name=collection or self.index_name, |
|||
credential=AzureKeyCredential(self.api_key), |
|||
) |
|||
|
|||
# Validate that vector is not empty |
|||
if not vector or len(vector) == 0: |
|||
print("Error: Empty vector provided for search. Vector must have 1536 dimensions.") |
|||
return [] |
|||
|
|||
# Debug vector and field info |
|||
print(f"Vector length for search: {len(vector)}") |
|||
print(f"Vector field name: {self.vector_field}") |
|||
|
|||
# Ensure vector has the right dimensions |
|||
if len(vector) != 1536: |
|||
print(f"Warning: Vector length {len(vector)} does not match expected 1536 dimensions") |
|||
return [] |
|||
|
|||
# Execute search with direct parameters - simpler approach |
|||
try: |
|||
print(f"Executing search with top_k={top_k}") |
|||
|
|||
# Directly use the search_by_vector method for compatibility |
|||
body = { |
|||
"search": "*", |
|||
"select": "id,content", |
|||
"top": top_k, |
|||
"vectorQueries": [ |
|||
{ |
|||
"vector": vector, |
|||
"fields": self.vector_field, |
|||
"k": top_k, |
|||
"kind": "vector", |
|||
} |
|||
], |
|||
} |
|||
|
|||
# Print the search request body for debugging |
|||
print(f"Search request body: {body}") |
|||
|
|||
# Use the REST API directly |
|||
result = search_client._client.documents.search_post( |
|||
search_request=body, headers={"api-key": self.api_key} |
|||
) |
|||
|
|||
# Format results |
|||
search_results = [] |
|||
if hasattr(result, "results"): |
|||
for doc in result.results: |
|||
try: |
|||
doc_dict = doc.as_dict() if hasattr(doc, "as_dict") else doc |
|||
content = doc_dict.get("content", "") |
|||
doc_id = doc_dict.get("id", "") |
|||
score = doc_dict.get("@search.score", 0.0) |
|||
|
|||
result = RetrievalResult( |
|||
embedding=[], # We don't get the vectors back |
|||
text=content, |
|||
reference=doc_id, |
|||
metadata={"source": doc_id}, |
|||
score=score, |
|||
) |
|||
search_results.append(result) |
|||
except Exception as e: |
|||
print(f"Error processing result: {str(e)}") |
|||
|
|||
return search_results |
|||
except Exception as e: |
|||
print(f"Search error: {str(e)}") |
|||
|
|||
# Try another approach if the first one fails |
|||
try: |
|||
print("Trying alternative search method...") |
|||
results = search_client.search(search_text="*", select=["id", "content"], top=top_k) |
|||
|
|||
# Process results |
|||
alt_results = [] |
|||
for doc in results: |
|||
try: |
|||
# Handle different result formats |
|||
if isinstance(doc, dict): |
|||
content = doc.get("content", "") |
|||
doc_id = doc.get("id", "") |
|||
score = doc.get("@search.score", 0.0) |
|||
else: |
|||
content = getattr(doc, "content", "") |
|||
doc_id = getattr(doc, "id", "") |
|||
score = getattr(doc, "@search.score", 0.0) |
|||
|
|||
result = RetrievalResult( |
|||
embedding=[], |
|||
text=content, |
|||
reference=doc_id, |
|||
metadata={"source": doc_id}, |
|||
score=score, |
|||
) |
|||
alt_results.append(result) |
|||
except Exception as e: |
|||
print(f"Error processing result: {str(e)}") |
|||
|
|||
return alt_results |
|||
except Exception as e: |
|||
print(f"Alternative search failed: {str(e)}") |
|||
return [] |
|||
|
|||
def clear_db(self): |
|||
"""Delete all documents in the index""" |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.search.documents import SearchClient |
|||
|
|||
search_client = SearchClient( |
|||
endpoint=self.endpoint, |
|||
index_name=self.index_name, |
|||
credential=AzureKeyCredential(self.api_key), |
|||
) |
|||
|
|||
docs = search_client.search(search_text="*", include_total_count=True, select=["id"]) |
|||
ids = [doc["id"] for doc in docs] |
|||
|
|||
if ids: |
|||
search_client.delete_documents([{"id": id} for id in ids]) |
|||
|
|||
return len(ids) |
|||
|
|||
def get_all_collections(self) -> List[str]: |
|||
"""List all search indices in Azure Cognitive Search""" |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.search.documents.indexes import SearchIndexClient |
|||
|
|||
try: |
|||
index_client = SearchIndexClient( |
|||
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key) |
|||
) |
|||
return [index.name for index in index_client.list_indexes()] |
|||
except Exception as e: |
|||
print(f"Failed to list indices: {str(e)}") |
|||
return [] |
|||
|
|||
def get_collection_info(self, name: str) -> Dict[str, Any]: |
|||
"""Retrieve index metadata""" |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.search.documents.indexes import SearchIndexClient |
|||
|
|||
index_client = SearchIndexClient( |
|||
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key) |
|||
) |
|||
return index_client.get_index(name).__dict__ |
|||
|
|||
def collection_exists(self, name: str) -> bool: |
|||
"""Check index existence""" |
|||
from azure.core.exceptions import ResourceNotFoundError |
|||
|
|||
try: |
|||
self.get_collection_info(name) |
|||
return True |
|||
except ResourceNotFoundError: |
|||
return False |
|||
|
|||
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]: |
|||
"""List all Azure Search indices with metadata""" |
|||
from azure.core.credentials import AzureKeyCredential |
|||
from azure.search.documents.indexes import SearchIndexClient |
|||
|
|||
try: |
|||
index_client = SearchIndexClient( |
|||
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key) |
|||
) |
|||
|
|||
collections = [] |
|||
for index in index_client.list_indexes(): |
|||
collections.append( |
|||
CollectionInfo( |
|||
collection_name=index.name, |
|||
description=f"Azure Search Index with {len(index.fields) if hasattr(index, 'fields') else 0} fields", |
|||
) |
|||
) |
|||
return collections |
|||
|
|||
except Exception as e: |
|||
print(f"Collection listing failed: {str(e)}") |
|||
return [] |
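# Usage sketch (illustrative): the endpoint, key and index name below are
# placeholders, and the index is assumed to use a 1536-dimensional vector
# field named "content_vector", matching the schema created in
# init_collection() above.
#
#   db = AzureSearch(
#       endpoint="https://<your-search-service>.search.windows.net",
#       index_name="deepsearcher",
#       api_key="<admin-key>",
#       vector_field="content_vector",
#   )
#   db.init_collection()
#   db.insert_data([{"text": "hello world", "vector": [0.01] * 1536}])
#   hits = db.search_data(None, [0.01] * 1536, top_k=5)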
@ -0,0 +1,207 @@ |
|||
from abc import ABC, abstractmethod |
|||
from typing import List, Union |
|||
|
|||
import numpy as np |
|||
|
|||
from deepsearcher.loader.splitter import Chunk |
|||
|
|||
|
|||
class RetrievalResult: |
|||
""" |
|||
Represents a result retrieved from the vector database. |
|||
|
|||
This class encapsulates the information about a retrieved document, |
|||
including its embedding, text content, reference, metadata, and similarity score. |
|||
|
|||
Attributes: |
|||
embedding: The vector embedding of the document. |
|||
text: The text content of the document. |
|||
reference: A reference to the source of the document. |
|||
metadata: Additional metadata associated with the document. |
|||
score: The similarity score of the document to the query. |
|||
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
embedding: np.array, |
|||
text: str, |
|||
reference: str, |
|||
metadata: dict, |
|||
score: float = 0.0, |
|||
): |
|||
""" |
|||
Initialize a RetrievalResult object. |
|||
|
|||
Args: |
|||
embedding: The vector embedding of the document. |
|||
text: The text content of the document. |
|||
reference: A reference to the source of the document. |
|||
metadata: Additional metadata associated with the document. |
|||
score: The similarity score of the document to the query. Defaults to 0.0. |
|||
""" |
|||
self.embedding = embedding |
|||
self.text = text |
|||
self.reference = reference |
|||
self.metadata = metadata |
|||
self.score: float = score |
|||
|
|||
def __repr__(self): |
|||
""" |
|||
Return a string representation of the RetrievalResult. |
|||
|
|||
Returns: |
|||
A string representation of the RetrievalResult object. |
|||
""" |
|||
return f"RetrievalResult(score={self.score}, embedding={self.embedding}, text={self.text}, reference={self.reference}, metadata={self.metadata})" |
|||
|
|||
|
|||
def deduplicate_results(results: List[RetrievalResult]) -> List[RetrievalResult]: |
|||
""" |
|||
Remove duplicate results based on text content. |
|||
|
|||
This function removes duplicate results from a list of RetrievalResult objects |
|||
by keeping only the first occurrence of each unique text content. |
|||
|
|||
Args: |
|||
results: A list of RetrievalResult objects to deduplicate. |
|||
|
|||
Returns: |
|||
A list of deduplicated RetrievalResult objects. |
|||
""" |
|||
all_text_set = set() |
|||
deduplicated_results = [] |
|||
for result in results: |
|||
if result.text not in all_text_set: |
|||
all_text_set.add(result.text) |
|||
deduplicated_results.append(result) |
|||
return deduplicated_results |
|||
|
|||
|
|||
class CollectionInfo: |
|||
""" |
|||
Represents information about a collection in the vector database. |
|||
|
|||
This class encapsulates the name and description of a collection. |
|||
|
|||
Attributes: |
|||
collection_name: The name of the collection. |
|||
description: The description of the collection. |
|||
""" |
|||
|
|||
def __init__(self, collection_name: str, description: str): |
|||
""" |
|||
Initialize a CollectionInfo object. |
|||
|
|||
Args: |
|||
collection_name: The name of the collection. |
|||
description: The description of the collection. |
|||
""" |
|||
self.collection_name = collection_name |
|||
self.description = description |
|||
|
|||
|
|||
class BaseVectorDB(ABC): |
|||
""" |
|||
Abstract base class for vector database implementations. |
|||
|
|||
This class defines the interface for vector database implementations, |
|||
including methods for initializing collections, inserting data, searching, |
|||
listing collections, and clearing the database. |
|||
|
|||
Attributes: |
|||
default_collection: The name of the default collection. |
|||
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
default_collection: str = "deepsearcher", |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize a BaseVectorDB object. |
|||
|
|||
Args: |
|||
default_collection: The name of the default collection. Defaults to "deepsearcher". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
self.default_collection = default_collection |
|||
|
|||
@abstractmethod |
|||
def init_collection( |
|||
self, |
|||
dim: int, |
|||
collection: str, |
|||
description: str, |
|||
force_new_collection=False, |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize a collection in the vector database. |
|||
|
|||
Args: |
|||
dim: The dimensionality of the vectors in the collection. |
|||
collection: The name of the collection. |
|||
description: The description of the collection. |
|||
force_new_collection: If True, drop the existing collection and create a new one. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
pass |
|||
|
|||
@abstractmethod |
|||
def insert_data(self, collection: str, chunks: List[Chunk], *args, **kwargs): |
|||
""" |
|||
Insert data into a collection in the vector database. |
|||
|
|||
Args: |
|||
collection: The name of the collection. |
|||
chunks: A list of Chunk objects to insert. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
pass |
|||
|
|||
@abstractmethod |
|||
def search_data( |
|||
self, collection: str, vector: Union[np.array, List[float]], *args, **kwargs |
|||
) -> List[RetrievalResult]: |
|||
""" |
|||
Search for similar vectors in a collection. |
|||
|
|||
Args: |
|||
collection: The name of the collection. |
|||
vector: The query vector to search for. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
A list of RetrievalResult objects representing the search results. |
|||
""" |
|||
pass |
|||
|
|||
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]: |
|||
""" |
|||
List all collections in the vector database. |
|||
|
|||
Args: |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
A list of CollectionInfo objects representing the collections. |
|||
""" |
|||
pass |
|||
|
|||
@abstractmethod |
|||
def clear_db(self, *args, **kwargs): |
|||
""" |
|||
Clear the vector database. |
|||
|
|||
Args: |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
pass |
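# A minimal in-memory reference backend (illustrative sketch only, not used by
# DeepSearcher itself). It shows how a concrete implementation is expected to
# map the abstract interface above onto real storage; cosine similarity as the
# scoring function is an assumption, real backends may rank differently.
class InMemoryVectorDB(BaseVectorDB):
    def __init__(self, default_collection: str = "deepsearcher"):
        super().__init__(default_collection)
        self._store = {}  # collection name -> list of RetrievalResult

    def init_collection(self, dim, collection, description, force_new_collection=False, *args, **kwargs):
        # Creating a collection just reserves a bucket; `dim` is not enforced here.
        if force_new_collection or collection not in self._store:
            self._store[collection] = []

    def insert_data(self, collection, chunks, *args, **kwargs):
        # Store each chunk as a RetrievalResult with a neutral score of 0.0.
        self._store.setdefault(collection, []).extend(
            RetrievalResult(np.array(chunk.embedding), chunk.text, chunk.reference, chunk.metadata)
            for chunk in chunks
        )

    def search_data(self, collection, vector, top_k: int = 5, *args, **kwargs):
        # Rank stored chunks by cosine similarity to the query vector.
        query = np.asarray(vector, dtype=float)
        scored = []
        for item in self._store.get(collection, []):
            emb = np.asarray(item.embedding, dtype=float)
            denom = (np.linalg.norm(query) * np.linalg.norm(emb)) or 1.0
            score = float(query @ emb / denom)
            scored.append(RetrievalResult(emb, item.text, item.reference, item.metadata, score))
        return sorted(scored, key=lambda r: r.score, reverse=True)[:top_k]

    def clear_db(self, *args, **kwargs):
        self._store.clear()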
@ -0,0 +1,305 @@ |
|||
from typing import List, Optional, Union |
|||
|
|||
import numpy as np |
|||
from pymilvus import AnnSearchRequest, DataType, Function, FunctionType, MilvusClient, RRFRanker |
|||
|
|||
from deepsearcher.loader.splitter import Chunk |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult |
|||
|
|||
|
|||
class Milvus(BaseVectorDB): |
|||
"""Milvus vector database implementation (a subclass of BaseVectorDB).""" |
|||
|
|||
client: MilvusClient = None |
|||
|
|||
def __init__( |
|||
self, |
|||
default_collection: str = "deepsearcher", |
|||
uri: str = "http://localhost:19530", |
|||
token: str = "root:Milvus", |
|||
user: str = "", |
|||
password: str = "", |
|||
db: str = "default", |
|||
hybrid: bool = False, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize the Milvus client. |
|||
|
|||
Args: |
|||
default_collection (str, optional): Default collection name. Defaults to "deepsearcher". |
|||
uri (str, optional): URI for connecting to Milvus server. Defaults to "http://localhost:19530". |
|||
token (str, optional): Authentication token for Milvus. Defaults to "root:Milvus". |
|||
user (str, optional): Username for authentication. Defaults to "". |
|||
password (str, optional): Password for authentication. Defaults to "". |
|||
db (str, optional): Database name. Defaults to "default". |
|||
hybrid (bool, optional): Whether to enable hybrid search. Defaults to False. |
|||
**kwargs: Additional keyword arguments to pass to the MilvusClient. |
|||
""" |
|||
super().__init__(default_collection) |
|||
self.default_collection = default_collection |
|||
self.client = MilvusClient( |
|||
uri=uri, user=user, password=password, token=token, db_name=db, timeout=30, **kwargs |
|||
) |
|||
|
|||
self.hybrid = hybrid |
|||
|
|||
def init_collection( |
|||
self, |
|||
dim: int, |
|||
collection: Optional[str] = "deepsearcher", |
|||
description: Optional[str] = "", |
|||
force_new_collection: bool = False, |
|||
text_max_length: int = 65_535, |
|||
reference_max_length: int = 2048, |
|||
metric_type: str = "L2", |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize a collection in Milvus. |
|||
|
|||
Args: |
|||
dim (int): Dimension of the vector embeddings. |
|||
collection (Optional[str], optional): Collection name. Defaults to "deepsearcher". |
|||
description (Optional[str], optional): Collection description. Defaults to "". |
|||
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False. |
|||
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535. |
|||
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048. |
|||
metric_type (str, optional): Metric type for vector similarity search. Defaults to "L2". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
if description is None: |
|||
description = "" |
|||
|
|||
self.metric_type = metric_type |
|||
|
|||
try: |
|||
has_collection = self.client.has_collection(collection, timeout=5) |
|||
if force_new_collection and has_collection: |
|||
self.client.drop_collection(collection) |
|||
elif has_collection: |
|||
return |
|||
schema = self.client.create_schema( |
|||
enable_dynamic_field=False, auto_id=True, description=description |
|||
) |
|||
schema.add_field("id", DataType.INT64, is_primary=True) |
|||
schema.add_field("embedding", DataType.FLOAT_VECTOR, dim=dim) |
|||
|
|||
if self.hybrid: |
|||
analyzer_params = {"tokenizer": "standard", "filter": ["lowercase"]} |
|||
schema.add_field( |
|||
"text", |
|||
DataType.VARCHAR, |
|||
max_length=text_max_length, |
|||
analyzer_params=analyzer_params, |
|||
enable_match=True, |
|||
enable_analyzer=True, |
|||
) |
|||
else: |
|||
schema.add_field("text", DataType.VARCHAR, max_length=text_max_length) |
|||
|
|||
schema.add_field("reference", DataType.VARCHAR, max_length=reference_max_length) |
|||
schema.add_field("metadata", DataType.JSON) |
|||
|
|||
if self.hybrid: |
|||
schema.add_field("sparse_vector", DataType.SPARSE_FLOAT_VECTOR) |
|||
bm25_function = Function( |
|||
name="bm25", |
|||
function_type=FunctionType.BM25, |
|||
input_field_names=["text"], |
|||
output_field_names="sparse_vector", |
|||
) |
|||
schema.add_function(bm25_function) |
|||
|
|||
index_params = self.client.prepare_index_params() |
|||
index_params.add_index(field_name="embedding", metric_type=metric_type) |
|||
|
|||
if self.hybrid: |
|||
index_params.add_index( |
|||
field_name="sparse_vector", |
|||
index_type="SPARSE_INVERTED_INDEX", |
|||
metric_type="BM25", |
|||
) |
|||
|
|||
self.client.create_collection( |
|||
collection, |
|||
schema=schema, |
|||
index_params=index_params, |
|||
consistency_level="Strong", |
|||
) |
|||
log.color_print(f"Created collection [{collection}] successfully") |
|||
except Exception as e: |
|||
log.critical(f"fail to init db for milvus, error info: {e}") |
|||
|
|||
def insert_data( |
|||
self, |
|||
collection: Optional[str], |
|||
chunks: List[Chunk], |
|||
batch_size: int = 256, |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Insert data into a Milvus collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. If None, uses default_collection. |
|||
chunks (List[Chunk]): List of Chunk objects to insert. |
|||
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
texts = [chunk.text for chunk in chunks] |
|||
references = [chunk.reference for chunk in chunks] |
|||
metadatas = [chunk.metadata for chunk in chunks] |
|||
embeddings = [chunk.embedding for chunk in chunks] |
|||
|
|||
datas = [ |
|||
{ |
|||
"embedding": embedding, |
|||
"text": text, |
|||
"reference": reference, |
|||
"metadata": metadata, |
|||
} |
|||
for embedding, text, reference, metadata in zip( |
|||
embeddings, texts, references, metadatas |
|||
) |
|||
] |
|||
batch_datas = [datas[i : i + batch_size] for i in range(0, len(datas), batch_size)] |
|||
try: |
|||
for batch_data in batch_datas: |
|||
self.client.insert(collection_name=collection, data=batch_data) |
|||
except Exception as e: |
|||
log.critical(f"fail to insert data, error info: {e}") |
|||
|
|||
def search_data( |
|||
self, |
|||
collection: Optional[str], |
|||
vector: Union[np.array, List[float]], |
|||
top_k: int = 5, |
|||
query_text: Optional[str] = None, |
|||
*args, |
|||
**kwargs, |
|||
) -> List[RetrievalResult]: |
|||
""" |
|||
Search for similar vectors in a Milvus collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. If None, uses default_collection. |
|||
vector (Union[np.array, List[float]]): Query vector for similarity search. |
|||
top_k (int, optional): Number of results to return. Defaults to 5. |
|||
query_text (Optional[str], optional): Original query text for hybrid search. Defaults to None. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[RetrievalResult]: List of retrieval results containing similar vectors. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
try: |
|||
use_hybrid = self.hybrid and query_text |
|||
|
|||
if use_hybrid: |
|||
sparse_search_params = {"metric_type": "BM25"} |
|||
sparse_request = AnnSearchRequest( |
|||
[query_text], "sparse_vector", sparse_search_params, limit=top_k |
|||
) |
|||
|
|||
dense_search_params = {"metric_type": self.metric_type} |
|||
dense_request = AnnSearchRequest( |
|||
[vector], "embedding", dense_search_params, limit=top_k |
|||
) |
|||
|
|||
search_results = self.client.hybrid_search( |
|||
collection_name=collection, |
|||
reqs=[sparse_request, dense_request], |
|||
ranker=RRFRanker(), |
|||
limit=top_k, |
|||
output_fields=["embedding", "text", "reference", "metadata"], |
|||
timeout=10, |
|||
) |
|||
else: |
|||
search_results = self.client.search( |
|||
collection_name=collection, |
|||
data=[vector], |
|||
limit=top_k, |
|||
output_fields=["embedding", "text", "reference", "metadata"], |
|||
timeout=10, |
|||
) |
|||
|
|||
return [ |
|||
RetrievalResult( |
|||
embedding=b["entity"]["embedding"], |
|||
text=b["entity"]["text"], |
|||
reference=b["entity"]["reference"], |
|||
score=b["distance"], |
|||
metadata=b["entity"]["metadata"], |
|||
) |
|||
for a in search_results |
|||
for b in a |
|||
] |
|||
except Exception as e: |
|||
log.critical(f"fail to search data, error info: {e}") |
|||
return [] |
|||
|
|||
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]: |
|||
""" |
|||
List all collections in the Milvus database. |
|||
|
|||
Args: |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[CollectionInfo]: List of collection information objects. |
|||
""" |
|||
collection_infos = [] |
|||
dim = kwargs.pop("dim", 0) |
|||
try: |
|||
collections = self.client.list_collections() |
|||
for collection in collections: |
|||
description = self.client.describe_collection(collection) |
|||
if dim != 0: |
|||
skip = False |
|||
for field_dict in description["fields"]: |
|||
if ( |
|||
field_dict["name"] == "embedding" |
|||
and field_dict["type"] == DataType.FLOAT_VECTOR |
|||
): |
|||
if field_dict["params"]["dim"] != dim: |
|||
skip = True |
|||
if skip: |
|||
continue |
|||
collection_infos.append( |
|||
CollectionInfo( |
|||
collection_name=collection, |
|||
description=description["description"], |
|||
) |
|||
) |
|||
except Exception as e: |
|||
log.critical(f"fail to list collections, error info: {e}") |
|||
return collection_infos |
|||
|
|||
def clear_db(self, collection: str = "deepsearcher", *args, **kwargs): |
|||
""" |
|||
Clear (drop) a collection from the Milvus database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to drop. Defaults to "deepsearcher". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
try: |
|||
self.client.drop_collection(collection) |
|||
except Exception as e: |
|||
log.warning(f"fail to clear db, error info: {e}") |
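# Usage sketch (illustrative): assumes a Milvus instance is reachable at the
# default URI and that `Chunk` accepts the keyword arguments shown below
# (text / reference / metadata / embedding), which matches how chunks are
# consumed in insert_data() above.
#
#   db = Milvus(uri="http://localhost:19530", hybrid=False)
#   db.init_collection(dim=4, collection="demo", description="toy collection")
#   db.insert_data("demo", [
#       Chunk(text="hello", reference="doc1", metadata={}, embedding=[0.1, 0.2, 0.3, 0.4]),
#   ])
#   hits = db.search_data("demo", [0.1, 0.2, 0.3, 0.4], top_k=3)
#   for hit in hits:
#       print(hit.score, hit.reference, hit.text[:50])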
@ -0,0 +1,536 @@ |
|||
import array |
|||
import json |
|||
from typing import List, Optional, Union |
|||
|
|||
import numpy as np |
|||
|
|||
from deepsearcher.loader.splitter import Chunk |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult |
|||
|
|||
|
|||
class OracleDB(BaseVectorDB): |
|||
"""Oracle vector database implementation (a subclass of BaseVectorDB).""" |
|||
|
|||
client = None |
|||
|
|||
def __init__( |
|||
self, |
|||
user: str, |
|||
password: str, |
|||
dsn: str, |
|||
config_dir: str, |
|||
wallet_location: str, |
|||
wallet_password: str, |
|||
min: int = 1, |
|||
max: int = 10, |
|||
increment: int = 1, |
|||
default_collection: str = "deepsearcher", |
|||
): |
|||
""" |
|||
Initialize the Oracle database connection. |
|||
|
|||
Args: |
|||
user (str): Oracle database username. |
|||
password (str): Oracle database password. |
|||
dsn (str): Oracle database connection string. |
|||
config_dir (str): Directory containing Oracle configuration files. |
|||
wallet_location (str): Location of the Oracle wallet. |
|||
wallet_password (str): Password for the Oracle wallet. |
|||
min (int, optional): Minimum number of connections in the pool. Defaults to 1. |
|||
max (int, optional): Maximum number of connections in the pool. Defaults to 10. |
|||
increment (int, optional): Increment for adding new connections. Defaults to 1. |
|||
default_collection (str, optional): Default collection name. Defaults to "deepsearcher". |
|||
""" |
|||
super().__init__(default_collection) |
|||
self.default_collection = default_collection |
|||
|
|||
import oracledb |
|||
|
|||
oracledb.defaults.fetch_lobs = False |
|||
self.DB_TYPE_VECTOR = oracledb.DB_TYPE_VECTOR |
|||
|
|||
try: |
|||
self.client = oracledb.create_pool( |
|||
user=user, |
|||
password=password, |
|||
dsn=dsn, |
|||
config_dir=config_dir, |
|||
wallet_location=wallet_location, |
|||
wallet_password=wallet_password, |
|||
min=min, |
|||
max=max, |
|||
increment=increment, |
|||
) |
|||
log.color_print(f"Connected to Oracle database at {dsn}") |
|||
self.check_table() |
|||
except Exception as e: |
|||
log.critical(f"Failed to connect to Oracle database at {dsn}") |
|||
log.critical(f"Oracle database error in init: {e}") |
|||
raise |
|||
|
|||
def numpy_converter_in(self, value): |
|||
"""Convert numpy array to array.array""" |
|||
if value.dtype == np.float64: |
|||
dtype = "d" |
|||
elif value.dtype == np.float32: |
|||
dtype = "f" |
|||
else: |
|||
dtype = "b" |
|||
return array.array(dtype, value) |
|||
|
|||
def input_type_handler(self, cursor, value, arraysize): |
|||
"""Set the type handler for the input data""" |
|||
if isinstance(value, np.ndarray): |
|||
return cursor.var( |
|||
self.DB_TYPE_VECTOR, |
|||
arraysize=arraysize, |
|||
inconverter=self.numpy_converter_in, |
|||
) |
|||
|
|||
def numpy_converter_out(self, value): |
|||
"""Convert array.array to numpy array""" |
|||
if value.typecode == "b": |
|||
dtype = np.int8 |
|||
elif value.typecode == "f": |
|||
dtype = np.float32 |
|||
else: |
|||
dtype = np.float64 |
|||
return np.array(value, copy=False, dtype=dtype) |
|||
|
|||
def output_type_handler(self, cursor, metadata): |
|||
"""Set the type handler for the output data""" |
|||
if metadata.type_code is self.DB_TYPE_VECTOR: |
|||
return cursor.var( |
|||
metadata.type_code, |
|||
arraysize=cursor.arraysize, |
|||
outconverter=self.numpy_converter_out, |
|||
) |
|||
|
|||
def query(self, sql: str, params: dict = None) -> list: |
|||
""" |
|||
Execute a SQL query and return the results. |
|||
|
|||
Args: |
|||
sql (str): SQL query to execute. |
|||
params (dict, optional): Parameters for the SQL query. Defaults to None. |
|||
|
|||
Returns: |
|||
list: Query results as a list of row dictionaries (empty if no rows matched). |
|||
|
|||
Raises: |
|||
Exception: If there's an error executing the query. |
|||
""" |
|||
with self.client.acquire() as connection: |
|||
connection.inputtypehandler = self.input_type_handler |
|||
connection.outputtypehandler = self.output_type_handler |
|||
with connection.cursor() as cursor: |
|||
try: |
|||
if log.dev_mode: |
|||
print("sql:\n", sql) |
|||
# log.debug("def query:"+params) |
|||
# print("sql:\n",sql) |
|||
# print("params:\n",params) |
|||
cursor.execute(sql, params) |
|||
except Exception as e: |
|||
log.critical(f"Oracle database error in query: {e}") |
|||
raise |
|||
columns = [column[0].lower() for column in cursor.description] |
|||
rows = cursor.fetchall() |
|||
if rows: |
|||
data = [dict(zip(columns, row)) for row in rows] |
|||
else: |
|||
data = [] |
|||
if log.dev_mode: |
|||
print("data:\n", data) |
|||
return data |
|||
# self.client.drop(connection) |
|||
|
|||
def execute(self, sql: str, data: Union[list, dict] = None): |
|||
""" |
|||
Execute a SQL statement without returning results. |
|||
|
|||
Args: |
|||
sql (str): SQL statement to execute. |
|||
data (Union[list, dict], optional): Data for the SQL statement. Defaults to None. |
|||
|
|||
Raises: |
|||
Exception: If there's an error executing the statement. |
|||
""" |
|||
try: |
|||
with self.client.acquire() as connection: |
|||
connection.inputtypehandler = self.input_type_handler |
|||
connection.outputtypehandler = self.output_type_handler |
|||
with connection.cursor() as cursor: |
|||
# print("sql:\n",sql) |
|||
# print("data:\n",data) |
|||
if data is None: |
|||
cursor.execute(sql) |
|||
else: |
|||
cursor.execute(sql, data) |
|||
connection.commit() |
|||
except Exception as e: |
|||
log.critical(f"Oracle database error in execute: {e}") |
|||
log.error("ERROR sql:\n" + sql) |
|||
log.error("ERROR data:\n" + str(data)) |
|||
raise |
|||
|
|||
def has_collection(self, collection: str = "deepsearcher"): |
|||
""" |
|||
Check if a collection exists in the database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to check. Defaults to "deepsearcher". |
|||
|
|||
Returns: |
|||
bool: True if the collection exists, False otherwise. |
|||
""" |
|||
SQL = SQL_TEMPLATES["has_collection"] |
|||
params = {"collection": collection} |
|||
res = self.query(SQL, params) |
|||
if res: |
|||
if res[0]["rowcnt"] > 0: |
|||
return True |
|||
else: |
|||
return False |
|||
else: |
|||
return False |
|||
|
|||
def check_table(self): |
|||
""" |
|||
Check if required tables exist and create them if they don't. |
|||
|
|||
Raises: |
|||
Exception: If there's an error checking or creating tables. |
|||
""" |
|||
SQL = SQL_TEMPLATES["has_table"] |
|||
try: |
|||
res = self.query(SQL) |
|||
if len(res) < 2: |
|||
missing_table = TABLES.keys() - set([i["table_name"] for i in res]) |
|||
for table in missing_table: |
|||
self.create_tables(table) |
|||
except Exception as e: |
|||
log.critical(f"Failed to check table in Oracle database, error info: {e}") |
|||
raise |
|||
|
|||
def create_tables(self, table_name): |
|||
""" |
|||
Create a table in the database. |
|||
|
|||
Args: |
|||
table_name: Name of the table to create. |
|||
|
|||
Raises: |
|||
Exception: If there's an error creating the table. |
|||
""" |
|||
SQL = TABLES[table_name] |
|||
try: |
|||
self.execute(SQL) |
|||
log.color_print(f"Created table {table_name} in Oracle database") |
|||
except Exception as e: |
|||
log.critical(f"Failed to create table {table_name} in Oracle database, error info: {e}") |
|||
raise |
|||
|
|||
def drop_collection(self, collection: str = "deepsearcher"): |
|||
""" |
|||
Drop a collection from the database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to drop. Defaults to "deepsearcher". |
|||
|
|||
Raises: |
|||
Exception: If there's an error dropping the collection. |
|||
""" |
|||
try: |
|||
params = {"collection": collection} |
|||
SQL = SQL_TEMPLATES["drop_collection"] |
|||
self.execute(SQL, params) |
|||
|
|||
SQL = SQL_TEMPLATES["drop_collection_item"] |
|||
self.execute(SQL, params) |
|||
log.color_print(f"Collection {collection} dropped") |
|||
except Exception as e: |
|||
log.critical(f"fail to drop collection, error info: {e}") |
|||
raise |
|||
|
|||
def insertone(self, data): |
|||
""" |
|||
Insert a single record into the database. |
|||
|
|||
Args: |
|||
data: Data to insert. |
|||
""" |
|||
SQL = SQL_TEMPLATES["insert"] |
|||
self.execute(SQL, data) |
|||
log.debug("insert done!") |
|||
|
|||
def searchone( |
|||
self, |
|||
collection: Optional[str], |
|||
vector: Union[np.array, List[float]], |
|||
top_k: int = 5, |
|||
): |
|||
""" |
|||
Search for similar vectors in a collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name to search in. |
|||
vector (Union[np.array, List[float]]): Query vector for similarity search. |
|||
top_k (int, optional): Number of results to return. Defaults to 5. |
|||
|
|||
Returns: |
|||
list: List of search results. |
|||
|
|||
Raises: |
|||
Exception: If there's an error during search. |
|||
""" |
|||
log.debug("def searchone:" + collection) |
|||
try: |
|||
if isinstance(vector, list): |
|||
vector = np.array(vector) |
|||
embedding_string = "[" + ", ".join(map(str, vector.tolist())) + "]" |
|||
dimension = vector.shape[0] |
|||
dtype = str(vector.dtype).upper() |
|||
|
|||
SQL = SQL_TEMPLATES["search"].format(dimension=dimension, dtype=dtype) |
|||
max_distance = 0.8 |
|||
params = { |
|||
"collection": collection, |
|||
"embedding_string": embedding_string, |
|||
"top_k": top_k, |
|||
"max_distance": max_distance, |
|||
} |
|||
res = self.query(SQL, params) |
|||
if res: |
|||
return res |
|||
else: |
|||
return [] |
|||
except Exception as e: |
|||
log.critical(f"fail to search data, error info: {e}") |
|||
raise |
|||
|
|||
def init_collection( |
|||
self, |
|||
dim: int, |
|||
collection: Optional[str] = "deepsearcher", |
|||
description: Optional[str] = "", |
|||
force_new_collection: bool = False, |
|||
text_max_length: int = 65_535, |
|||
reference_max_length: int = 2048, |
|||
metric_type: str = "L2", |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize a collection in the database. |
|||
|
|||
Args: |
|||
dim (int): Dimension of the vector embeddings. |
|||
collection (Optional[str], optional): Collection name. Defaults to "deepsearcher". |
|||
description (Optional[str], optional): Collection description. Defaults to "". |
|||
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False. |
|||
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535. |
|||
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048. |
|||
metric_type (str, optional): Metric type for vector similarity search. Defaults to "L2". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Raises: |
|||
Exception: If there's an error initializing the collection. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
if description is None: |
|||
description = "" |
|||
try: |
|||
has_collection = self.has_collection(collection) |
|||
if force_new_collection and has_collection: |
|||
self.drop_collection(collection) |
|||
elif has_collection: |
|||
return |
|||
# insert collection info |
|||
SQL = SQL_TEMPLATES["insert_collection"] |
|||
params = {"collection": collection, "description": description} |
|||
self.execute(SQL, params) |
|||
except Exception as e: |
|||
log.critical(f"fail to init_collection for oracle, error info: {e}") |
|||
|
|||
def insert_data( |
|||
self, |
|||
collection: Optional[str], |
|||
chunks: List[Chunk], |
|||
batch_size: int = 256, |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Insert data into a collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. If None, uses default_collection. |
|||
chunks (List[Chunk]): List of Chunk objects to insert. |
|||
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Raises: |
|||
Exception: If there's an error inserting data. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
|
|||
datas = [] |
|||
for chunk in chunks: |
|||
_data = { |
|||
"embedding": self.numpy_converter_in(np.array(chunk.embedding)), |
|||
"text": chunk.text, |
|||
"reference": chunk.reference, |
|||
"metadata": json.dumps(chunk.metadata), |
|||
"collection": collection, |
|||
} |
|||
datas.append(_data) |
|||
|
|||
batch_datas = [datas[i : i + batch_size] for i in range(0, len(datas), batch_size)] |
|||
try: |
|||
for batch_data in batch_datas: |
|||
for _data in batch_data: |
|||
self.insertone(data=_data) |
|||
log.color_print(f"Successfully inserted {len(datas)} rows") |
|||
except Exception as e: |
|||
log.critical(f"fail to insert data, error info: {e}") |
|||
raise |
|||
|
|||
def search_data( |
|||
self, |
|||
collection: Optional[str], |
|||
vector: Union[np.array, List[float]], |
|||
top_k: int = 5, |
|||
*args, |
|||
**kwargs, |
|||
) -> List[RetrievalResult]: |
|||
""" |
|||
Search for similar vectors in a collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. If None, uses default_collection. |
|||
vector (Union[np.array, List[float]]): Query vector for similarity search. |
|||
top_k (int, optional): Number of results to return. Defaults to 5. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[RetrievalResult]: List of retrieval results containing similar vectors. |
|||
|
|||
Raises: |
|||
Exception: If there's an error during search. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
try: |
|||
# print("def search_data:",collection) |
|||
# print("def search_data:",type(vector)) |
|||
search_results = self.searchone(collection=collection, vector=vector, top_k=top_k) |
|||
# print("def search_data: search_results",search_results) |
|||
|
|||
return [ |
|||
RetrievalResult( |
|||
embedding=b["embedding"], |
|||
text=b["text"], |
|||
reference=b["reference"], |
|||
score=b["distance"], |
|||
metadata=json.loads(b["metadata"]), |
|||
) |
|||
for b in search_results |
|||
] |
|||
except Exception as e: |
|||
log.critical(f"fail to search data, error info: {e}") |
|||
raise |
|||
# return [] |
|||
|
|||
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]: |
|||
""" |
|||
List all collections in the database. |
|||
|
|||
Args: |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[CollectionInfo]: List of collection information objects. |
|||
""" |
|||
collection_infos = [] |
|||
try: |
|||
SQL = SQL_TEMPLATES["list_collections"] |
|||
log.debug("def list_collections:" + SQL) |
|||
collections = self.query(SQL) |
|||
if collections: |
|||
for collection in collections: |
|||
collection_infos.append( |
|||
CollectionInfo( |
|||
collection_name=collection["collection"], |
|||
description=collection["description"], |
|||
) |
|||
) |
|||
return collection_infos |
|||
except Exception as e: |
|||
log.critical(f"fail to list collections, error info: {e}") |
|||
raise |
|||
|
|||
def clear_db(self, collection: str = "deepsearcher", *args, **kwargs): |
|||
""" |
|||
Clear (drop) a collection from the database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to drop. Defaults to "deepsearcher". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
if not collection: |
|||
collection = self.default_collection |
|||
try: |
|||
self.client.drop_collection(collection) |
|||
except Exception as e: |
|||
log.warning(f"fail to clear db, error info: {e}") |
|||
raise |
|||
|
|||
|
|||
TABLES = { |
|||
"DEEPSEARCHER_COLLECTION_INFO": """CREATE TABLE DEEPSEARCHER_COLLECTION_INFO ( |
|||
id INT generated by default as identity primary key, |
|||
collection varchar(256), |
|||
description CLOB, |
|||
status NUMBER DEFAULT 1, |
|||
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, |
|||
updatetime TIMESTAMP DEFAULT NULL)""", |
|||
"DEEPSEARCHER_COLLECTION_ITEM": """CREATE TABLE DEEPSEARCHER_COLLECTION_ITEM ( |
|||
id INT generated by default as identity primary key, |
|||
collection varchar(256), |
|||
embedding VECTOR, |
|||
text CLOB, |
|||
reference varchar(4000), |
|||
metadata CLOB, |
|||
status NUMBER DEFAULT 1, |
|||
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP, |
|||
updatetime TIMESTAMP DEFAULT NULL)""", |
|||
} |
|||
|
|||
SQL_TEMPLATES = { |
|||
"has_table": f"""SELECT table_name FROM all_tables |
|||
WHERE table_name in ({",".join([f"'{k}'" for k in TABLES.keys()])})""", |
|||
"has_collection": "select count(*) as rowcnt from DEEPSEARCHER_COLLECTION_INFO where collection=:collection and status=1", |
|||
"list_collections": "select collection,description from DEEPSEARCHER_COLLECTION_INFO where status=1", |
|||
"drop_collection": "update DEEPSEARCHER_COLLECTION_INFO set status=0 where collection=:collection and status=1", |
|||
"drop_collection_item": "update DEEPSEARCHER_COLLECTION_ITEM set status=0 where collection=:collection and status=1", |
|||
"insert_collection": """INSERT INTO DEEPSEARCHER_COLLECTION_INFO (collection,description) |
|||
values (:collection,:description)""", |
|||
"insert": """INSERT INTO DEEPSEARCHER_COLLECTION_ITEM (collection,embedding,text,reference,metadata) |
|||
values (:collection,:embedding,:text,:reference,:metadata)""", |
|||
"search": """SELECT * FROM |
|||
(SELECT t.*, |
|||
VECTOR_DISTANCE(t.embedding,vector(:embedding_string,{dimension},{dtype}),COSINE) as distance |
|||
FROM DEEPSEARCHER_COLLECTION_ITEM t |
|||
JOIN DEEPSEARCHER_COLLECTION_INFO c ON t.collection=c.collection |
|||
WHERE t.collection=:collection AND t.status=1 AND c.status=1) |
|||
WHERE distance<:max_distance ORDER BY distance ASC FETCH FIRST :top_k ROWS ONLY""", |
|||
} |
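# Usage sketch (illustrative): every credential below is a placeholder for an
# Oracle database with vector support and wallet-based authentication; the
# DEEPSEARCHER_COLLECTION_INFO / DEEPSEARCHER_COLLECTION_ITEM tables are
# created automatically by check_table() on first connection.
#
#   db = OracleDB(
#       user="<user>", password="<password>", dsn="<tns-alias>",
#       config_dir="/path/to/wallet", wallet_location="/path/to/wallet",
#       wallet_password="<wallet-password>",
#   )
#   db.init_collection(dim=1536, collection="deepsearcher")
#   hits = db.search_data("deepsearcher", [0.01] * 1536, top_k=5)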
@ -0,0 +1,290 @@ |
|||
import uuid |
|||
from typing import List, Optional, Union |
|||
|
|||
import numpy as np |
|||
|
|||
from deepsearcher.loader.splitter import Chunk |
|||
from deepsearcher.utils import log |
|||
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult |
|||
|
|||
DEFAULT_COLLECTION_NAME = "deepsearcher" |
|||
|
|||
TEXT_PAYLOAD_KEY = "text" |
|||
REFERENCE_PAYLOAD_KEY = "reference" |
|||
METADATA_PAYLOAD_KEY = "metadata" |
|||
|
|||
|
|||
class Qdrant(BaseVectorDB): |
|||
"""Vector DB implementation powered by [Qdrant](https://qdrant.tech/)""" |
|||
|
|||
def __init__( |
|||
self, |
|||
location: Optional[str] = None, |
|||
url: Optional[str] = None, |
|||
port: Optional[int] = 6333, |
|||
grpc_port: int = 6334, |
|||
prefer_grpc: bool = False, |
|||
https: Optional[bool] = None, |
|||
api_key: Optional[str] = None, |
|||
prefix: Optional[str] = None, |
|||
timeout: Optional[int] = None, |
|||
host: Optional[str] = None, |
|||
path: Optional[str] = None, |
|||
default_collection: str = DEFAULT_COLLECTION_NAME, |
|||
): |
|||
""" |
|||
Initialize the Qdrant client with flexible connection options. |
|||
|
|||
Args: |
|||
location (Optional[str], optional): |
|||
- If ":memory:" - use in-memory Qdrant instance. |
|||
- If str - use it as a URL parameter. |
|||
- If None - use default values for host and port. |
|||
Defaults to None. |
|||
|
|||
url (Optional[str], optional): |
|||
URL for Qdrant service, can include scheme, host, port, and prefix. |
|||
Allows flexible connection string specification. |
|||
Defaults to None. |
|||
|
|||
port (Optional[int], optional): |
|||
Port of the REST API interface. |
|||
Defaults to 6333. |
|||
|
|||
grpc_port (int, optional): |
|||
Port of the gRPC interface. |
|||
Defaults to 6334. |
|||
|
|||
prefer_grpc (bool, optional): |
|||
If True, use gRPC interface whenever possible in custom methods. |
|||
Defaults to False. |
|||
|
|||
https (Optional[bool], optional): |
|||
If True, use HTTPS (SSL) protocol. |
|||
Defaults to None. |
|||
|
|||
api_key (Optional[str], optional): |
|||
API key for authentication in Qdrant Cloud. |
|||
Defaults to None. |
|||
|
|||
prefix (Optional[str], optional): |
|||
If not None, add prefix to the REST URL path. |
|||
Example: 'service/v1' results in 'http://localhost:6333/service/v1/{qdrant-endpoint}' |
|||
Defaults to None. |
|||
|
|||
timeout (Optional[int], optional): |
|||
Timeout for REST and gRPC API requests. |
|||
Default is 5 seconds for REST and unlimited for gRPC. |
|||
Defaults to None. |
|||
|
|||
host (Optional[str], optional): |
|||
Host name of Qdrant service. |
|||
If url and host are None, defaults to 'localhost'. |
|||
Defaults to None. |
|||
|
|||
path (Optional[str], optional): |
|||
Persistence path for QdrantLocal. |
|||
Defaults to None. |
|||
|
|||
default_collection (str, optional): |
|||
Default collection name to be used. |
|||
""" |
|||
try: |
|||
from qdrant_client import QdrantClient |
|||
except ImportError as original_error: |
|||
raise ImportError( |
|||
"Qdrant client is not installed. Install it using: pip install qdrant-client\n" |
|||
) from original_error |
|||
|
|||
super().__init__(default_collection) |
|||
self.client = QdrantClient( |
|||
location=location, |
|||
url=url, |
|||
port=port, |
|||
grpc_port=grpc_port, |
|||
prefer_grpc=prefer_grpc, |
|||
https=https, |
|||
api_key=api_key, |
|||
prefix=prefix, |
|||
timeout=timeout, |
|||
host=host, |
|||
path=path, |
|||
) |
|||
|
|||
def init_collection( |
|||
self, |
|||
dim: int, |
|||
collection: Optional[str] = None, |
|||
description: Optional[str] = "", |
|||
force_new_collection: bool = False, |
|||
text_max_length: int = 65_535, |
|||
reference_max_length: int = 2048, |
|||
distance_metric: str = "Cosine", |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Initialize a collection in Qdrant. |
|||
|
|||
Args: |
|||
dim (int): Dimension of the vector embeddings. |
|||
collection (Optional[str], optional): Collection name. |
|||
description (Optional[str], optional): Collection description. Defaults to "". |
|||
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False. |
|||
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535. |
|||
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048. |
|||
distance_metric (str, optional): Metric type for vector similarity search. Defaults to "Cosine". |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
from qdrant_client import models |
|||
|
|||
collection = collection or self.default_collection |
|||
|
|||
try: |
|||
collection_exists = self.client.collection_exists(collection_name=collection) |
|||
|
|||
if force_new_collection and collection_exists: |
|||
self.client.delete_collection(collection_name=collection) |
|||
collection_exists = False |
|||
|
|||
if not collection_exists: |
|||
self.client.create_collection( |
|||
collection_name=collection, |
|||
vectors_config=models.VectorParams(size=dim, distance=distance_metric), |
|||
*args, |
|||
**kwargs, |
|||
) |
|||
|
|||
log.color_print(f"Created collection [{collection}] successfully") |
|||
except Exception as e: |
|||
log.critical(f"Failed to init Qdrant collection, error info: {e}") |
|||
|
|||
def insert_data( |
|||
self, |
|||
collection: Optional[str], |
|||
chunks: List[Chunk], |
|||
batch_size: int = 256, |
|||
*args, |
|||
**kwargs, |
|||
): |
|||
""" |
|||
Insert data into a Qdrant collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. |
|||
chunks (List[Chunk]): List of Chunk objects to insert. |
|||
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
from qdrant_client import models |
|||
|
|||
try: |
|||
for i in range(0, len(chunks), batch_size): |
|||
batch_chunks = chunks[i : i + batch_size] |
|||
|
|||
points = [ |
|||
models.PointStruct( |
|||
id=uuid.uuid4().hex, |
|||
vector=chunk.embedding, |
|||
payload={ |
|||
TEXT_PAYLOAD_KEY: chunk.text, |
|||
REFERENCE_PAYLOAD_KEY: chunk.reference, |
|||
METADATA_PAYLOAD_KEY: chunk.metadata, |
|||
}, |
|||
) |
|||
for chunk in batch_chunks |
|||
] |
|||
|
|||
self.client.upsert( |
|||
collection_name=collection or self.default_collection, points=points |
|||
) |
|||
except Exception as e: |
|||
log.critical(f"Failed to insert data, error info: {e}") |
|||
|
|||
def search_data( |
|||
self, |
|||
collection: Optional[str], |
|||
vector: Union[np.array, List[float]], |
|||
top_k: int = 5, |
|||
*args, |
|||
**kwargs, |
|||
) -> List[RetrievalResult]: |
|||
""" |
|||
Search for similar vectors in a Qdrant collection. |
|||
|
|||
Args: |
|||
collection (Optional[str]): Collection name. If None, uses default_collection. |
|||
vector (Union[np.array, List[float]]): Query vector for similarity search. |
|||
top_k (int, optional): Number of results to return. Defaults to 5. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[RetrievalResult]: List of retrieval results containing similar vectors. |
|||
""" |
|||
try: |
|||
results = self.client.query_points( |
|||
collection_name=collection or self.default_collection, |
|||
query=vector, |
|||
limit=top_k, |
|||
with_payload=True, |
|||
with_vectors=True, |
|||
).points |
|||
|
|||
return [ |
|||
RetrievalResult( |
|||
embedding=result.vector, |
|||
text=result.payload.get(TEXT_PAYLOAD_KEY, ""), |
|||
reference=result.payload.get(REFERENCE_PAYLOAD_KEY, ""), |
|||
score=result.score, |
|||
metadata=result.payload.get(METADATA_PAYLOAD_KEY, {}), |
|||
) |
|||
for result in results |
|||
] |
|||
except Exception as e: |
|||
log.critical(f"Failed to search data, error info: {e}") |
|||
return [] |
|||
|
|||
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]: |
|||
""" |
|||
List all collections in the Qdrant database. |
|||
|
|||
Args: |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
|
|||
Returns: |
|||
List[CollectionInfo]: List of collection information objects. |
|||
""" |
|||
collection_infos = [] |
|||
|
|||
try: |
|||
collections = self.client.get_collections().collections |
|||
for collection in collections: |
|||
collection_infos.append( |
|||
CollectionInfo( |
|||
collection_name=collection.name, |
|||
# Qdrant doesn't have a native description field |
|||
description=collection.name, |
|||
) |
|||
) |
|||
except Exception as e: |
|||
log.critical(f"Failed to list collections, error info: {e}") |
|||
|
|||
return collection_infos |
|||
|
|||
def clear_db(self, collection: Optional[str] = None, *args, **kwargs): |
|||
""" |
|||
Clear (drop) a collection from the Qdrant database. |
|||
|
|||
Args: |
|||
collection (str, optional): Collection name to drop. |
|||
*args: Variable length argument list. |
|||
**kwargs: Arbitrary keyword arguments. |
|||
""" |
|||
try: |
|||
self.client.delete_collection(collection_name=collection or self.default_collection) |
|||
except Exception as e: |
|||
log.warning(f"Failed to drop collection, error info: {e}") |
@ -0,0 +1,42 @@ |
|||
# DeepSearcher Documentation |
|||
|
|||
This directory contains the documentation for DeepSearcher, powered by MkDocs. |
|||
|
|||
## Setup |
|||
|
|||
1. Install MkDocs and required plugins: |
|||
|
|||
```bash |
|||
pip install mkdocs mkdocs-material mkdocs-jupyter pymdown-extensions |
|||
``` |
|||
|
|||
2. Clone the repository: |
|||
|
|||
```bash |
|||
git clone https://github.com/zilliztech/deep-searcher.git |
|||
cd deep-searcher |
|||
``` |
|||
|
|||
## Development |
|||
|
|||
To serve the documentation locally: |
|||
|
|||
```bash |
|||
mkdocs serve |
|||
``` |
|||
|
|||
This will start a local server at http://127.0.0.1:8000/ where you can preview the documentation. |
|||
|
|||
## Building |
|||
|
|||
To build the static site: |
|||
|
|||
```bash |
|||
mkdocs build |
|||
``` |
|||
|
|||
This will generate the static site in the `site` directory. |
|||
|
|||
## Deployment |
|||
|
|||
The documentation is automatically deployed when changes are pushed to the main branch using GitHub Actions. |
@ -0,0 +1,126 @@ |
|||
# Embedding Model Configuration |
|||
|
|||
DeepSearcher supports various embedding models to convert text into vector representations for semantic search. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "(EmbeddingModelName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available Embedding Providers |
|||
|
|||
| Provider | Description | Key Features | |
|||
|----------|-------------|--------------| |
|||
| **OpenAIEmbedding** | OpenAI's text embedding models | High quality, production-ready | |
|||
| **MilvusEmbedding** | Built-in embedding models via Pymilvus | Multiple model options | |
|||
| **VoyageEmbedding** | VoyageAI embedding models | Specialized for search | |
|||
| **BedrockEmbedding** | Amazon Bedrock embedding | AWS integration | |
|||
| **GeminiEmbedding** | Google's Gemini embedding | High performance | |
|||
| **GLMEmbedding** | ChatGLM embeddings | Chinese language support | |
|||
| **OllamaEmbedding** | Local embedding with Ollama | Self-hosted option | |
|||
| **PPIOEmbedding** | PPIO cloud embedding | Scalable solution | |
|||
| **SiliconflowEmbedding** | Siliconflow's models | Enterprise support | |
|||
| **VolcengineEmbedding** | Volcengine embedding | High throughput | |
|||
| **NovitaEmbedding** | Novita AI embedding | Cost-effective | |
|||
| **SentenceTransformerEmbedding** | Sentence Transformers embedding models | Self-hosted option |
|||
| **IBM watsonx.ai** | Various options | IBM's Enterprise AI platform | |
|||
|
|||
## 🔍 Provider Examples |
|||
|
|||
### OpenAI Embedding |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"}) |
|||
``` |
|||
*Requires `OPENAI_API_KEY` environment variable* |
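One way to provide the key, mirroring the example scripts later in these docs, is to set it from Python before initializing the configuration; the value below is only a placeholder:

```python
import os

# Placeholder key: replace with your own OpenAI API key
os.environ["OPENAI_API_KEY"] = "sk-***************"
```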
|||
|
|||
### Milvus Built-in Embedding |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "BAAI/bge-base-en-v1.5"}) |
|||
``` |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "jina-embeddings-v3"}) |
|||
``` |
|||
*For Jina's embedding model, requires `JINAAI_API_KEY` environment variable* |
|||
|
|||
### VoyageAI Embedding |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "VoyageEmbedding", {"model": "voyage-3"}) |
|||
``` |
|||
*Requires `VOYAGE_API_KEY` environment variable and `pip install voyageai`* |
|||
|
|||
## 📚 Additional Providers |
|||
|
|||
??? example "Amazon Bedrock" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "BedrockEmbedding", {"model": "amazon.titan-embed-text-v2:0"}) |
|||
``` |
|||
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`* |
|||
|
|||
??? example "Novita AI" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "NovitaEmbedding", {"model": "baai/bge-m3"}) |
|||
``` |
|||
*Requires `NOVITA_API_KEY` environment variable* |
|||
|
|||
??? example "Siliconflow" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "SiliconflowEmbedding", {"model": "BAAI/bge-m3"}) |
|||
``` |
|||
*Requires `SILICONFLOW_API_KEY` environment variable* |
|||
|
|||
??? example "Volcengine" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "VolcengineEmbedding", {"model": "doubao-embedding-text-240515"}) |
|||
``` |
|||
*Requires `VOLCENGINE_API_KEY` environment variable* |
|||
|
|||
??? example "GLM" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "GLMEmbedding", {"model": "embedding-3"}) |
|||
``` |
|||
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`* |
|||
|
|||
??? example "Google Gemini" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "GeminiEmbedding", {"model": "text-embedding-004"}) |
|||
``` |
|||
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`* |
|||
|
|||
??? example "Ollama" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "OllamaEmbedding", {"model": "bge-m3"}) |
|||
``` |
|||
*Requires local Ollama installation and `pip install ollama`* |
|||
|
|||
??? example "PPIO" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "PPIOEmbedding", {"model": "baai/bge-m3"}) |
|||
``` |
|||
*Requires `PPIO_API_KEY` environment variable* |
|||
|
|||
??? example "SentenceTransformer" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "SentenceTransformerEmbedding", {"model": "BAAI/bge-large-zh-v1.5"}) |
|||
``` |
|||
*Requires `pip install sentence-transformers`* |
|||
|
|||
??? example "IBM WatsonX" |
|||
|
|||
```python |
|||
config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "ibm/slate-125m-english-rtrvr-v2"}) |
|||
``` |
|||
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables and `pip install ibm-watsonx-ai`*
@ -0,0 +1,70 @@ |
|||
# File Loader Configuration |
|||
|
|||
DeepSearcher supports various file loaders to extract and process content from different file formats. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("file_loader", "(FileLoaderName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available File Loaders |
|||
|
|||
| Loader | Description | Supported Formats | |
|||
|--------|-------------|-------------------| |
|||
| **UnstructuredLoader** | General purpose document loader with broad format support | PDF, DOCX, PPT, HTML, etc. | |
|||
| **DoclingLoader** | Document processing library with extraction capabilities | See [documentation](https://docling-project.github.io/docling/usage/supported_formats/) | |
|||
|
|||
## 🔍 File Loader Options |
|||
|
|||
### Unstructured |
|||
|
|||
[Unstructured](https://unstructured.io/) is a powerful library for extracting content from various document formats. |
|||
|
|||
```python |
|||
config.set_provider_config("file_loader", "UnstructuredLoader", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
You can use Unstructured in two ways: |
|||
|
|||
1. **With API** (recommended for production; see the sketch after this tip)
|||
- Set environment variables: |
|||
- `UNSTRUCTURED_API_KEY` |
|||
- `UNSTRUCTURED_API_URL` |
|||
|
|||
2. **Local Processing** |
|||
- Simply don't set the API environment variables |
|||
- Install required dependencies: |
|||
```bash |
|||
# Install core dependencies |
|||
pip install unstructured-ingest |
|||
|
|||
# For all document formats |
|||
pip install "unstructured[all-docs]" |
|||
|
|||
# For specific formats (e.g., PDF only) |
|||
pip install "unstructured[pdf]" |
|||
``` |
|||
|
|||
For more information: |
|||
- [Unstructured Documentation](https://docs.unstructured.io/ingestion/overview) |
|||
- [Installation Guide](https://docs.unstructured.io/open-source/installation/full-installation) |
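For the API route, a minimal sketch of providing the credentials from Python, mirroring the Unstructured example later in these docs (both values are placeholders):

```python
import os

# Placeholders: replace with your own Unstructured API credentials
os.environ["UNSTRUCTURED_API_KEY"] = "***************"
os.environ["UNSTRUCTURED_API_URL"] = "***************"
```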
|||
|
|||
### Docling |
|||
|
|||
[Docling](https://docling-project.github.io/docling/) provides document processing capabilities with support for multiple formats. |
|||
|
|||
```python |
|||
config.set_provider_config("file_loader", "DoclingLoader", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Install Docling: |
|||
```bash |
|||
pip install docling |
|||
``` |
|||
|
|||
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats). |
@ -0,0 +1,33 @@ |
|||
# Configuration Overview |
|||
|
|||
DeepSearcher provides flexible configuration options for all its components. You can customize the following aspects of the system: |
|||
|
|||
## 📋 Components |
|||
|
|||
| Component | Purpose | Documentation | |
|||
|-----------|---------|---------------| |
|||
| **LLM** | Large Language Models for query processing | [LLM Configuration](llm.md) | |
|||
| **Embedding Models** | Text embedding for vector retrieval | [Embedding Models](embedding.md) | |
|||
| **Vector Database** | Storage and retrieval of vector embeddings | [Vector Database](vector_db.md) | |
|||
| **File Loader** | Loading and processing various file formats | [File Loader](file_loader.md) | |
|||
| **Web Crawler** | Gathering information from web sources | [Web Crawler](web_crawler.md) | |
|||
|
|||
## 🔄 Configuration Method |
|||
|
|||
DeepSearcher uses a consistent configuration approach for all components: |
|||
|
|||
```python |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Create configuration |
|||
config = Configuration() |
|||
|
|||
# Set provider configurations |
|||
config.set_provider_config("[component]", "[provider]", {"option": "value"}) |
|||
|
|||
# Initialize with configuration |
|||
init_config(config=config) |
|||
``` |
|||
|
|||
For detailed configuration options for each component, please visit the corresponding documentation pages linked in the table above. |
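As a concrete sketch, the snippet below combines provider settings that appear on the individual component pages (OpenAI LLM, OpenAI embeddings, Milvus Lite); substitute the providers and options you actually use:

```python
from deepsearcher.configuration import Configuration, init_config

config = Configuration()

# Providers and options taken from the component pages linked above
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})

init_config(config=config)
```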
|||
|
@ -0,0 +1,192 @@ |
|||
# LLM Configuration |
|||
|
|||
DeepSearcher supports various Large Language Models (LLMs) for processing queries and generating responses. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "(LLMName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available LLM Providers |
|||
|
|||
| Provider | Description | Key Models | |
|||
|----------|-------------|------------| |
|||
| **OpenAI** | OpenAI's API for GPT models | o1-mini, GPT-4 | |
|||
| **DeepSeek** | DeepSeek AI offering | deepseek-reasoner, coder | |
|||
| **Anthropic** | Anthropic's Claude models | claude-sonnet-4-0 | |
|||
| **Gemini** | Google's Gemini models | gemini-1.5-pro, gemini-2.0-flash | |
|||
| **XAI** | X.AI's Grok models | grok-2-latest | |
|||
| **Ollama** | Local LLM deployment | llama3, qwq, etc. | |
|||
| **SiliconFlow** | Enterprise AI platform | deepseek-r1 | |
|||
| **TogetherAI** | Multiple model options | llama-4, deepseek | |
|||
| **PPIO** | Cloud AI infrastructure | deepseek, llama | |
|||
| **Volcengine** | ByteDance LLM platform | deepseek-r1 | |
|||
| **GLM** | ChatGLM models | glm-4-plus | |
|||
| **Bedrock** | Amazon Bedrock LLMs | anthropic.claude, ai21.j2 | |
|||
| **Novita** | Novita AI models | Various options | |
|||
| **IBM watsonx.ai** | IBM Enterprise AI platform | Various options | |
|||
|
|||
## 🔍 Provider Examples |
|||
|
|||
### OpenAI |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"}) |
|||
``` |
|||
*Requires `OPENAI_API_KEY` environment variable* |
|||
|
|||
### DeepSeek |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"}) |
|||
``` |
|||
*Requires `DEEPSEEK_API_KEY` environment variable* |
|||
|
|||
### IBM WatsonX |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"}) |
|||
``` |
|||
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables* |
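These variables can also be set from Python before initializing the configuration, in the same style as the other example scripts in these docs; all values below are placeholders:

```python
import os

# Placeholders: replace with your own watsonx.ai credentials
os.environ["WATSONX_APIKEY"] = "your-watsonx-api-key"
os.environ["WATSONX_URL"] = "your-watsonx-url"
os.environ["WATSONX_PROJECT_ID"] = "your-project-id"
```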
|||
|
|||
## 📚 Additional Providers |
|||
|
|||
??? example "DeepSeek from SiliconFlow" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "SiliconFlow", {"model": "deepseek-ai/DeepSeek-R1"}) |
|||
``` |
|||
*Requires `SILICONFLOW_API_KEY` environment variable* |
|||
|
|||
More details about SiliconFlow: [https://docs.siliconflow.cn/quickstart](https://docs.siliconflow.cn/quickstart) |
|||
|
|||
??? example "DeepSeek from TogetherAI" |
|||
|
|||
*Requires `TOGETHER_API_KEY` environment variable and `pip install together`* |
|||
|
|||
For DeepSeek R1: |
|||
```python |
|||
config.set_provider_config("llm", "TogetherAI", {"model": "deepseek-ai/DeepSeek-R1"}) |
|||
``` |
|||
|
|||
For Llama 4: |
|||
```python |
|||
config.set_provider_config("llm", "TogetherAI", {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"}) |
|||
``` |
|||
|
|||
More details about TogetherAI: [https://www.together.ai/](https://www.together.ai/) |
|||
|
|||
??? example "XAI Grok" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "XAI", {"model": "grok-2-latest"}) |
|||
``` |
|||
*Requires `XAI_API_KEY` environment variable* |
|||
|
|||
More details about XAI Grok: [https://docs.x.ai/docs/overview#featured-models](https://docs.x.ai/docs/overview#featured-models) |
|||
|
|||
??? example "Claude" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Anthropic", {"model": "claude-sonnet-4-0"}) |
|||
``` |
|||
*Requires `ANTHROPIC_API_KEY` environment variable* |
|||
|
|||
More details about Anthropic Claude: [https://docs.anthropic.com/en/home](https://docs.anthropic.com/en/home) |
|||
|
|||
??? example "Google Gemini" |
|||
|
|||
```python |
|||
config.set_provider_config('llm', 'Gemini', { 'model': 'gemini-2.0-flash' }) |
|||
``` |
|||
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`* |
|||
|
|||
More details about Gemini: [https://ai.google.dev/gemini-api/docs](https://ai.google.dev/gemini-api/docs) |
|||
|
|||
??? example "DeepSeek from PPIO" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "PPIO", {"model": "deepseek/deepseek-r1-turbo"}) |
|||
``` |
|||
*Requires `PPIO_API_KEY` environment variable* |
|||
|
|||
More details about PPIO: [https://ppinfra.com/docs/get-started/quickstart.html](https://ppinfra.com/docs/get-started/quickstart.html) |
|||
|
|||
??? example "Ollama" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Ollama", {"model": "qwq"}) |
|||
``` |
|||
|
|||
Follow [these instructions](https://github.com/jmorganca/ollama) to set up and run a local Ollama instance: |
|||
|
|||
1. [Download](https://ollama.ai/download) and install Ollama |
|||
2. View available models via the [model library](https://ollama.ai/library) |
|||
3. Pull models with `ollama pull <name-of-model>` |
|||
4. By default, Ollama has a REST API on [http://localhost:11434](http://localhost:11434) |
|||
|
|||
??? example "Volcengine" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Volcengine", {"model": "deepseek-r1-250120"}) |
|||
``` |
|||
*Requires `VOLCENGINE_API_KEY` environment variable* |
|||
|
|||
More details about Volcengine: [https://www.volcengine.com/docs/82379/1099455](https://www.volcengine.com/docs/82379/1099455) |
|||
|
|||
??? example "GLM" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "GLM", {"model": "glm-4-plus"}) |
|||
``` |
|||
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`* |
|||
|
|||
More details about GLM: [https://bigmodel.cn/dev/welcome](https://bigmodel.cn/dev/welcome) |
|||
|
|||
??? example "Amazon Bedrock" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "Bedrock", {"model": "us.deepseek.r1-v1:0"}) |
|||
``` |
|||
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`* |
|||
|
|||
More details about Amazon Bedrock: [https://docs.aws.amazon.com/bedrock/](https://docs.aws.amazon.com/bedrock/) |
|||
|
|||
??? example "Aliyun Bailian" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "OpenAI", {"model": "deepseek-r1", "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"}) |
|||
``` |
|||
*Requires `OPENAI_API_KEY` environment variable* |
|||
|
|||
More details about Aliyun Bailian models: [https://bailian.console.aliyun.com](https://bailian.console.aliyun.com) |
|||
|
|||
??? example "IBM watsonx.ai LLM" |
|||
|
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"}) |
|||
``` |
|||
|
|||
With custom parameters: |
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", { |
|||
"model": "ibm/granite-3-3-8b-instruct", |
|||
"max_new_tokens": 1000, |
|||
"temperature": 0.7, |
|||
"top_p": 0.9, |
|||
"top_k": 50 |
|||
}) |
|||
``` |
|||
|
|||
With space_id instead of project_id: |
|||
```python |
|||
config.set_provider_config("llm", "WatsonX", { |
|||
"model": "ibm/granite-3-3-8b-instruct"" |
|||
}) |
|||
``` |
|||
|
|||
*Requires `WATSONX_APIKEY` and `WATSONX_URL` environment variables (plus `WATSONX_PROJECT_ID` unless a `space_id` is supplied) and `pip install ibm-watsonx-ai`*
|||
|
|||
More details about WatsonX: [https://www.ibm.com/products/watsonx-ai/foundation-models](https://www.ibm.com/products/watsonx-ai/foundation-models) |
|||
@ -0,0 +1,52 @@ |
|||
# Vector Database Configuration |
|||
|
|||
DeepSearcher uses vector databases to store and retrieve document embeddings for efficient semantic search. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "(VectorDBName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
Currently supported vector databases: |
|||
- Milvus (including Milvus Lite and Zilliz Cloud) |
|||
|
|||
## 🔍 Milvus Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""}) |
|||
``` |
|||
|
|||
### Deployment Options |
|||
|
|||
??? example "Local Storage with Milvus Lite" |
|||
|
|||
Setting the `uri` as a local file (e.g., `./milvus.db`) automatically utilizes [Milvus Lite](https://milvus.io/docs/milvus_lite.md) to store all data in this file. This is the most convenient method for development and smaller datasets. |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""}) |
|||
``` |
|||
|
|||
??? example "Standalone Milvus Server" |
|||
|
|||
For larger datasets, you can set up a more performant Milvus server using [Docker or Kubernetes](https://milvus.io/docs/quickstart.md). In this setup, use the server URI as your `uri` parameter: |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "token": ""}) |
|||
``` |
|||
|
|||
You can also specify other connection parameters supported by Milvus, such as `user`, `password`, or `secure`.
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "user": "<username>", "password":"<password>", "secure": True, "token": ""}) |
|||
``` |
|||
|
|||
??? example "Zilliz Cloud (Managed Service)" |
|||
|
|||
[Zilliz Cloud](https://zilliz.com/cloud) provides a fully managed cloud service for Milvus. To use Zilliz Cloud, adjust the `uri` and `token` according to the [Public Endpoint and API Key](https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details): |
|||
|
|||
```python |
|||
config.set_provider_config("vector_db", "Milvus", { |
|||
"uri": "https://your-instance-id.api.gcp-us-west1.zillizcloud.com", |
|||
"token": "your_api_key" |
|||
}) |
|||
``` |
@ -0,0 +1,97 @@ |
|||
# Web Crawler Configuration |
|||
|
|||
DeepSearcher supports various web crawlers to collect data from websites for processing and indexing. |
|||
|
|||
## 📝 Basic Configuration |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "(WebCrawlerName)", "(Arguments dict)") |
|||
``` |
|||
|
|||
## 📋 Available Web Crawlers |
|||
|
|||
| Crawler | Description | Key Feature | |
|||
|---------|-------------|-------------| |
|||
| **FireCrawlCrawler** | Cloud-based web crawling service | Simple API, managed service | |
|||
| **Crawl4AICrawler** | Browser automation crawler | Full JavaScript support | |
|||
| **JinaCrawler** | Content extraction service | High accuracy parsing | |
|||
| **DoclingCrawler** | Doc processing with crawling | Multiple format support | |
|||
|
|||
## 🔍 Web Crawler Options |
|||
|
|||
### FireCrawl |
|||
|
|||
[FireCrawl](https://docs.firecrawl.dev/introduction) is a cloud-based web crawling service designed for AI applications. |
|||
|
|||
**Key features:** |
|||
- Simple API |
|||
- Managed Service |
|||
- Advanced Parsing |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "FireCrawlCrawler", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Sign up for FireCrawl and get an API key |
|||
2. Set the API key as an environment variable: |
|||
```bash |
|||
export FIRECRAWL_API_KEY="your_api_key" |
|||
``` |
|||
3. For more information, see the [FireCrawl documentation](https://docs.firecrawl.dev/introduction) |
|||
|
|||
### Crawl4AI |
|||
|
|||
[Crawl4AI](https://docs.crawl4ai.com/) is a Python package for web crawling with browser automation capabilities. |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "Crawl4AICrawler", {"browser_config": {"headless": True, "verbose": True}}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Install Crawl4AI: |
|||
```bash |
|||
pip install crawl4ai |
|||
``` |
|||
2. Run the setup command: |
|||
```bash |
|||
crawl4ai-setup |
|||
``` |
|||
3. For more information, see the [Crawl4AI documentation](https://docs.crawl4ai.com/) |
|||
|
|||
### Jina Reader |
|||
|
|||
[Jina Reader](https://jina.ai/reader/) is a service for extracting content from web pages with high accuracy. |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "JinaCrawler", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Get a Jina API key |
|||
2. Set the API key as an environment variable: |
|||
```bash |
|||
export JINA_API_TOKEN="your_api_key" |
|||
# or |
|||
export JINAAI_API_KEY="your_api_key" |
|||
``` |
|||
3. For more information, see the [Jina Reader documentation](https://jina.ai/reader/) |
|||
|
|||
### Docling Crawler |
|||
|
|||
[Docling](https://docling-project.github.io/docling/) provides web crawling capabilities alongside its document processing features. |
|||
|
|||
```python |
|||
config.set_provider_config("web_crawler", "DoclingCrawler", {}) |
|||
``` |
|||
|
|||
??? tip "Setup Instructions" |
|||
|
|||
1. Install Docling: |
|||
```bash |
|||
pip install docling |
|||
``` |
|||
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats) |
@ -0,0 +1,159 @@ |
|||
# Contributing to DeepSearcher |
|||
|
|||
We welcome contributions from everyone. This document provides guidelines to make the contribution process straightforward. |
|||
|
|||
|
|||
## Pull Request Process |
|||
|
|||
1. Fork the repository and create your branch from `master`. |
|||
2. Make your changes. |
|||
3. Run tests and linting to ensure your code meets the project's standards. |
|||
4. Update documentation if necessary. |
|||
5. Submit a pull request. |
|||
|
|||
|
|||
## Linting and Formatting |
|||
|
|||
Keeping a consistent style for code, code comments, commit messages, and PR descriptions will greatly accelerate your PR review process. |
|||
We require you to run the code linter and formatter before submitting your pull requests:
|||
|
|||
To check the coding styles: |
|||
|
|||
```shell |
|||
make lint |
|||
``` |
|||
|
|||
To fix the coding styles: |
|||
|
|||
```shell |
|||
make format |
|||
``` |
|||
Our CI pipeline also runs these checks automatically on all pull requests to ensure code quality and consistency. |
|||
|
|||
|
|||
## Development Environment Setup with uv |
|||
|
|||
DeepSearcher uses [uv](https://github.com/astral-sh/uv) as the recommended package manager. uv is a fast, reliable Python package manager and installer. The project's `pyproject.toml` is configured to work with uv, which will provide faster dependency resolution and package installation compared to traditional tools. |
|||
|
|||
### Install Project in Development Mode (aka Editable Installation)
|||
|
|||
1. Install uv if you haven't already: |
|||
Follow the [official installation instructions](https://docs.astral.sh/uv/getting-started/installation/).
|||
|
|||
2. Clone the repository and navigate to the project directory: |
|||
```shell |
|||
git clone https://github.com/zilliztech/deep-searcher.git && cd deep-searcher |
|||
``` |
|||
3. Synchronize and install dependencies: |
|||
```shell |
|||
uv sync |
|||
source .venv/bin/activate |
|||
``` |
|||
`uv sync` installs all dependencies specified in the `uv.lock` file, and `source .venv/bin/activate` activates the virtual environment.
|||
|
|||
- (Optional) To install all optional dependencies: |
|||
```shell |
|||
uv sync --all-extras --dev |
|||
``` |
|||
|
|||
- (Optional) To install specific optional dependencies: |
|||
```shell |
|||
# Take optional `ollama` dependency for example |
|||
uv sync --extra ollama |
|||
``` |
|||
For more optional dependencies, refer to the `[project.optional-dependencies]` part of `pyproject.toml` file. |
|||
|
|||
|
|||
|
|||
### Adding Dependencies |
|||
|
|||
When you need to add a new dependency to the `pyproject.toml` file, you can use the following command:
|||
|
|||
```shell |
|||
uv add <package_name> |
|||
``` |
|||
DeepSearcher uses optional dependencies to keep the default installation lightweight. Optional features can be installed using the syntax `deepsearcher[<extra>]`. To add a dependency to an optional extra, use the following command: |
|||
|
|||
```shell |
|||
uv add <package_name> --optional <extra> |
|||
``` |
|||
For more details, refer to the [official Managing dependencies documentation](https://docs.astral.sh/uv/concepts/projects/dependencies/).
|||
|
|||
### Dependencies Locking |
|||
|
|||
For development, we use lockfiles to ensure consistent dependencies. You can use |
|||
```shell |
|||
uv lock --check |
|||
``` |
|||
to verify if your lockfile is up-to-date with your project dependencies. |
|||
|
|||
When you modify or add dependencies in the project, the lockfile will be automatically updated the next time you run a uv command. You can also explicitly update the lockfile using: |
|||
```shell |
|||
uv lock |
|||
``` |
|||
|
|||
While the environment is synced automatically, it may also be explicitly synced using uv sync: |
|||
```shell |
|||
uv sync |
|||
``` |
|||
Syncing the environment manually is especially useful for ensuring your editor has the correct versions of dependencies. |
|||
|
|||
|
|||
For more detailed information about dependency locking and syncing, refer to the [official Locking and syncing documentation](https://docs.astral.sh/uv/concepts/projects/sync/).
|||
|
|||
|
|||
## Running Tests |
|||
|
|||
Before submitting your pull request, make sure to run the test suite to ensure your changes haven't introduced any regressions. |
|||
|
|||
### Installing Test Dependencies |
|||
|
|||
First, ensure you have pytest installed. If you haven't installed the development dependencies yet, you can do so with: |
|||
|
|||
```shell |
|||
uv sync --all-extras --dev |
|||
``` |
|||
|
|||
This will install all development dependencies and optional dependencies including pytest and other testing tools. |
|||
|
|||
### Running the Tests |
|||
|
|||
To run all tests in the `tests` directory: |
|||
|
|||
```shell |
|||
uv run pytest tests |
|||
``` |
|||
|
|||
For more verbose output that shows individual test results: |
|||
|
|||
```shell |
|||
uv run pytest tests -v |
|||
``` |
|||
|
|||
You can also run tests for specific directories or files. For example: |
|||
|
|||
```shell |
|||
# Run tests in a specific directory |
|||
uv run pytest tests/embedding |
|||
|
|||
# Run tests in a specific file |
|||
uv run pytest tests/embedding/test_bedrock_embedding.py |
|||
|
|||
# Run a specific test class |
|||
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding |
|||
|
|||
# Run a specific test method |
|||
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding::test_init_default |
|||
``` |
|||
|
|||
The `-v` flag (verbose mode) provides more detailed output, showing each test case and its result individually. This is particularly useful when you want to see which specific tests are passing or failing. |
|||
|
|||
|
|||
## Developer Certificate of Origin (DCO) |
|||
|
|||
All contributions require a sign-off, acknowledging the [Developer Certificate of Origin](https://developercertificate.org/). |
|||
Add a `Signed-off-by` line to your commit message (passing `-s` to `git commit` adds it automatically):
|||
|
|||
```text |
|||
Signed-off-by: Your Name <your.email@example.com> |
|||
``` |
@ -0,0 +1,65 @@ |
|||
# Basic Example |
|||
|
|||
This example demonstrates the core functionality of DeepSearcher - loading documents and performing semantic search. |
|||
|
|||
## Overview |
|||
|
|||
The script performs these steps: |
|||
|
|||
1. Configures DeepSearcher with default settings |
|||
2. Loads a PDF document about Milvus |
|||
3. Asks a question about Milvus and vector databases |
|||
4. Displays token usage information |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
|
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
|
|||
config = Configuration() # Customize your config here |
|||
init_config(config=config) |
|||
|
|||
|
|||
# You should clone the milvus docs repo to your local machine first, execute: |
|||
# git clone https://github.com/milvus-io/milvus-docs.git |
|||
# Then replace the path below with the path to the milvus-docs repo on your local machine |
|||
# import glob |
|||
# all_md_files = glob.glob('xxx/milvus-docs/site/en/**/*.md', recursive=True) |
|||
# load_from_local_files(paths_or_directory=all_md_files, collection_name="milvus_docs", collection_description="All Milvus Documents") |
|||
|
|||
# Hint: You can also load a single file; run this from the root directory of the deep-searcher project
|||
load_from_local_files( |
|||
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"), |
|||
collection_name="milvus_docs", |
|||
collection_description="All Milvus Documents", |
|||
    # force_new_collection=True,  # set to True to drop the original collection and create a new one on every run
|||
) |
|||
|
|||
question = "Write a report comparing Milvus with other vector databases." |
|||
|
|||
_, _, consumed_token = query(question, max_iter=1) |
|||
print(f"Consumed tokens: {consumed_token}") |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Make sure you have installed DeepSearcher: `pip install deepsearcher` |
|||
2. Create a data directory and add a PDF about Milvus (or use your own data) |
|||
3. Run the script: `python basic_example.py` |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Configuration**: Using the default configuration |
|||
- **Document Loading**: Loading a single PDF file |
|||
- **Querying**: Asking a complex question requiring synthesis of information |
|||
- **Token Tracking**: Monitoring token usage from the LLM |
@ -0,0 +1,101 @@ |
|||
# Docling Integration Example |
|||
|
|||
This example shows how to use Docling for loading local files and crawling web content. |
|||
|
|||
## Overview |
|||
|
|||
The script demonstrates: |
|||
|
|||
1. Configuring DeepSearcher to use Docling for both file loading and web crawling |
|||
2. Loading data from local files using Docling's document parser |
|||
3. Crawling web content from multiple sources including Markdown and PDF files |
|||
4. Querying the loaded data |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
from deepsearcher.offline_loading import load_from_local_files, load_from_website |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Suppress unnecessary logging from third-party libraries |
|||
logging.getLogger("httpx").setLevel(logging.WARNING) |
|||
|
|||
def main(): |
|||
# Step 1: Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Configure Vector Database and Docling providers |
|||
config.set_provider_config("vector_db", "Milvus", {}) |
|||
config.set_provider_config("file_loader", "DoclingLoader", {}) |
|||
config.set_provider_config("web_crawler", "DoclingCrawler", {}) |
|||
|
|||
# Apply the configuration |
|||
init_config(config) |
|||
|
|||
# Step 2a: Load data from a local file using DoclingLoader |
|||
local_file = "your_local_file_or_directory" |
|||
local_collection_name = "DoclingLocalFiles" |
|||
local_collection_description = "Milvus Documents loaded using DoclingLoader" |
|||
|
|||
print("\n=== Loading local files using DoclingLoader ===") |
|||
|
|||
try: |
|||
load_from_local_files( |
|||
paths_or_directory=local_file, |
|||
collection_name=local_collection_name, |
|||
collection_description=local_collection_description, |
|||
force_new_collection=True |
|||
) |
|||
print(f"Successfully loaded: {local_file}") |
|||
except ValueError as e: |
|||
print(f"Validation error: {str(e)}") |
|||
except Exception as e: |
|||
print(f"Error: {str(e)}") |
|||
|
|||
print("Successfully loaded all local files") |
|||
|
|||
# Step 2b: Crawl URLs using DoclingCrawler |
|||
urls = [ |
|||
# Markdown documentation files |
|||
"https://milvus.io/docs/quickstart.md", |
|||
"https://milvus.io/docs/overview.md", |
|||
# PDF example - can handle various URL formats |
|||
"https://arxiv.org/pdf/2408.09869", |
|||
] |
|||
web_collection_name = "DoclingWebCrawl" |
|||
web_collection_description = "Milvus Documentation crawled using DoclingCrawler" |
|||
|
|||
print("\n=== Crawling web pages using DoclingCrawler ===") |
|||
|
|||
load_from_website( |
|||
urls=urls, |
|||
collection_name=web_collection_name, |
|||
collection_description=web_collection_description, |
|||
force_new_collection=True |
|||
) |
|||
print("Successfully crawled all URLs") |
|||
|
|||
# Step 3: Query the loaded data |
|||
question = "What is Milvus?" |
|||
result = query(question) |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main() |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher and Docling: `pip install deepsearcher docling` |
|||
2. Replace `your_local_file_or_directory` with your actual file/directory path |
|||
3. Run the script: `python load_and_crawl_using_docling.py` |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Multiple Providers**: Configuring both file loader and web crawler to use Docling |
|||
- **Local Files**: Loading documents from your local filesystem |
|||
- **Web Crawling**: Retrieving content from multiple web URLs with different formats |
|||
- **Error Handling**: Graceful error handling for loading operations |
@ -0,0 +1,82 @@ |
|||
# FireCrawl Integration Example |
|||
|
|||
This example demonstrates how to use FireCrawl with DeepSearcher to crawl and extract content from websites. |
|||
|
|||
## Overview |
|||
|
|||
FireCrawl is a specialized web crawling service designed for AI applications. This example shows: |
|||
|
|||
1. Setting up FireCrawl with DeepSearcher |
|||
2. Configuring API keys for the service |
|||
3. Crawling a website and extracting content |
|||
4. Querying the extracted content |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
from deepsearcher.offline_loading import load_from_website |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Suppress unnecessary logging from third-party libraries |
|||
logging.getLogger("httpx").setLevel(logging.WARNING) |
|||
|
|||
# Set API keys (ensure these are set securely in real applications) |
|||
os.environ['OPENAI_API_KEY'] = 'sk-***************' |
|||
os.environ['FIRECRAWL_API_KEY'] = 'fc-***************' |
|||
|
|||
|
|||
def main(): |
|||
# Step 1: Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Set up Vector Database (Milvus) and Web Crawler (FireCrawlCrawler) |
|||
config.set_provider_config("vector_db", "Milvus", {}) |
|||
config.set_provider_config("web_crawler", "FireCrawlCrawler", {}) |
|||
|
|||
# Apply the configuration |
|||
init_config(config) |
|||
|
|||
# Step 2: Load data from a website into Milvus |
|||
website_url = "https://example.com" # Replace with your target website |
|||
collection_name = "FireCrawl" |
|||
collection_description = "All Milvus Documents" |
|||
|
|||
# crawl a single webpage |
|||
load_from_website(urls=website_url, collection_name=collection_name, collection_description=collection_description) |
|||
    # Only applicable with FireCrawl: DeepSearcher can crawl multiple webpages by setting max_depth, limit, and allow_backward_links
|||
# load_from_website(urls=website_url, max_depth=2, limit=20, allow_backward_links=True, collection_name=collection_name, collection_description=collection_description) |
|||
|
|||
# Step 3: Query the loaded data |
|||
question = "What is Milvus?" # Replace with your actual question |
|||
result = query(question) |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main() |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher: `pip install deepsearcher` |
|||
2. Sign up for a FireCrawl API key at [firecrawl.dev](https://docs.firecrawl.dev/introduction) |
|||
3. Replace the placeholder API keys with your actual keys |
|||
4. Change the `website_url` to the website you want to crawl |
|||
5. Run the script: `python load_website_using_firecrawl.py` |
|||
|
|||
## Advanced Crawling Options |
|||
|
|||
FireCrawl provides several advanced options for crawling (a usage sketch follows this list):
|||
|
|||
- `max_depth`: Control how many links deep the crawler should go |
|||
- `limit`: Set a maximum number of pages to crawl |
|||
- `allow_backward_links`: Allow the crawler to navigate to parent/sibling pages |
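A minimal sketch of passing these options to `load_from_website`, based on the commented call in the example above; the URL and limits are placeholders to adjust for your target site:

```python
from deepsearcher.offline_loading import load_from_website

load_from_website(
    urls="https://example.com",      # placeholder starting URL
    max_depth=2,                     # follow links up to two levels deep
    limit=20,                        # stop after 20 pages
    allow_backward_links=True,       # also allow parent/sibling pages
    collection_name="FireCrawl",
    collection_description="All Milvus Documents",
)
```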
|||
|
|||
## Key Concepts |
|||
|
|||
- **Web Crawling**: Extracting content from websites |
|||
- **Depth Control**: Managing how deep the crawler navigates |
|||
- **URL Processing**: Handling multiple pages from a single starting point |
|||
- **Vector Storage**: Storing the crawled content in a vector database for search |
@ -0,0 +1,15 @@ |
|||
# Usage Examples |
|||
|
|||
DeepSearcher provides several example scripts to help you get started quickly. These examples demonstrate different ways to use DeepSearcher for various use cases. |
|||
|
|||
## 📋 Available Examples |
|||
|
|||
| Example | Description | Key Features | |
|||
|---------|-------------|--------------| |
|||
| [Basic Example](basic_example.md) | Simple example showing core functionality | Loading PDFs, querying | |
|||
| [Docling Integration](docling.md) | Using Docling for file loading and web crawling | Multiple sources, local and web | |
|||
| [Unstructured Integration](unstructured.md) | Using Unstructured for parsing documents | API and local processing | |
|||
| [FireCrawl Integration](firecrawl.md) | Web crawling with FireCrawl | Website data extraction | |
|||
| [Oracle Setup](oracle.md) | Advanced configuration with Oracle | Path setup, token tracking | |
|||
|
|||
Click on any example to see detailed code and explanations. |
@ -0,0 +1,70 @@ |
|||
# Oracle Example |
|||
|
|||
This example demonstrates an advanced setup using path manipulation and detailed token tracking. |
|||
|
|||
## Overview |
|||
|
|||
This example shows: |
|||
|
|||
1. Setting up Python path for importing from the parent directory |
|||
2. Initializing DeepSearcher with default configuration |
|||
3. Loading a PDF document and creating a vector database |
|||
4. Performing a complex query with full result and token tracking |
|||
5. Optional token consumption monitoring |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import sys, os |
|||
from pathlib import Path |
|||
script_directory = Path(__file__).resolve().parent.parent |
|||
sys.path.append(os.path.abspath(script_directory)) |
|||
|
|||
import logging |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
|
|||
# Customize your config here |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
config = Configuration() |
|||
init_config(config=config) |
|||
|
|||
# Load your local data |
|||
# Hint: You can load from a directory or a single file; run this from the root directory of the deep-searcher project
|||
|
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
|
|||
load_from_local_files( |
|||
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"), |
|||
collection_name="milvus_docs", |
|||
collection_description="All Milvus Documents", |
|||
    # force_new_collection=True,  # set to True to drop the original collection and create a new one on every run
|||
) |
|||
|
|||
# Query |
|||
from deepsearcher.online_query import query |
|||
|
|||
question = 'Write a report comparing Milvus with other vector databases.' |
|||
answer, retrieved_results, consumed_token = query(question) |
|||
print(answer) |
|||
|
|||
# Get consumed tokens: roughly 25,000 to 30,000 tokens when using the OpenAI gpt-4o model
|||
# print(f"Consumed tokens: {consumed_token}") |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher: `pip install deepsearcher` |
|||
2. Make sure you have the data directory with "WhatisMilvus.pdf" (or change the path) |
|||
3. Run the script: `python basic_example_oracle.py` |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Path Management**: Setting up Python path to import from parent directory |
|||
- **Query Unpacking**: Getting full result details (answer, retrieved context, and tokens) |
|||
- **Complex Querying**: Asking for a comparative analysis that requires synthesis |
|||
- **Token Economy**: Monitoring token usage for cost optimization |
@ -0,0 +1,76 @@ |
|||
# Unstructured Integration Example |
|||
|
|||
This example demonstrates how to use the Unstructured library with DeepSearcher for advanced document parsing. |
|||
|
|||
## Overview |
|||
|
|||
Unstructured is a powerful document processing library that can extract content from various document formats. This example shows: |
|||
|
|||
1. Setting up Unstructured with DeepSearcher |
|||
2. Configuring the Unstructured API keys (optional) |
|||
3. Loading documents with Unstructured's parser |
|||
4. Querying the extracted content |
|||
|
|||
## Code Example |
|||
|
|||
```python |
|||
import logging |
|||
import os |
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
# Suppress unnecessary logging from third-party libraries |
|||
logging.getLogger("httpx").setLevel(logging.WARNING) |
|||
|
|||
# (Optional) Set API keys (ensure these are set securely in real applications) |
|||
os.environ['UNSTRUCTURED_API_KEY'] = '***************' |
|||
os.environ['UNSTRUCTURED_API_URL'] = '***************' |
|||
|
|||
|
|||
def main(): |
|||
# Step 1: Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Configure Vector Database (Milvus) and File Loader (UnstructuredLoader) |
|||
config.set_provider_config("vector_db", "Milvus", {}) |
|||
config.set_provider_config("file_loader", "UnstructuredLoader", {}) |
|||
|
|||
# Apply the configuration |
|||
init_config(config) |
|||
|
|||
# Step 2: Load data from a local file or directory into Milvus |
|||
input_file = "your_local_file_or_directory" # Replace with your actual file path |
|||
collection_name = "Unstructured" |
|||
collection_description = "All Milvus Documents" |
|||
|
|||
load_from_local_files(paths_or_directory=input_file, collection_name=collection_name, collection_description=collection_description) |
|||
|
|||
# Step 3: Query the loaded data |
|||
question = "What is Milvus?" # Replace with your actual question |
|||
result = query(question) |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main() |
|||
``` |
|||
|
|||
## Running the Example |
|||
|
|||
1. Install DeepSearcher with Unstructured support: `pip install deepsearcher "unstructured[all-docs]"` |
|||
2. (Optional) Sign up for the Unstructured API at [unstructured.io](https://unstructured.io) if you want to use their cloud service |
|||
3. Replace `your_local_file_or_directory` with your own document file path or directory |
|||
4. Run the script: `python load_local_file_using_unstructured.py` |
|||
|
|||
## Unstructured Options |
|||
|
|||
You can use Unstructured in two modes: |
|||
|
|||
1. **API Mode**: Set the environment variables `UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_API_URL` to use their cloud service |
|||
2. **Local Mode**: Don't set the environment variables, and Unstructured will process documents locally on your machine |
|||
|
|||
## Key Concepts |
|||
|
|||
- **Document Processing**: Advanced document parsing for various formats |
|||
- **API/Local Options**: Flexibility in deployment based on your needs |
|||
- **Integration**: Seamless integration with DeepSearcher's vector database and query capabilities |
@ -0,0 +1,73 @@ |
|||
# Frequently Asked Questions |
|||
|
|||
## 🔍 Common Issues and Solutions |
|||
|
|||
--- |
|||
|
|||
### 💬 Q1: Why am I failing to parse LLM output format / How to select the right LLM? |
|||
|
|||
<div class="faq-answer"> |
|||
<p><strong>Solution:</strong> Small language models often struggle to follow prompts and generate responses in the expected format. For better results, we recommend using large reasoning models such as:</p> |
|||
|
|||
<ul> |
|||
<li>DeepSeek-R1 671B</li> |
|||
<li>OpenAI o-series models</li> |
|||
<li>Claude 3.7 Sonnet</li> |
|||
</ul> |
|||
|
|||
<p>These models provide superior reasoning capabilities and are more likely to produce correctly formatted outputs.</p> |
|||
</div> |
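Switching to one of these models is a one-line provider change; for example, using the DeepSeek reasoning model shown on the LLM configuration page (requires the `DEEPSEEK_API_KEY` environment variable):

```python
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})
```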
|||
|
|||
--- |
|||
|
|||
### 🌐 Q2: "We couldn't connect to 'https://huggingface.co'" error |
|||
|
|||
<div class="faq-answer"> |
|||
<p><strong>Error Message:</strong></p> |
|||
<div class="error-message"> |
|||
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like GPTCache/paraphrase-albert-small-v2 is not the path to a directory containing a file named config.json. |
|||
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. |
|||
</div> |
|||
|
|||
<p><strong>Solution:</strong> This issue is typically caused by network access problems to Hugging Face. Try these solutions:</p> |
|||
|
|||
<details> |
|||
<summary><strong>Network Issue? Try Using a Mirror</strong></summary> |
|||
|
|||
```bash |
|||
export HF_ENDPOINT=https://hf-mirror.com |
|||
``` |
|||
</details> |
|||
|
|||
<details> |
|||
<summary><strong>Permission Issue? Set Up a Personal Token</strong></summary> |
|||
|
|||
```bash |
|||
export HUGGING_FACE_HUB_TOKEN=xxxx |
|||
``` |
|||
</details> |
|||
</div> |
|||
|
|||
--- |
|||
|
|||
### 📓 Q3: DeepSearcher doesn't run in Jupyter notebook |
|||
|
|||
<div class="faq-answer"> |
|||
<p><strong>Solution:</strong> This is a common issue with asyncio in Jupyter notebooks. Install <code>nest_asyncio</code> and add the following code to the top of your notebook:</p> |
|||
|
|||
<div class="code-steps"> |
|||
<p><strong>Step 1:</strong> Install the required package</p> |
|||
|
|||
```bash |
|||
pip install nest_asyncio |
|||
``` |
|||
|
|||
<p><strong>Step 2:</strong> Add these lines to the beginning of your notebook</p> |
|||
|
|||
```python |
|||
import nest_asyncio |
|||
nest_asyncio.apply() |
|||
``` |
|||
</div> |
|||
</div> |
|||
@ -0,0 +1,8 @@ |
|||
# Future Plans |
|||
|
|||
- Enhance web crawling functionality |
|||
- Support more vector databases (e.g., FAISS...) |
|||
- Add support for additional large models |
|||
- Provide RESTful API interface (**DONE**) |
|||
|
|||
We welcome contributions! Star & Fork the project and help us build a more powerful DeepSearcher! 🎯 |
@ -0,0 +1,45 @@ |
|||
# 🔍 DeepSearcher |
|||
|
|||
 |
|||
|
|||
<div align="center"> |
|||
|
|||
<a href="https://opensource.org/licenses/Apache-2.0"> |
|||
<img height="28" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg?style=flat" alt="License"> |
|||
</a> |
|||
<a href="https://twitter.com/zilliz_universe"> |
|||
<img height="28" src="https://img.shields.io/badge/Follow-%40Zilliz-1DA1F2?style=flat&logo=twitter" alt="Twitter"> |
|||
</a> |
|||
<a href="https://discord.gg/mKc3R95yE5"> |
|||
<img height="28" src="https://img.shields.io/badge/Discord-Join%20Chat-5865F2?style=flat&logo=discord&logoColor=white" alt="Discord"> |
|||
</a> |
|||
|
|||
</div> |
|||
|
|||
|
|||
--- |
|||
|
|||
## ✨ Overview |
|||
|
|||
DeepSearcher combines cutting-edge LLMs (OpenAI o1, o3-mini, DeepSeek, Grok 3, Claude 4 Sonnet, Llama 4, QwQ, etc.) and Vector Databases (Milvus, Zilliz Cloud etc.) to perform search, evaluation, and reasoning based on private data, providing highly accurate answers and comprehensive reports. |
|||
|
|||
> **Perfect for:** Enterprise knowledge management, intelligent Q&A systems, and information retrieval scenarios. |
|||
|
|||
|
|||
 |
|||
|
|||
|
|||
## 🚀 Key Features |
|||
|
|||
| Feature | Description | |
|||
|---------|-------------| |
|||
| 🔒 **Private Data Search** | Maximizes utilization of enterprise internal data while ensuring data security. When necessary, integrates online content for more accurate answers. | |
|||
| 🗄️ **Vector Database Management** | Supports Milvus and other vector databases, allowing data partitioning for efficient retrieval. | |
|||
| 🧩 **Flexible Embedding Options** | Compatible with multiple embedding models for optimal selection based on your needs. | |
|||
| 🤖 **Multiple LLM Support** | Supports DeepSeek, OpenAI, and other large models for intelligent Q&A and content generation. | |
|||
| 📄 **Document Loader** | Supports local file loading, with web crawling capabilities under development. | |
|||
|
|||
## 🎬 Demo |
|||
|
|||
 |
|||
|
@ -0,0 +1,64 @@ |
|||
# 🛠️ Development Mode Installation |
|||
|
|||
This guide is for contributors who want to modify DeepSearcher's code or develop new features. |
|||
|
|||
## 📋 Prerequisites |
|||
|
|||
- Python 3.10 or higher |
|||
- git |
|||
- [uv](https://github.com/astral-sh/uv) package manager (recommended for faster installation) |
|||
|
|||
## 🔄 Installation Steps |
|||
|
|||
### Step 1: Install uv (Recommended) |
|||
|
|||
[uv](https://github.com/astral-sh/uv) is a faster alternative to pip for Python package management. |
|||
|
|||
=== "Using pip" |
|||
```bash |
|||
pip install uv |
|||
``` |
|||
|
|||
=== "Using curl (Unix/macOS)" |
|||
```bash |
|||
curl -LsSf https://astral.sh/uv/install.sh | sh |
|||
``` |
|||
|
|||
=== "Using PowerShell (Windows)" |
|||
```powershell |
|||
irm https://astral.sh/uv/install.ps1 | iex |
|||
``` |
|||
|
|||
For more options, see the [official uv installation guide](https://docs.astral.sh/uv/getting-started/installation/). |
|||
|
|||
### Step 2: Clone the repository |
|||
|
|||
```bash |
|||
git clone https://github.com/zilliztech/deep-searcher.git |
|||
cd deep-searcher |
|||
``` |
|||
|
|||
### Step 3: Set up the development environment |
|||
|
|||
=== "Using uv (Recommended)" |
|||
```bash |
|||
uv sync |
|||
source .venv/bin/activate |
|||
``` |
|||
|
|||
=== "Using pip" |
|||
```bash |
|||
python -m venv .venv |
|||
source .venv/bin/activate # On Windows: .venv\Scripts\activate |
|||
pip install -e ".[dev,all]" |
|||
``` |
|||
|
|||
## 🧪 Running Tests |
|||
|
|||
```bash |
|||
pytest tests/ |
|||
``` |
|||
|
|||
## 📚 Additional Resources |
|||
|
|||
For more detailed development setup instructions, including contribution guidelines, code style, and testing procedures, please refer to the [CONTRIBUTING.md](https://github.com/zilliztech/deep-searcher/blob/main/CONTRIBUTING.md) file in the repository. |
@ -0,0 +1,29 @@ |
|||
# 🔧 Installation |
|||
|
|||
DeepSearcher offers multiple installation methods to suit different user needs. |
|||
|
|||
## 📋 Installation Options |
|||
|
|||
| Method | Best For | Description | |
|||
|--------|----------|-------------| |
|||
| [📦 Installation via pip](pip.md) | Most users | Quick and easy installation using pip package manager | |
|||
| [🛠️ Development mode](development.md) | Contributors | Setup for those who want to modify the code or contribute | |
|||
|
|||
## 🚀 Quick Start |
|||
|
|||
Once installed, you can verify your installation: |
|||
|
|||
```python |
|||
from deepsearcher.configuration import Configuration |
|||
from deepsearcher.online_query import query |
|||
|
|||
# Initialize with default configuration |
|||
config = Configuration() |
|||
print("DeepSearcher installed successfully!") |
|||
``` |
|||
|
|||
## 💻 System Requirements |
|||
|
|||
- Python 3.10 or higher |
|||
- 4GB RAM minimum (8GB+ recommended) |
|||
- Internet connection for downloading models and dependencies |
@ -0,0 +1,52 @@ |
|||
# 📦 Installation via pip |
|||
|
|||
This method is recommended for most users who want to use DeepSearcher without modifying its source code. |
|||
|
|||
## 📋 Prerequisites |
|||
|
|||
- Python 3.10 or higher |
|||
- pip package manager (included with Python) |
|||
- Virtual environment tool (recommended) |
|||
|
|||
## 🔄 Step-by-Step Installation |
|||
|
|||
### Step 1: Create a virtual environment |
|||
|
|||
```bash |
|||
python -m venv .venv |
|||
``` |
|||
|
|||
### Step 2: Activate the virtual environment |
|||
|
|||
=== "Linux/macOS" |
|||
```bash |
|||
source .venv/bin/activate |
|||
``` |
|||
|
|||
=== "Windows" |
|||
```bash |
|||
.venv\Scripts\activate |
|||
``` |
|||
|
|||
### Step 3: Install DeepSearcher |
|||
|
|||
```bash |
|||
pip install deepsearcher |
|||
``` |
|||
|
|||
## 🧩 Optional Dependencies |
|||
|
|||
DeepSearcher supports various integrations through optional dependencies. |
|||
|
|||
| Integration | Command | Description | |
|||
|-------------|---------|-------------| |
|||
| Ollama | `pip install "deepsearcher[ollama]"` | For local LLM deployment | |
|||
| All extras | `pip install "deepsearcher[all]"` | Installs all optional dependencies | |
|||
|
|||
## ✅ Verify Installation |
|||
|
|||
```python |
|||
# Simple verification |
|||
from deepsearcher import __version__ |
|||
print(f"DeepSearcher version: {__version__}") |
|||
``` |
@ -0,0 +1,75 @@ |
|||
# Module Support |
|||
|
|||
DeepSearcher supports various integration modules including embedding models, large language models, document loaders and vector databases. |
|||
|
|||
## 📊 Overview |
|||
|
|||
| Module Type | Count | Description | |
|||
|-------------|-------|-------------| |
|||
| [Embedding Models](#embedding-models) | 7+ | Text vectorization tools | |
|||
| [Large Language Models](#llm-support) | 11+ | Query processing and text generation | |
|||
| [Document Loaders](#document-loader) | 5+ | Parse and process documents in various formats | |
|||
| [Vector Databases](#vector-database-support) | 2+ | Store and retrieve vector data | |
|||
|
|||
## 🔢 Embedding Models {#embedding-models} |
|||
|
|||
Support for various embedding models to convert text into vector representations for semantic search. |
|||
|
|||
| Provider | Required Environment Variables | Features | |
|||
|----------|--------------------------------|---------| |
|||
| **[Open-source models](https://milvus.io/docs/embeddings.md)** | None | Locally runnable open-source models | |
|||
| **[OpenAI](https://platform.openai.com/docs/guides/embeddings/use-cases)** | `OPENAI_API_KEY` | High-quality embeddings, easy to use | |
|||
| **[VoyageAI](https://docs.voyageai.com/embeddings/)** | `VOYAGE_API_KEY` | Embeddings optimized for retrieval | |
|||
| **[Amazon Bedrock](https://docs.aws.amazon.com/bedrock/)** | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` | AWS integration, enterprise-grade | |
|||
| **[FastEmbed](https://qdrant.github.io/fastembed/)** | None | Fast lightweight embeddings | |
|||
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Flexible cloud embeddings | |
|||
| **[Novita AI](https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings)** | `NOVITA_API_KEY` | Rich model selection | |
|||
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmembedding)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform | |
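
As an illustration, the embedding provider can be selected programmatically before initializing the global configuration. The sketch below assumes the OpenAI provider with the `text-embedding-ada-002` model used elsewhere in these docs; for any other provider in the table, substitute its name and make sure the listed environment variables are set.

```python
from deepsearcher.configuration import Configuration, init_config

# Assumes OPENAI_API_KEY is already set in your environment
config = Configuration()
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"})
init_config(config=config)
```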
|||
|
|||
## 🧠 Large Language Models {#llm-support} |
|||
|
|||
Support for various large language models (LLMs) to process queries and generate responses. |
|||
|
|||
| Provider | Required Environment Variables | Features | |
|||
|----------|--------------------------------|---------| |
|||
| **[OpenAI](https://platform.openai.com/docs/models)** | `OPENAI_API_KEY` | GPT model family | |
|||
| **[DeepSeek](https://api-docs.deepseek.com/)** | `DEEPSEEK_API_KEY` | Powerful reasoning capabilities | |
|||
| **[XAI Grok](https://x.ai/blog/grok-3)** | `XAI_API_KEY` | Real-time knowledge and humor | |
|||
| **[Anthropic Claude](https://docs.anthropic.com/en/home)** | `ANTHROPIC_API_KEY` | Excellent long-context understanding | |
|||
| **[SiliconFlow](https://docs.siliconflow.cn/en/userguide/introduction)** | `SILICONFLOW_API_KEY` | Enterprise inference service | |
|||
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Diverse model support | |
|||
| **[TogetherAI](https://docs.together.ai/docs/introduction)** | `TOGETHER_API_KEY` | Wide range of open-source models | |
|||
| **[Google Gemini](https://ai.google.dev/gemini-api/docs)** | `GEMINI_API_KEY` | Google's multimodal models | |
|||
| **[SambaNova](https://sambanova.ai/)** | `SAMBANOVA_API_KEY` | High-performance AI platform |
|||
| **[Ollama](https://ollama.com/)** | None | Local LLM deployment | |
|||
| **[Novita AI](https://novita.ai/docs/guides/introduction)** | `NOVITA_API_KEY` | Diverse AI services | |
|||
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmfm)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform | |
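
The LLM provider is configured the same way. A minimal sketch, assuming the OpenAI provider with the `o1-mini` model from the Quick Start; other providers require the environment variables listed in the table above.

```python
from deepsearcher.configuration import Configuration, init_config

# Assumes OPENAI_API_KEY is already set in your environment
config = Configuration()
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
init_config(config=config)
```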
|||
|
|||
## 📄 Document Loader {#document-loader} |
|||
|
|||
Support for loading and processing documents from various sources. |
|||
|
|||
### Local File Loaders |
|||
|
|||
| Loader | Supported Formats | Required Environment Variables | |
|||
|--------|-------------------|--------------------------------| |
|||
| **Built-in Loader** | PDF, TXT, MD | None | |
|||
| **[Unstructured](https://unstructured.io/)** | Multiple document formats | `UNSTRUCTURED_API_KEY`, `UNSTRUCTURED_URL` (optional) | |
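
Local files are ingested with `load_from_local_files`; which loader parses them is determined by the `file_loader` section of your configuration. A minimal sketch, using a placeholder path:

```python
from deepsearcher.offline_loading import load_from_local_files

# Placeholder path: point this at your own PDF, TXT, or MD file (or a directory)
load_from_local_files(paths_or_directory="/path/to/your/file.pdf")
```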
|||
|
|||
### Web Crawlers |
|||
|
|||
| Crawler | Description | Required Environment Variables/Setup | |
|||
|---------|-------------|--------------------------------------| |
|||
| **[FireCrawl](https://docs.firecrawl.dev/introduction)** | Crawler designed for AI applications | `FIRECRAWL_API_KEY` | |
|||
| **[Jina Reader](https://jina.ai/reader/)** | High-accuracy web content extraction | `JINA_API_TOKEN` | |
|||
| **[Crawl4AI](https://docs.crawl4ai.com/)** | Browser automation crawler | Run `crawl4ai-setup` for first-time use | |
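
Web pages are ingested with `load_from_website`, using whichever crawler is configured. A minimal sketch, assuming the FireCrawl crawler and therefore a `FIRECRAWL_API_KEY` in the environment:

```python
from deepsearcher.offline_loading import load_from_website

# Requires FIRECRAWL_API_KEY (or the setup step for your configured crawler)
load_from_website(urls="https://www.wikiwand.com/en/articles/DeepSeek")
```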
|||
|
|||
## 💾 Vector Database Support {#vector-database-support} |
|||
|
|||
Support for various vector databases for efficient storage and retrieval of embeddings. |
|||
|
|||
| Database | Description | Features | |
|||
|----------|-------------|----------| |
|||
| **[Milvus](https://milvus.io/)** | Open-source vector database | High-performance, scalable | |
|||
| **[Zilliz Cloud](https://www.zilliz.com/)** | Managed Milvus service | Fully managed, maintenance-free | |
|||
| **[Qdrant](https://qdrant.tech/)** | Vector similarity search engine | Simple, efficient | |
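
The vector database is selected through the same provider mechanism. A minimal sketch for a local Milvus Lite instance; the `uri`, `token`, and `default_collection` values mirror the defaults in the project's `config.yaml` and should be adapted to your deployment.

```python
from deepsearcher.configuration import Configuration, init_config

config = Configuration()
config.set_provider_config("vector_db", "Milvus", {
    "default_collection": "deepsearcher",
    "uri": "./milvus.db",       # local Milvus Lite file; use a server URI for a full deployment
    "token": "root:Milvus",
})
init_config(config=config)
```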
@ -0,0 +1,78 @@ |
|||
/* Add your custom CSS here */ |
|||
|
|||
/* FAQ Styling */ |
|||
.faq-answer { |
|||
background-color: #f8f9fa; |
|||
border-left: 4px solid #5c6bc0; |
|||
padding: 15px 20px; |
|||
margin-bottom: 20px; |
|||
border-radius: 4px; |
|||
} |
|||
|
|||
.error-message { |
|||
background-color: #ffebee; |
|||
border-left: 4px solid #f44336; |
|||
padding: 10px 15px; |
|||
margin: 10px 0; |
|||
font-family: monospace; |
|||
white-space: pre-wrap; |
|||
font-size: 0.9em; |
|||
border-radius: 4px; |
|||
} |
|||
|
|||
.code-steps { |
|||
margin: 15px 0; |
|||
} |
|||
|
|||
.code-steps p { |
|||
margin-bottom: 5px; |
|||
} |
|||
|
|||
details { |
|||
margin-bottom: 10px; |
|||
padding: 10px; |
|||
background-color: #e3f2fd; |
|||
border-radius: 4px; |
|||
} |
|||
|
|||
summary { |
|||
cursor: pointer; |
|||
padding: 8px 0; |
|||
} |
|||
|
|||
details[open] summary { |
|||
margin-bottom: 10px; |
|||
} |
|||
|
|||
h3 { |
|||
margin-top: 30px; |
|||
margin-bottom: 15px; |
|||
} |
|||
|
|||
/* Add smooth transition for collapsible sections */ |
|||
details summary { |
|||
transition: margin 0.3s ease; |
|||
} |
|||
|
|||
/* Styling for code blocks within FAQ */ |
|||
.faq-answer pre { |
|||
border-radius: 4px; |
|||
margin: 10px 0; |
|||
} |
|||
|
|||
/* Add styling for list items */ |
|||
.faq-answer ul { |
|||
padding-left: 25px; |
|||
} |
|||
|
|||
.faq-answer ul li { |
|||
margin: 5px 0; |
|||
} |
|||
|
|||
/* Add horizontal rule styling */ |
|||
hr { |
|||
border: 0; |
|||
height: 1px; |
|||
background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.1), rgba(0, 0, 0, 0)); |
|||
margin: 25px 0; |
|||
} |
@ -0,0 +1,63 @@ |
|||
# 💻 Command Line Interface |
|||
|
|||
DeepSearcher provides a convenient command line interface for loading data and querying. |
|||
|
|||
## 📥 Loading Data |
|||
|
|||
Load data from files or URLs: |
|||
|
|||
```shell |
|||
deepsearcher load "your_local_path_or_url" |
|||
``` |
|||
|
|||
Load into a specific collection: |
|||
|
|||
```shell |
|||
deepsearcher load "your_local_path_or_url" --collection_name "your_collection_name" --collection_desc "your_collection_description" |
|||
``` |
|||
|
|||
### Examples |
|||
|
|||
#### Loading from local files: |
|||
|
|||
```shell |
|||
# Load a single file |
|||
deepsearcher load "/path/to/your/local/file.pdf" |
|||
|
|||
# Load multiple files at once |
|||
deepsearcher load "/path/to/your/local/file1.pdf" "/path/to/your/local/file2.md" |
|||
``` |
|||
|
|||
#### Loading from URL: |
|||
|
|||
> **Note:** Set `FIRECRAWL_API_KEY` in your environment variables. See [FireCrawl documentation](https://docs.firecrawl.dev/introduction) for more details. |
|||
|
|||
```shell |
|||
deepsearcher load "https://www.wikiwand.com/en/articles/DeepSeek" |
|||
``` |
|||
|
|||
## 🔍 Querying Data |
|||
|
|||
Query your loaded data: |
|||
|
|||
```shell |
|||
deepsearcher query "Write a report about xxx." |
|||
``` |
|||
|
|||
## ❓ Help Commands |
|||
|
|||
Get general help information: |
|||
|
|||
```shell |
|||
deepsearcher --help |
|||
``` |
|||
|
|||
Get help for specific subcommands: |
|||
|
|||
```shell |
|||
# Help for load command |
|||
deepsearcher load --help |
|||
|
|||
# Help for query command |
|||
deepsearcher query --help |
|||
``` |
@ -0,0 +1,73 @@ |
|||
# 🌐 Deployment |
|||
|
|||
This guide explains how to deploy DeepSearcher as a web service. |
|||
|
|||
## ⚙️ Configure Modules |
|||
|
|||
You can configure all arguments by modifying the configuration file: |
|||
|
|||
```yaml |
|||
# config.yaml - https://github.com/zilliztech/deep-searcher/blob/main/config.yaml |
|||
llm: |
|||
provider: "OpenAI" |
|||
api_key: "your_openai_api_key_here" |
|||
# Additional configuration options... |
|||
``` |
|||
|
|||
> **Important:** Set your `OPENAI_API_KEY` in the `llm` section of the YAML file. |
|||
|
|||
## 🚀 Start Service |
|||
|
|||
The main script will run a FastAPI service with default address `localhost:8000`: |
|||
|
|||
```shell |
|||
$ python main.py |
|||
``` |
|||
|
|||
Once started, you should see output indicating the service is running successfully. |
|||
|
|||
## 🔍 Access via Browser |
|||
|
|||
You can access the web service through your browser: |
|||
|
|||
1. Open your browser and navigate to [http://localhost:8000/docs](http://localhost:8000/docs) |
|||
2. The Swagger UI will display all available API endpoints |
|||
3. Click the "Try it out" button on any endpoint to interact with it |
|||
4. Fill in the required parameters and execute the request |
|||
|
|||
This interactive documentation makes it easy to test and use all DeepSearcher API functionality. |
|||
|
|||
## 🐳 Docker Deployment |
|||
|
|||
You can also deploy DeepSearcher using Docker for easier environment setup and management. |
|||
|
|||
### Build Docker Image |
|||
|
|||
To build the Docker image, run the following command from the project root directory: |
|||
|
|||
```shell |
|||
docker build -t deepsearcher:latest . |
|||
``` |
|||
|
|||
This command builds a Docker image using the Dockerfile in the current directory and tags it as `deepsearcher:latest`. |
|||
|
|||
### Run Docker Container |
|||
|
|||
Once the image is built, you can run it as a container: |
|||
|
|||
```shell |
|||
docker run -p 8000:8000 \ |
|||
-e OPENAI_API_KEY=your_openai_api_key \ |
|||
-v $(pwd)/data:/app/data \ |
|||
-v $(pwd)/logs:/app/logs \ |
|||
-v $(pwd)/deepsearcher/config.yaml:/app/deepsearcher/config.yaml \ |
|||
deepsearcher:latest |
|||
``` |
|||
|
|||
This command: |
|||
- Maps port 8000 from the container to port 8000 on your host |
|||
- Sets the `OPENAI_API_KEY` environment variable |
|||
- Mounts the local `data`, `logs`, and configuration file to the container |
|||
- Runs the previously built `deepsearcher:latest` image |
|||
|
|||
> **Note:** Replace `your_openai_api_key` with your actual OpenAI API key, or set any other environment variables required for your configuration. |
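
If you prefer a declarative setup, the same container can be described with Docker Compose. This is only a sketch that mirrors the `docker run` command above (image name, port, environment variable, and volume mounts are taken from that command; adjust them to your environment):

```yaml
# docker-compose.yml (sketch)
services:
  deepsearcher:
    image: deepsearcher:latest
    ports:
      - "8000:8000"
    environment:
      - OPENAI_API_KEY=your_openai_api_key
    volumes:
      - ./data:/app/data
      - ./logs:/app/logs
      - ./deepsearcher/config.yaml:/app/deepsearcher/config.yaml
```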
@ -0,0 +1,13 @@ |
|||
# 📚 Usage Guide |
|||
|
|||
DeepSearcher provides multiple ways to use the system, including Python API, command line interface, and web service deployment. |
|||
|
|||
## 🔍 Usage Overview |
|||
|
|||
| Guide | Description | |
|||
|-------|-------------| |
|||
| [🚀 Quick Start](quick_start.md) | Quick start guide for Python API integration | |
|||
| [💻 Command Line Interface](cli.md) | Instructions for using the command line interface | |
|||
| [🌐 Deployment](deployment.md) | Guide for deploying as a web service | |
|||
|
|||
Choose the method that best suits your needs and follow the instructions on the corresponding page. |
@ -0,0 +1,42 @@ |
|||
# 🚀 Quick Start |
|||
|
|||
## Prerequisites |
|||
|
|||
✅ Before you begin, set `OPENAI_API_KEY` in your environment variables. If you switch to a different LLM in the configuration, make sure to set the corresponding API key as well.
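
For example, on Linux/macOS you can export the key in your shell before running the snippets below (the value shown is a placeholder):

```shell
export OPENAI_API_KEY="sk-***"
```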
|||
|
|||
## Basic Usage |
|||
|
|||
```python |
|||
# Import configuration modules |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
from deepsearcher.online_query import query |
|||
|
|||
# Initialize configuration |
|||
config = Configuration() |
|||
|
|||
# Customize your config here |
|||
# (See the Configuration Details section below for more options) |
|||
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"}) |
|||
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"}) |
|||
init_config(config=config) |
|||
|
|||
# Load data from local files |
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
load_from_local_files(paths_or_directory=your_local_path) |
|||
|
|||
# (Optional) Load data from websites |
|||
# Requires FIRECRAWL_API_KEY environment variable |
|||
from deepsearcher.offline_loading import load_from_website |
|||
load_from_website(urls=website_url) |
|||
|
|||
# Query your data |
|||
result = query("Write a report about xxx.") # Replace with your question |
|||
print(result) |
|||
``` |
|||
|
|||
## Next Steps |
|||
|
|||
After completing this quick start, you might want to explore: |
|||
|
|||
- [Command Line Interface](cli.md) for non-programmatic usage |
|||
- [Deployment](deployment.md) for setting up a web service |
@ -0,0 +1,53 @@ |
|||
# Evaluation of DeepSearcher |
|||
## Introduction |
|||
DeepSearcher excels at answering complex queries. This section provides scripts for evaluating the retrieval performance of DeepSearcher against naive RAG.
|||
|
|||
The evaluation is based on the Recall metric: |
|||
|
|||
> Recall@K: The percentage of relevant documents that are retrieved among the top K documents returned by the search engine. |
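
Concretely, the metric can be computed per question from the set of gold document titles and the ranked list of retrieved titles. A minimal sketch of the calculation (the actual evaluation lives in `evaluate.py`):

```python
from typing import List, Set

def recall_at_k(gold_titles: Set[str], retrieved_titles: List[str], k: int) -> float:
    """Fraction of gold titles that appear among the top-k retrieved titles."""
    top_k = retrieved_titles[:k]
    return sum(1 for title in gold_titles if title in top_k) / len(gold_titles)

# Example: 1 of 2 gold titles appears in the top 2 -> Recall@2 = 0.5
print(recall_at_k({"DeepSeek", "Milvus"}, ["DeepSeek", "RAG", "Milvus"], k=2))
```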
|||
|
|||
Currently, we support the multi-hop question answering dataset [2WikiMultiHopQA](https://paperswithcode.com/dataset/2wikimultihopqa). More datasets will be added in the future.
|||
|
|||
## Evaluation Script |
|||
The main evaluation script is `evaluate.py`. |
|||
|
|||
You can provide a config file, say `eval_config.yaml`, to specify the LLM, embedding model, and other providers and parameters:
|||
```shell |
|||
python evaluate.py \ |
|||
--dataset 2wikimultihopqa \ |
|||
--config_yaml ./eval_config.yaml \ |
|||
--pre_num 5 \ |
|||
--output_dir ./eval_output |
|||
``` |
|||
`pre_num` is the number of samples to evaluate. The more samples you use, the more accurate the results, but evaluation will take longer and consume more of your LLM API tokens.
|||
|
|||
After the dataset has been loaded into the vector database on the first run, you can pass the `--skip_load` flag to avoid loading it again, as shown below.
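
For example, reusing the arguments from the command above:

```shell
python evaluate.py \
    --dataset 2wikimultihopqa \
    --config_yaml ./eval_config.yaml \
    --pre_num 5 \
    --output_dir ./eval_output \
    --skip_load
```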
|||
|
|||
For details on all arguments, run:
|||
```shell |
|||
python evaluate.py --help |
|||
``` |
|||
|
|||
## Evaluation Results |
|||
We conducted tests on the widely used 2WikiMultiHopQA dataset. (Because testing consumes a large amount of API tokens, we only evaluated the first 50 samples. This may introduce some fluctuation compared to testing the entire dataset, but it still gives a rough picture of overall performance.)
|||
|
|||
### Recall Comparison between Naive RAG and DeepSearcher with Different Models |
|||
With Max Iterations on the horizontal axis and Recall on the vertical axis, the following chart compares the recall rates of Deep Searcher and naive RAG. |
|||
 |
|||
#### Performance Improvement with Iterations |
|||
As the number of max iterations increases, the recall of Deep Searcher improves significantly, and every model running under Deep Searcher achieves markedly higher recall than naive RAG.
|||
|
|||
#### Diminishing Returns |
|||
However, the marginal gain shrinks as the number of iterations grows, suggesting that recall approaches a ceiling: beyond a certain point, additional reflection iterations yield little further improvement.
|||
|
|||
#### Model Performance Comparison |
|||
Claude-3-7-sonnet (red line) demonstrates superior performance throughout, achieving nearly perfect recall at 7 iterations. Most models show significant improvement as iterations increase, with the steepest gains occurring between 2 and 4 iterations. Models like o1-mini (yellow) and deepseek-r1 (green) exhibit strong performance at higher iteration counts. Because the number of test samples is limited, results may vary somewhat between runs.
|||
Overall, reasoning models generally perform better than non-reasoning models. |
|||
|
|||
#### Limitations of Non-Reasoning Models |
|||
Additionally, in our tests, weaker and smaller non-reasoning models sometimes failed to complete the entire agent query pipeline, due to their inadequate instruction-following capabilities. |
|||
|
|||
### Token Consumption |
|||
We plotted the graph below with the number of iterations on the horizontal axis and the average token consumption per sample on the vertical axis: |
|||
 |
|||
It is evident that as the number of iterations increases, the token consumption of Deep Searcher rises linearly. Based on this approximate token consumption, you can check the pricing on your model provider's website to estimate the cost of running evaluations with different iteration settings. |
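
As a purely illustrative example (the price and per-sample figure below are assumptions, not measurements): if a run averages about 30k tokens per sample at a hypothetical price of $1 per million tokens, a 50-sample evaluation consumes roughly 50 × 30,000 = 1.5M tokens, i.e. about $1.50. Substitute your provider's actual per-token price and the observed per-sample consumption reported in `statistics.json`.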
@ -0,0 +1,119 @@ |
|||
provide_settings: |
|||
llm: |
|||
provider: "OpenAI" |
|||
config: |
|||
model: "o1-mini" |
|||
# api_key: "sk-xxxx" # Uncomment to override the `OPENAI_API_KEY` set in the environment variable |
|||
# base_url: "" |
|||
|
|||
# provider: "DeepSeek" |
|||
# config: |
|||
# model: "deepseek-reasoner" |
|||
## api_key: "sk-xxxx" # Uncomment to override the `DEEPSEEK_API_KEY` set in the environment variable |
|||
## base_url: "" |
|||
|
|||
# provider: "SiliconFlow" |
|||
# config: |
|||
# model: "deepseek-ai/DeepSeek-R1" |
|||
## api_key: "xxxx" # Uncomment to override the `SILICONFLOW_API_KEY` set in the environment variable |
|||
## base_url: "" |
|||
|
|||
# provider: "PPIO" |
|||
# config: |
|||
# model: "deepseek/deepseek-r1-turbo" |
|||
## api_key: "xxxx" # Uncomment to override the `PPIO_API_KEY` set in the environment variable |
|||
## base_url: "" |
|||
|
|||
# provider: "TogetherAI" |
|||
# config: |
|||
# model: "deepseek-ai/DeepSeek-R1" |
|||
## api_key: "xxxx" # Uncomment to override the `TOGETHER_API_KEY` set in the environment variable |
|||
|
|||
# provider: "AzureOpenAI" |
|||
# config: |
|||
# model: "" |
|||
# api_version: "" |
|||
## azure_endpoint: "xxxx" # Uncomment to override the `AZURE_OPENAI_ENDPOINT` set in the environment variable |
|||
## api_key: "xxxx" # Uncomment to override the `AZURE_OPENAI_KEY` set in the environment variable |
|||
|
|||
# provider: "Ollama" |
|||
# config: |
|||
# model: "qwq" |
|||
## base_url: "" |
|||
|
|||
# provider: "Novita" |
|||
# config: |
|||
# model: "deepseek/deepseek-v3-0324" |
|||
## api_key: "xxxx" # Uncomment to override the `NOVITA_API_KEY` set in the environment variable |
|||
## base_url: "" |
|||
|
|||
embedding: |
|||
provider: "OpenAIEmbedding" |
|||
config: |
|||
model: "text-embedding-ada-002" |
|||
# api_key: "" # Uncomment to override the `OPENAI_API_KEY` set in the environment variable |
|||
|
|||
|
|||
# provider: "MilvusEmbedding" |
|||
# config: |
|||
# model: "default" |
|||
|
|||
# provider: "VoyageEmbedding" |
|||
# config: |
|||
# model: "voyage-3" |
|||
## api_key: "" # Uncomment to override the `VOYAGE_API_KEY` set in the environment variable |
|||
|
|||
# provider: "BedrockEmbedding" |
|||
# config: |
|||
# model: "amazon.titan-embed-text-v2:0" |
|||
## aws_access_key_id: "" # Uncomment to override the `AWS_ACCESS_KEY_ID` set in the environment variable |
|||
## aws_secret_access_key: "" # Uncomment to override the `AWS_SECRET_ACCESS_KEY` set in the environment variable |
|||
|
|||
# provider: "SiliconflowEmbedding" |
|||
# config: |
|||
# model: "BAAI/bge-m3" |
|||
## api_key: "" # Uncomment to override the `SILICONFLOW_API_KEY` set in the environment variable
|||
|
|||
# provider: "NovitaEmbedding" |
|||
# config: |
|||
# model: "baai/bge-m3" |
|||
## api_key: "" # Uncomment to override the `NOVITA_API_KEY` set in the environment variable
|||
|
|||
file_loader: |
|||
# provider: "PDFLoader" |
|||
# config: {} |
|||
|
|||
provider: "JsonFileLoader" |
|||
config: |
|||
text_key: "text" |
|||
|
|||
# provider: "TextLoader" |
|||
# config: {} |
|||
|
|||
# provider: "UnstructuredLoader" |
|||
# config: {} |
|||
|
|||
web_crawler: |
|||
provider: "FireCrawlCrawler" |
|||
config: {} |
|||
|
|||
# provider: "Crawl4AICrawler" |
|||
# config: {} |
|||
|
|||
# provider: "JinaCrawler" |
|||
# config: {} |
|||
|
|||
vector_db: |
|||
provider: "Milvus" |
|||
config: |
|||
default_collection: "deepsearcher" |
|||
uri: "./milvus.db" |
|||
token: "root:Milvus" |
|||
db: "default" |
|||
|
|||
query_settings: |
|||
max_iter: 3 |
|||
|
|||
load_settings: |
|||
chunk_size: 1500 |
|||
chunk_overlap: 100 |
@ -0,0 +1,329 @@ |
|||
# Some test datasets and evaluation methods are adapted from https://github.com/OSU-NLP-Group/HippoRAG/tree/main/data, many thanks
|||
|
|||
################################################################################ |
|||
# Note: This evaluation script will cost a lot of LLM token usage, please make sure you have enough token budget. |
|||
################################################################################ |
|||
import argparse |
|||
import ast |
|||
import json |
|||
import logging |
|||
import os |
|||
import time |
|||
import warnings |
|||
from collections import defaultdict |
|||
from typing import List, Tuple |
|||
|
|||
import pandas as pd |
|||
|
|||
from deepsearcher.configuration import Configuration, init_config |
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
from deepsearcher.online_query import naive_retrieve, retrieve |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
|
|||
warnings.simplefilter(action="ignore", category=FutureWarning) # disable warning output |
|||
|
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
|
|||
k_list = [2, 5] |
|||
|
|||
|
|||
def _deepsearch_retrieve_titles( |
|||
question: str, |
|||
retry_num: int = 4, |
|||
base_wait_time: int = 4, |
|||
max_iter: int = 3, |
|||
) -> Tuple[List[str], int, bool]: |
|||
""" |
|||
Retrieve document titles using DeepSearcher with retry mechanism. |
|||
|
|||
Args: |
|||
question (str): The query question. |
|||
retry_num (int, optional): Number of retry attempts. Defaults to 4. |
|||
base_wait_time (int, optional): Base wait time between retries in seconds. Defaults to 4. |
|||
max_iter (int, optional): Maximum number of iterations for retrieval. Defaults to 3. |
|||
|
|||
Returns: |
|||
Tuple[List[str], int, bool]: A tuple containing: |
|||
- List of retrieved document titles |
|||
- Number of tokens consumed |
|||
- Boolean indicating whether the retrieval failed |
|||
""" |
|||
retrieved_results = [] |
|||
consume_tokens = 0 |
|||
for i in range(retry_num): |
|||
try: |
|||
retrieved_results, _, consume_tokens = retrieve(question, max_iter=max_iter) |
|||
break |
|||
except Exception: |
|||
wait_time = base_wait_time * (2**i) |
|||
print(f"Parse LLM's output failed, retry again after {wait_time} seconds...") |
|||
time.sleep(wait_time) |
|||
if retrieved_results: |
|||
retrieved_titles = [ |
|||
retrieved_result.metadata["title"] for retrieved_result in retrieved_results |
|||
] |
|||
fail = False |
|||
else: |
|||
print("Pipeline error, no retrieved results.") |
|||
retrieved_titles = [] |
|||
fail = True |
|||
return retrieved_titles, consume_tokens, fail |
|||
|
|||
|
|||
def _naive_retrieve_titles(question: str) -> List[str]: |
|||
""" |
|||
Retrieve document titles using naive retrieval method. |
|||
|
|||
Args: |
|||
question (str): The query question. |
|||
|
|||
Returns: |
|||
List[str]: List of retrieved document titles. |
|||
""" |
|||
retrieved_results = naive_retrieve(question) |
|||
retrieved_titles = [ |
|||
retrieved_result.metadata["title"] for retrieved_result in retrieved_results |
|||
] |
|||
return retrieved_titles |
|||
|
|||
|
|||
def _calcu_recall(sample, retrieved_titles, dataset) -> dict: |
|||
""" |
|||
Calculate recall metrics for retrieved titles. |
|||
|
|||
Args: |
|||
sample: The sample data containing ground truth information. |
|||
retrieved_titles: List of retrieved document titles. |
|||
dataset (str): The name of the dataset being evaluated. |
|||
|
|||
Returns: |
|||
dict: Dictionary containing recall values at different k values. |
|||
|
|||
Raises: |
|||
NotImplementedError: If the dataset is not supported. |
|||
""" |
|||
if dataset in ["2wikimultihopqa"]: |
|||
gold_passages = [item for item in sample["supporting_facts"]] |
|||
gold_items = set([item[0] for item in gold_passages]) |
|||
retrieved_items = retrieved_titles |
|||
else: |
|||
raise NotImplementedError |
|||
|
|||
recall = dict() |
|||
for k in k_list: |
|||
recall[k] = round( |
|||
sum(1 for t in gold_items if t in retrieved_items[:k]) / len(gold_items), 4 |
|||
) |
|||
return recall |
|||
|
|||
|
|||
def _print_recall_line(recall: dict, pre_str="", post_str="\n"): |
|||
""" |
|||
Print recall metrics in a formatted line. |
|||
|
|||
Args: |
|||
recall (dict): Dictionary containing recall values at different k values. |
|||
pre_str (str, optional): String to print before recall values. Defaults to "". |
|||
post_str (str, optional): String to print after recall values. Defaults to "\n". |
|||
""" |
|||
print(pre_str, end="") |
|||
for k in k_list: |
|||
print(f"R@{k}: {recall[k]:.3f} ", end="") |
|||
print(post_str, end="") |
|||
|
|||
|
|||
def evaluate( |
|||
dataset: str, |
|||
output_root: str, |
|||
pre_num: int = 10, |
|||
max_iter: int = 3, |
|||
skip_load=False, |
|||
flag: str = "result", |
|||
): |
|||
""" |
|||
Evaluate the retrieval performance on a dataset. |
|||
|
|||
Args: |
|||
dataset (str): Name of the dataset to evaluate. |
|||
output_root (str): Root directory for output files. |
|||
pre_num (int, optional): Number of samples to evaluate. Defaults to 10. |
|||
max_iter (int, optional): Maximum number of iterations for retrieval. Defaults to 3. |
|||
skip_load (bool, optional): Whether to skip loading the dataset. Defaults to False. |
|||
flag (str, optional): Flag for the evaluation run. Defaults to "result". |
|||
""" |
|||
corpus_file = os.path.join(current_dir, f"../examples/data/{dataset}_corpus.json") |
|||
if not skip_load: |
|||
# set chunk size to a large number to avoid chunking, because the dataset was chunked already. |
|||
load_from_local_files( |
|||
corpus_file, force_new_collection=True, chunk_size=999999, chunk_overlap=0 |
|||
) |
|||
|
|||
eval_output_subdir = os.path.join(output_root, flag) |
|||
os.makedirs(eval_output_subdir, exist_ok=True) |
|||
csv_file_path = os.path.join(eval_output_subdir, "details.csv") |
|||
statistics_file_path = os.path.join(eval_output_subdir, "statistics.json") |
|||
|
|||
data_with_gt_file_path = os.path.join(current_dir, f"../examples/data/{dataset}.json") |
|||
data_with_gt = json.load(open(data_with_gt_file_path, "r")) |
|||
|
|||
if not pre_num: |
|||
pre_num = len(data_with_gt) |
|||
|
|||
pipeline_error_num = 0 |
|||
end_ind = min(pre_num, len(data_with_gt)) |
|||
|
|||
start_ind = 0 |
|||
existing_df = pd.DataFrame() |
|||
existing_statistics = defaultdict(dict) |
|||
existing_token_usage = 0 |
|||
existing_error_num = 0 |
|||
existing_sample_num = 0 |
|||
if os.path.exists(csv_file_path): |
|||
existing_df = pd.read_csv(csv_file_path) |
|||
start_ind = len(existing_df) |
|||
print(f"Loading results from {csv_file_path}, start_index = {start_ind}") |
|||
|
|||
if os.path.exists(statistics_file_path): |
|||
existing_statistics = json.load(open(statistics_file_path, "r")) |
|||
print( |
|||
f"Loading statistics from {statistics_file_path}, will recalculate the statistics based on both new and existing results." |
|||
) |
|||
existing_token_usage = existing_statistics["deepsearcher"]["token_usage"] |
|||
existing_error_num = existing_statistics["deepsearcher"].get("error_num", 0) |
|||
existing_sample_num = existing_statistics["deepsearcher"].get("sample_num", 0) |
|||
for sample_idx, sample in enumerate(data_with_gt[start_ind:end_ind]): |
|||
global_idx = sample_idx + start_ind |
|||
question = sample["question"] |
|||
|
|||
retrieved_titles, consume_tokens, fail = _deepsearch_retrieve_titles( |
|||
question, max_iter=max_iter |
|||
) |
|||
retrieved_titles_naive = _naive_retrieve_titles(question) |
|||
|
|||
if fail: |
|||
pipeline_error_num += 1 |
|||
print( |
|||
f"Pipeline error, no retrieved results. Current pipeline_error_num = {pipeline_error_num}" |
|||
) |
|||
|
|||
print(f"idx: {global_idx}: ") |
|||
recall = _calcu_recall(sample, retrieved_titles, dataset) |
|||
recall_naive = _calcu_recall(sample, retrieved_titles_naive, dataset) |
|||
current_result = [ |
|||
{ |
|||
"idx": global_idx, |
|||
"question": question, |
|||
"recall": recall, |
|||
"recall_naive": recall_naive, |
|||
"gold_titles": [item[0] for item in sample["supporting_facts"]], |
|||
"retrieved_titles": retrieved_titles, |
|||
"retrieved_titles_naive": retrieved_titles_naive, |
|||
} |
|||
] |
|||
current_df = pd.DataFrame(current_result) |
|||
existing_df = pd.concat([existing_df, current_df], ignore_index=True) |
|||
existing_df.to_csv(csv_file_path, index=False) |
|||
average_recall = dict() |
|||
average_recall_naive = dict() |
|||
for k in k_list: |
|||
average_recall[k] = sum( |
|||
[ |
|||
ast.literal_eval(d).get(k) if isinstance(d, str) else d.get(k) |
|||
for d in existing_df["recall"] |
|||
] |
|||
) / len(existing_df) |
|||
average_recall_naive[k] = sum( |
|||
[ |
|||
ast.literal_eval(d).get(k) if isinstance(d, str) else d.get(k) |
|||
for d in existing_df["recall_naive"] |
|||
] |
|||
) / len(existing_df) |
|||
_print_recall_line(average_recall, pre_str="Average recall of DeepSearcher: ") |
|||
_print_recall_line(average_recall_naive, pre_str="Average recall of naive RAG : ") |
|||
existing_token_usage += consume_tokens |
|||
existing_error_num += 1 if fail else 0 |
|||
existing_sample_num += 1 |
|||
existing_statistics["deepsearcher"]["average_recall"] = average_recall |
|||
existing_statistics["deepsearcher"]["token_usage"] = existing_token_usage |
|||
existing_statistics["deepsearcher"]["error_num"] = existing_error_num |
|||
existing_statistics["deepsearcher"]["sample_num"] = existing_sample_num |
|||
existing_statistics["deepsearcher"]["token_usage_per_sample"] = ( |
|||
existing_token_usage / existing_sample_num |
|||
) |
|||
existing_statistics["naive_rag"]["average_recall"] = average_recall_naive |
|||
json.dump(existing_statistics, open(statistics_file_path, "w"), indent=4) |
|||
print("") |
|||
print("Finish results to save.") |
|||
|
|||
|
|||
def main_eval(): |
|||
""" |
|||
Main function for running the evaluation from command line. |
|||
|
|||
This function parses command line arguments and calls the evaluate function |
|||
with the appropriate parameters. |
|||
""" |
|||
parser = argparse.ArgumentParser(prog="evaluate", description="Deep Searcher evaluation.") |
|||
parser.add_argument( |
|||
"--dataset", |
|||
type=str, |
|||
default="2wikimultihopqa", |
|||
help="Dataset name, default is `2wikimultihopqa`. More datasets will be supported in the future.", |
|||
) |
|||
parser.add_argument( |
|||
"--config_yaml", |
|||
type=str, |
|||
default="./eval_config.yaml", |
|||
help="Configuration yaml file path, default is `./eval_config.yaml`", |
|||
) |
|||
parser.add_argument( |
|||
"--pre_num", |
|||
type=int, |
|||
default=30, |
|||
help="Number of samples to evaluate, default is 30", |
|||
) |
|||
parser.add_argument( |
|||
"--max_iter", |
|||
type=int, |
|||
default=3, |
|||
help="Max iterations of reflection. Default is 3. It will overwrite the one in config yaml file.", |
|||
) |
|||
parser.add_argument( |
|||
"--output_dir", |
|||
type=str, |
|||
default="./eval_output", |
|||
help="Output root directory, default is `./eval_output`", |
|||
) |
|||
parser.add_argument( |
|||
"--skip_load", |
|||
action="store_true", |
|||
help="Whether to skip loading the dataset. Default it don't skip loading. If you want to skip loading, please set this flag.", |
|||
) |
|||
parser.add_argument( |
|||
"--flag", |
|||
type=str, |
|||
default="result", |
|||
help="Flag for evaluation, default is `result`", |
|||
) |
|||
|
|||
args = parser.parse_args() |
|||
|
|||
config = Configuration(config_path=args.config_yaml) |
|||
init_config(config=config) |
|||
|
|||
evaluate( |
|||
dataset=args.dataset, |
|||
output_root=args.output_dir, |
|||
pre_num=args.pre_num, |
|||
max_iter=args.max_iter, |
|||
skip_load=args.skip_load, |
|||
flag=args.flag, |
|||
) |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
main_eval() |
@ -0,0 +1,35 @@ |
|||
import logging |
|||
import os |
|||
|
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
from deepsearcher.online_query import query |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
|
|||
config = Configuration() # Customize your config here |
|||
init_config(config=config) |
|||
|
|||
|
|||
# You should clone the milvus docs repo to your local machine first, execute: |
|||
# git clone https://github.com/milvus-io/milvus-docs.git |
|||
# Then replace the path below with the path to the milvus-docs repo on your local machine |
|||
# import glob |
|||
# all_md_files = glob.glob('xxx/milvus-docs/site/en/**/*.md', recursive=True) |
|||
# load_from_local_files(paths_or_directory=all_md_files, collection_name="milvus_docs", collection_description="All Milvus Documents") |
|||
|
|||
# Hint: You can also load a single file; run this script from the root directory of the deep searcher project
|||
load_from_local_files( |
|||
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"), |
|||
collection_name="milvus_docs", |
|||
collection_description="All Milvus Documents", |
|||
# force_new_collection=True,  # Set to True to drop the original collection and create a new one on every run
|||
) |
|||
|
|||
question = "Write a report comparing Milvus with other vector databases." |
|||
|
|||
_, _, consumed_token = query(question, max_iter=1) |
|||
print(f"Consumed tokens: {consumed_token}") |
@ -0,0 +1,68 @@ |
|||
import logging |
|||
import os |
|||
import time |
|||
|
|||
from deepsearcher.configuration import Configuration, init_config |
|||
from deepsearcher.online_query import query |
|||
|
|||
# Configure logging |
|||
logging.basicConfig( |
|||
level=logging.INFO, |
|||
format='%(asctime)s - %(levelname)s - %(message)s', |
|||
datefmt='%Y-%m-%d %H:%M:%S' |
|||
) |
|||
logger = logging.getLogger(__name__) |
|||
|
|||
|
|||
|
|||
logger.info("Initializing DeepSearcher configuration") |
|||
config = Configuration() |
|||
config.set_provider_config("llm", "AzureOpenAI", { |
|||
"model": "gpt-4.1", |
|||
"api_key": "<yourkey>", |
|||
"base_url": "https://<youraifoundry>.openai.azure.com/openai/", |
|||
"api_version": "2024-12-01-preview" |
|||
}) |
|||
config.set_provider_config("embedding", "OpenAIEmbedding", { |
|||
"model": "text-embedding-ada-002", |
|||
"api_key": "<yourkey>", |
|||
"azure_endpoint": "https://<youraifoundry>.openai.azure.com/", |
|||
"api_version": "2023-05-15" |
|||
# Remove azure_endpoint and api_version if you are not using an Azure OpenAI endpoint
|||
}) |
|||
config.set_provider_config("vector_db", "AzureSearch", { |
|||
"endpoint": "https://<yourazureaisearch>.search.windows.net", |
|||
"index_name": "<yourindex>", |
|||
"api_key": "<yourkey>", |
|||
"vector_field": "content_vector" |
|||
}) |
|||
|
|||
logger.info("Configuration initialized successfully") |
|||
|
|||
try: |
|||
logger.info("Applying global configuration") |
|||
init_config(config) |
|||
logger.info("Configuration applied globally") |
|||
|
|||
# Example question |
|||
question = "Create a detailed report about what Python is all about" |
|||
logger.info(f"Processing query: '{question}'") |
|||
|
|||
start_time = time.time() |
|||
result = query(question) |
|||
query_time = time.time() - start_time |
|||
logger.info(f"Query processed in {query_time:.2f} seconds") |
|||
|
|||
logger.info("Retrieved result successfully") |
|||
print(result[0]) # Print the first element of the tuple |
|||
|
|||
# Check if there's a second element in the tuple that contains source documents |
|||
if len(result) > 1 and hasattr(result[1], "__len__"): |
|||
logger.info(f"Found {len(result[1])} source documents") |
|||
for i, doc in enumerate(result[1]): |
|||
if hasattr(doc, "metadata") and "source" in doc.metadata: |
|||
logger.info(f"Source {i+1}: {doc.metadata['source']}") |
|||
except Exception as e: |
|||
logger.error(f"Error executing query: {str(e)}") |
|||
import traceback |
|||
logger.error(traceback.format_exc()) |
@ -0,0 +1,40 @@ |
|||
import sys, os |
|||
from pathlib import Path |
|||
script_directory = Path(__file__).resolve().parent.parent |
|||
sys.path.append(os.path.abspath(script_directory)) |
|||
|
|||
import logging |
|||
|
|||
httpx_logger = logging.getLogger("httpx") # disable openai's logger output |
|||
httpx_logger.setLevel(logging.WARNING) |
|||
|
|||
current_dir = os.path.dirname(os.path.abspath(__file__)) |
|||
|
|||
# Customize your config here |
|||
from deepsearcher.configuration import Configuration, init_config |
|||
|
|||
config = Configuration() |
|||
init_config(config=config) |
|||
|
|||
# # Load your local data |
|||
# # Hint: You can load from a directory or a single file; run this script from the root directory of the deep searcher project
|||
|
|||
from deepsearcher.offline_loading import load_from_local_files |
|||
|
|||
load_from_local_files( |
|||
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"), |
|||
collection_name="milvus_docs", |
|||
collection_description="All Milvus Documents", |
|||
# force_new_collection=True,  # Set to True to drop the original collection and create a new one on every run
|||
) |
|||
|
|||
# Query |
|||
from deepsearcher.online_query import query |
|||
|
|||
question = 'Write a report comparing Milvus with other vector databases.' |
|||
answer, retrieved_results, consumed_token = query(question) |
|||
print(answer) |
|||
|
|||
# # get consumed tokens, about 25k~30k tokens when using the openai gpt-4o model
|||
# print(f"Consumed tokens: {consumed_token}") |
|||
|