initial commit

Branch: main
Author: tanxing, 2 weeks ago
Commit: 9497784957
  1. .github/ISSUE_TEMPLATE/bug_report.md (32 lines)
  2. .github/ISSUE_TEMPLATE/feature_request.md (22 lines)
  3. .github/mergify.yml (34 lines)
  4. .github/workflows/cd-docs.yml (20 lines)
  5. .github/workflows/ci-docs.yml (24 lines)
  6. .github/workflows/docs.yml (27 lines)
  7. .github/workflows/release.yml (37 lines)
  8. .github/workflows/ruff.yml (25 lines)
  9. .gitignore (199 lines)
  10. .python-version (1 line)
  11. .vscode/settings.json (11 lines)
  12. Dockerfile (19 lines)
  13. LICENSE (201 lines)
  14. Makefile (7 lines)
  15. README.md (590 lines)
  16. assets/pic/deep-searcher-arch.png (binary)
  17. assets/pic/demo.gif (binary)
  18. assets/pic/logo.png (binary)
  19. deepsearcher/__init__.py (5 lines)
  20. deepsearcher/agent/__init__.py (12 lines)
  21. deepsearcher/agent/base.py (103 lines)
  22. deepsearcher/agent/chain_of_rag.py (326 lines)
  23. deepsearcher/agent/collection_router.py (98 lines)
  24. deepsearcher/agent/deep_search.py (319 lines)
  25. deepsearcher/agent/naive_rag.py (128 lines)
  26. deepsearcher/agent/rag_router.py (93 lines)
  27. deepsearcher/cli.py (118 lines)
  28. deepsearcher/config.yaml (87 lines)
  29. deepsearcher/configuration.py (240 lines)
  30. deepsearcher/embedding/__init__.py (5 lines)
  31. deepsearcher/embedding/base.py (76 lines)
  32. deepsearcher/embedding/openai_embedding.py (103 lines)
  33. deepsearcher/llm/__init__.py (5 lines)
  34. deepsearcher/llm/base.py (120 lines)
  35. deepsearcher/llm/openai_llm.py (61 lines)
  36. deepsearcher/loader/__init__.py (0 lines)
  37. deepsearcher/loader/file_loader/__init__.py (7 lines)
  38. deepsearcher/loader/file_loader/base.py (70 lines)
  39. deepsearcher/loader/file_loader/docling_loader.py (117 lines)
  40. deepsearcher/loader/file_loader/json_loader.py (94 lines)
  41. deepsearcher/loader/file_loader/pdf_loader.py (54 lines)
  42. deepsearcher/loader/file_loader/text_loader.py (43 lines)
  43. deepsearcher/loader/file_loader/unstructured_loader.py (201 lines)
  44. deepsearcher/loader/splitter.py (105 lines)
  45. deepsearcher/loader/web_crawler/__init__.py (11 lines)
  46. deepsearcher/loader/web_crawler/base.py (55 lines)
  47. deepsearcher/loader/web_crawler/crawl4ai_crawler.py (140 lines)
  48. deepsearcher/loader/web_crawler/docling_crawler.py (98 lines)
  49. deepsearcher/loader/web_crawler/firecrawl_crawler.py (88 lines)
  50. deepsearcher/loader/web_crawler/jina_crawler.py (62 lines)
  51. deepsearcher/offline_loading.py (119 lines)
  52. deepsearcher/online_query.py (96 lines)
  53. deepsearcher/utils/__init__.py (0 lines)
  54. deepsearcher/utils/log.py (160 lines)
  55. deepsearcher/vector_db/__init__.py (6 lines)
  56. deepsearcher/vector_db/azure_search.py (279 lines)
  57. deepsearcher/vector_db/base.py (207 lines)
  58. deepsearcher/vector_db/milvus.py (305 lines)
  59. deepsearcher/vector_db/oracle.py (536 lines)
  60. deepsearcher/vector_db/qdrant.py (290 lines)
  61. docs/README.md (42 lines)
  62. docs/assets/pic/deep-searcher-arch.png (binary)
  63. docs/assets/pic/demo.gif (binary)
  64. docs/assets/pic/logo-badge.png (binary)
  65. docs/assets/pic/logo.png (binary)
  66. docs/configuration/embedding.md (126 lines)
  67. docs/configuration/file_loader.md (70 lines)
  68. docs/configuration/index.md (33 lines)
  69. docs/configuration/llm.md (192 lines)
  70. docs/configuration/vector_db.md (52 lines)
  71. docs/configuration/web_crawler.md (97 lines)
  72. docs/contributing/index.md (159 lines)
  73. docs/examples/basic_example.md (65 lines)
  74. docs/examples/docling.md (101 lines)
  75. docs/examples/firecrawl.md (82 lines)
  76. docs/examples/index.md (15 lines)
  77. docs/examples/oracle.md (70 lines)
  78. docs/examples/unstructured.md (76 lines)
  79. docs/faq/index.md (73 lines)
  80. docs/future_plans.md (8 lines)
  81. docs/index.md (45 lines)
  82. docs/installation/development.md (64 lines)
  83. docs/installation/index.md (29 lines)
  84. docs/installation/pip.md (52 lines)
  85. docs/integrations/index.md (75 lines)
  86. docs/overrides/.gitkeep (0 lines)
  87. docs/stylesheets/extra.css (78 lines)
  88. docs/usage/cli.md (63 lines)
  89. docs/usage/deployment.md (73 lines)
  90. docs/usage/index.md (13 lines)
  91. docs/usage/quick_start.md (42 lines)
  92. evaluation/README.md (53 lines)
  93. evaluation/eval_config.yaml (119 lines)
  94. evaluation/evaluate.py (329 lines)
  95. evaluation/plot_results/max_iter_vs_avg_token_usage.png (binary)
  96. evaluation/plot_results/max_iter_vs_error_num.png (binary)
  97. evaluation/plot_results/max_iter_vs_recall.png (binary)
  98. examples/basic_example.py (35 lines)
  99. examples/basic_example_azuresearch.py (68 lines)
  100. examples/basic_example_oracle.py (40 lines)

32
.github/ISSUE_TEMPLATE/bug_report.md

@@ -0,0 +1,32 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
Please describe your issue **in English**.
*Note: Small LLMs cannot follow prompts well and are prone to hallucinations. Please make sure your LLM is cutting-edge, preferably a reasoning model, e.g. the OpenAI o-series, DeepSeek R1, Claude 3.7 Sonnet, etc.*
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Environment (please complete the following information):**
- OS: [e.g. MacOS]
- pip dependencies
- Version [e.g. 0.0.1]
**Additional context**
Add any other context about the problem here.

22
.github/ISSUE_TEMPLATE/feature_request.md

@@ -0,0 +1,22 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
Please describe your suggestion **in English**.
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

34
.github/mergify.yml

@@ -0,0 +1,34 @@
misc:
- branch: &BRANCHES
# In this pull request, the changes are based on the main branch
- &MASTER_BRANCH base=main
- name: Label bug fix PRs
conditions:
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES
- or: *BRANCHES
- 'title~=^fix:'
actions:
label:
add:
- kind/bug
- name: Label feature PRs
conditions:
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES
- or: *BRANCHES
- 'title~=^feat:'
actions:
label:
add:
- kind/feature
- name: Label enhancement PRs
conditions:
# branch condition: in this pull request, the changes are based on any branch referenced by BRANCHES
- or: *BRANCHES
- 'title~=^enhance:'
actions:
label:
add:
- kind/enhancement

20
.github/workflows/cd-docs.yml

@@ -0,0 +1,20 @@
name: "Run Docs CD with UV"
on:
push:
branches:
- "main"
- "master"
paths:
- 'docs/**'
- 'mkdocs.yml'
- '.github/workflows/docs.yml'
jobs:
build-deploy-docs:
if: github.repository == 'zilliztech/deep-searcher'
uses: ./.github/workflows/docs.yml
with:
deploy: true
permissions:
contents: write

24
.github/workflows/ci-docs.yml

@@ -0,0 +1,24 @@
name: "Run Docs CI with UV"
on:
pull_request:
types: [opened, reopened, synchronize]
paths:
- 'docs/**'
- 'mkdocs.yml'
- '.github/workflows/docs.yml'
push:
branches:
- "**"
- "!gh-pages"
paths:
- 'docs/**'
- 'mkdocs.yml'
- '.github/workflows/docs.yml'
jobs:
build-docs:
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'zilliztech/deep-searcher') }}
uses: ./.github/workflows/docs.yml
with:
deploy: false

27
.github/workflows/docs.yml

@@ -0,0 +1,27 @@
on:
  workflow_call:
    inputs:
      deploy:
        type: boolean
        description: "If true, the docs will be deployed."
        default: false
jobs:
  run-docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install dependencies
        run: |
          uv sync --all-extras --dev
          source .venv/bin/activate
      - name: Build docs
        run: uv run mkdocs build --verbose --clean
      - name: Build and push docs
        if: inputs.deploy
        run: uv run mkdocs gh-deploy --force

37
.github/workflows/release.yml

@@ -0,0 +1,37 @@
# git tag v0.x.x  # Must be same as the version in pyproject.toml
# git push --tags
name: Publish Python Package to PyPI
on:
  push:
    tags:
      - "v*"
jobs:
  publish:
    name: Publish to PyPI
    runs-on: ubuntu-latest
    environment: pypi
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Install build tools
        run: python -m pip install build
      - name: Build package
        run: python -m build
      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

25
.github/workflows/ruff.yml

@@ -0,0 +1,25 @@
name: Ruff
on:
  push:
    branches: [ main, master ]
  pull_request:
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install the project
        run: |
          uv sync --all-extras --dev
          source .venv/bin/activate
      - name: Run Ruff
        run: |
          uv run ruff format --diff
          uv run ruff check
      # - name: Run tests
      #   run: uv run pytest tests

199
.gitignore

@@ -0,0 +1,199 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
.DS_Store
*.db

1
.python-version

@@ -0,0 +1 @@
3.10

11
.vscode/settings.json

@@ -0,0 +1,11 @@
{
    "python.testing.unittestArgs": [
        "-v",
        "-s",
        "./tests",
        "-p",
        "test_*.py"
    ],
    "python.testing.pytestEnabled": false,
    "python.testing.unittestEnabled": true
}
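
The settings above enable unittest discovery over `./tests` with the `test_*.py` file pattern. For illustration, a minimal test module that this configuration would pick up might look like the following; the file name and its trivial test are hypothetical and not part of this commit:

```python
# tests/test_smoke.py -- hypothetical example matching the "test_*.py" discovery pattern
import unittest


class TestSmoke(unittest.TestCase):
    def test_import(self):
        # The package should import cleanly once installed.
        import deepsearcher  # noqa: F401

        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
```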

19
Dockerfile

@@ -0,0 +1,19 @@
FROM ghcr.io/astral-sh/uv:python3.10-bookworm-slim
WORKDIR /app
RUN mkdir -p /tmp/uv-cache /app/data /app/logs
COPY pyproject.toml uv.lock LICENSE README.md ./
COPY deepsearcher/ ./deepsearcher/
RUN uv sync
COPY . .
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/docs || exit 1
CMD ["uv", "run", "python", "main.py", "--enable-cors", "true"]

201
LICENSE

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Zilliz
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

7
Makefile

@@ -0,0 +1,7 @@
lint:
	uv run ruff format --diff
	uv run ruff check

format:
	uv run ruff format
	uv run ruff check --fix

590
README.md

@@ -0,0 +1,590 @@
![DeepSearcher](./assets/pic/logo.png)
<div align="center">
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![DeepWiki](https://img.shields.io/badge/DeepWiki-AI%20Docs-orange.svg)](https://deepwiki.com/zilliztech/deep-searcher)
[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/zilliz_universe.svg?style=social&label=Follow%20%40Zilliz)](https://twitter.com/zilliz_universe)
<a href="https://discord.gg/mKc3R95yE5"><img height="20" src="https://img.shields.io/badge/Discord-%235865F2.svg?style=for-the-badge&logo=discord&logoColor=white" alt="discord"/></a>
</div>
---
DeepSearcher combines cutting-edge LLMs (OpenAI o3, Qwen3, DeepSeek, Grok 4, Claude 4 Sonnet, Llama 4, QwQ, etc.) and vector databases (Milvus, Zilliz Cloud, etc.) to perform search, evaluation, and reasoning based on private data, providing highly accurate answers and comprehensive reports. This project is suitable for enterprise knowledge management, intelligent Q&A systems, and information retrieval scenarios.
![Architecture](./assets/pic/deep-searcher-arch.png)
## 🚀 Features
- **Private Data Search**: Maximizes the utilization of enterprise internal data while ensuring data security. When necessary, it can integrate online content for more accurate answers.
- **Vector Database Management**: Supports Milvus and other vector databases, allowing data partitioning for efficient retrieval.
- **Flexible Embedding Options**: Compatible with multiple embedding models for optimal selection.
- **Multiple LLM Support**: Supports DeepSeek, OpenAI, and other large models for intelligent Q&A and content generation.
- **Document Loader**: Supports local file loading, with web crawling capabilities under development.
---
## 🎉 Demo
![demo](./assets/pic/demo.gif)
## 📖 Quick Start
### Installation
Install DeepSearcher using one of the following methods:
#### Option 1: Using pip
Create and activate a virtual environment (Python 3.10 is recommended).
```bash
python -m venv .venv
source .venv/bin/activate
```
Install DeepSearcher
```bash
pip install deepsearcher
```
For optional dependencies, e.g., ollama:
```bash
pip install "deepsearcher[ollama]"
```
#### Option 2: Install in Development Mode
We recommend using [uv](https://github.com/astral-sh/uv) for faster and more reliable installation. Follow the [official installation instructions](https://docs.astral.sh/uv/getting-started/installation/) to install it.
Clone the repository and navigate to the project directory:
```shell
git clone https://github.com/zilliztech/deep-searcher.git && cd deep-searcher
```
Synchronize and install dependencies:
```shell
uv sync
source .venv/bin/activate
```
For more detailed development setup and optional dependency installation options, see [CONTRIBUTING.md](CONTRIBUTING.md#development-environment-setup-with-uv).
### Quick start demo
To run this quick start demo, please prepare your `OPENAI_API_KEY` in your environment variables. If you change the LLM in the configuration, make sure to prepare the corresponding API key.
```python
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query
config = Configuration()
# Customize your config here,
# for more configuration options, see the Configuration Details section below.
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"})
init_config(config = config)
# Load your local data
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(paths_or_directory=your_local_path)
# (Optional) Load from web crawling (`FIRECRAWL_API_KEY` env variable required)
from deepsearcher.offline_loading import load_from_website
load_from_website(urls=website_url)
# Query
result = query("Write a report about xxx.") # Your question here
```
### Configuration Details:
#### LLM Configuration
<pre><code>config.set_provider_config("llm", "(LLMName)", "(Arguments dict)")</code></pre>
<p>The "LLMName" can be one of the following: ["DeepSeek", "OpenAI", "XAI", "SiliconFlow", "Aliyun", "PPIO", "TogetherAI", "Gemini", "Ollama", "Novita"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the LLM class.</p>
<details>
<summary>Example (OpenAI)</summary>
<p> Make sure you have prepared your OPENAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})</code></pre>
<p> More details about OpenAI models: https://platform.openai.com/docs/models </p>
</details>
<details>
<summary>Example (Qwen3 from Aliyun Bailian)</summary>
<p> Make sure you have prepared your Bailian API KEY as an env variable <code>DASHSCOPE_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "Aliyun", {"model": "qwen-plus-latest"})</code></pre>
<p> More details about Aliyun Bailian models: https://bailian.console.aliyun.com </p>
</details>
<details>
<summary>Example (Qwen3 from OpenRouter)</summary>
<pre><code>config.set_provider_config("llm", "OpenAI", {"model": "qwen/qwen3-235b-a22b:free", "base_url": "https://openrouter.ai/api/v1", "api_key": "OPENROUTER_API_KEY"})</code></pre>
<p> More details about OpenRouter models: https://openrouter.ai/qwen/qwen3-235b-a22b:free </p>
</details>
<details>
<summary>Example (DeepSeek from official)</summary>
<p> Make sure you have prepared your DEEPSEEK API KEY as an env variable <code>DEEPSEEK_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})</code></pre>
<p> More details about DeepSeek: https://api-docs.deepseek.com/ </p>
</details>
<details>
<summary>Example (DeepSeek from SiliconFlow)</summary>
<p> Make sure you have prepared your SILICONFLOW API KEY as an env variable <code>SILICONFLOW_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "SiliconFlow", {"model": "deepseek-ai/DeepSeek-R1"})</code></pre>
<p> More details about SiliconFlow: https://docs.siliconflow.cn/quickstart </p>
</details>
<details>
<summary>Example (DeepSeek from TogetherAI)</summary>
<p> Make sure you have prepared your TOGETHER API KEY as an env variable <code>TOGETHER_API_KEY</code>.</p>
For deepseek R1:
<pre><code>config.set_provider_config("llm", "TogetherAI", {"model": "deepseek-ai/DeepSeek-R1"})</code></pre>
For Llama 4:
<pre><code>config.set_provider_config("llm", "TogetherAI", {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"})</code></pre>
<p> You need to install together before running, execute: <code>pip install together</code>. More details about TogetherAI: https://www.together.ai/ </p>
</details>
<details>
<summary>Example (XAI Grok)</summary>
<p> Make sure you have prepared your XAI API KEY as an env variable <code>XAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "XAI", {"model": "grok-4-0709"})</code></pre>
<p> More details about XAI Grok: https://docs.x.ai/docs/overview#featured-models </p>
</details>
<details>
<summary>Example (Claude)</summary>
<p> Make sure you have prepared your ANTHROPIC API KEY as an env variable <code>ANTHROPIC_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "Anthropic", {"model": "claude-sonnet-4-0"})</code></pre>
<p> More details about Anthropic Claude: https://docs.anthropic.com/en/home </p>
</details>
<details>
<summary>Example (Google Gemini)</summary>
<p> Make sure you have prepared your GEMINI API KEY as an env variable <code>GEMINI_API_KEY</code>.</p>
<pre><code>config.set_provider_config('llm', 'Gemini', { 'model': 'gemini-2.0-flash' })</code></pre>
<p> You need to install gemini before running, execute: <code>pip install google-genai</code>. More details about Gemini: https://ai.google.dev/gemini-api/docs </p>
</details>
<details>
<summary>Example (DeepSeek from PPIO)</summary>
<p> Make sure you have prepared your PPIO API KEY as an env variable <code>PPIO_API_KEY</code>. You can create an API Key <a href="https://ppinfra.com/settings/key-management?utm_source=github_deep-searcher">here</a>. </p>
<pre><code>config.set_provider_config("llm", "PPIO", {"model": "deepseek/deepseek-r1-turbo"})</code></pre>
<p> More details about PPIO: https://ppinfra.com/docs/get-started/quickstart.html?utm_source=github_deep-searcher </p>
</details>
<details>
<summary>Example (Ollama)</summary>
<p> Follow <a href="https://github.com/jmorganca/ollama">these instructions</a> to set up and run a local Ollama instance:</p>
<p> <a href="https://ollama.ai/download">Download</a> and install Ollama onto the available supported platforms (including Windows Subsystem for Linux).</p>
<p> View a list of available models via the <a href="https://ollama.ai/library">model library</a>.</p>
<p> Fetch available LLM models via <code>ollama pull &lt;name-of-model&gt;</code></p>
<p> Example: <code>ollama pull qwen3</code></p>
<p> To chat directly with a model from the command line, use <code>ollama run &lt;name-of-model&gt;</code>.</p>
<p> By default, Ollama has a REST API for running and managing models on <a href="http://localhost:11434">http://localhost:11434</a>.</p>
<pre><code>config.set_provider_config("llm", "Ollama", {"model": "qwen3"})</code></pre>
</details>
<details>
<summary>Example (Volcengine)</summary>
<p> Make sure you have prepared your Volcengine API KEY as an env variable <code>VOLCENGINE_API_KEY</code>. You can create an API Key <a href="https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey">here</a>. </p>
<pre><code>config.set_provider_config("llm", "Volcengine", {"model": "deepseek-r1-250120"})</code></pre>
<p> More details about Volcengine: https://www.volcengine.com/docs/82379/1099455?utm_source=github_deep-searcher </p>
</details>
<details>
<summary>Example (GLM)</summary>
<p> Make sure you have prepared your GLM API KEY as an env variable <code>GLM_API_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "GLM", {"model": "glm-4-plus"})</code></pre>
<p> You need to install zhipuai before running, execute: <code>pip install zhipuai</code>. More details about GLM: https://bigmodel.cn/dev/welcome </p>
</details>
<details>
<summary>Example (Amazon Bedrock)</summary>
<p> Make sure you have prepared your Amazon Bedrock API KEY as an env variable <code>AWS_ACCESS_KEY_ID</code> and <code>AWS_SECRET_ACCESS_KEY</code>.</p>
<pre><code>config.set_provider_config("llm", "Bedrock", {"model": "us.deepseek.r1-v1:0"})</code></pre>
<p> You need to install boto3 before running, execute: <code>pip install boto3</code>. More details about Amazon Bedrock: https://docs.aws.amazon.com/bedrock/ </p>
</details>
<details>
<summary>Example (IBM watsonx.ai)</summary>
<p> Make sure you have prepared your watsonx.ai credentials as env variables <code>WATSONX_APIKEY</code>, <code>WATSONX_URL</code>, and <code>WATSONX_PROJECT_ID</code>.</p>
<pre><code>config.set_provider_config("llm", "watsonx", {"model": "us.deepseek.r1-v1:0"})</code></pre>
<p> You need to install ibm-watsonx-ai before running, execute: <code>pip install ibm-watsonx-ai</code>. More details about IBM watsonx.ai: https://www.ibm.com/products/watsonx-ai/foundation-models </p>
</details>
#### Embedding Model Configuration
<pre><code>config.set_provider_config("embedding", "(EmbeddingModelName)", "(Arguments dict)")</code></pre>
<p>The "EmbeddingModelName" can be one of the following: ["MilvusEmbedding", "OpenAIEmbedding", "VoyageEmbedding", "SiliconflowEmbedding", "PPIOEmbedding", "NovitaEmbedding"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the embedding model class.</p>
<details>
<summary>Example (OpenAI embedding)</summary>
<p> Make sure you have prepared your OpenAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})</code></pre>
<p> More details about OpenAI models: https://platform.openai.com/docs/guides/embeddings/use-cases </p>
</details>
<details>
<summary>Example (OpenAI embedding Azure)</summary>
<p> Make sure you have prepared your OpenAI API KEY as an env variable <code>OPENAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "OpenAIEmbedding", {
"model": "text-embedding-ada-002",
"azure_endpoint": "https://<youraifoundry>.openai.azure.com/",
"api_version": "2023-05-15"
})</code></pre>
</details>
<details>
<summary>Example (Pymilvus built-in embedding model)</summary>
<p> To use the built-in embedding models in Pymilvus, you can set the model name to <code>"default"</code>, <code>"BAAI/bge-base-en-v1.5"</code>, <code>"BAAI/bge-large-en-v1.5"</code>, <code>"jina-embeddings-v3"</code>, etc. <br/>
See [milvus_embedding.py](deepsearcher/embedding/milvus_embedding.py) for more details. </p>
<pre><code>config.set_provider_config("embedding", "MilvusEmbedding", {"model": "BAAI/bge-base-en-v1.5"})</code></pre>
<pre><code>config.set_provider_config("embedding", "MilvusEmbedding", {"model": "jina-embeddings-v3"})</code></pre>
<p> For Jina's embedding model, you need <code>JINAAI_API_KEY</code>.</p>
<p> You need to install pymilvus model before running, execute: <code>pip install pymilvus.model</code>. More details about Pymilvus: https://milvus.io/docs/embeddings.md </p>
</details>
<details>
<summary>Example (VoyageAI embedding)</summary>
<p> Make sure you have prepared your VOYAGE API KEY as an env variable <code>VOYAGE_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "VoyageEmbedding", {"model": "voyage-3"})</code></pre>
<p> You need to install voyageai before running, execute: <code>pip install voyageai</code>. More details about VoyageAI: https://docs.voyageai.com/embeddings/ </p>
</details>
<details>
<summary>Example (Amazon Bedrock embedding)</summary>
<pre><code>config.set_provider_config("embedding", "BedrockEmbedding", {"model": "amazon.titan-embed-text-v2:0"})</code></pre>
<p> You need to install boto3 before running, execute: <code>pip install boto3</code>. More details about Amazon Bedrock: https://docs.aws.amazon.com/bedrock/ </p>
</details>
<details>
<summary>Example (Novita AI embedding)</summary>
<p> Make sure you have prepared your Novita AI API KEY as an env variable <code>NOVITA_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "NovitaEmbedding", {"model": "baai/bge-m3"})</code></pre>
<p> More details about Novita AI: https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link </p>
</details>
<details>
<summary>Example (Siliconflow embedding)</summary>
<p> Make sure you have prepared your Siliconflow API KEY as an env variable <code>SILICONFLOW_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "SiliconflowEmbedding", {"model": "BAAI/bge-m3"})</code></pre>
<p> More details about Siliconflow: https://docs.siliconflow.cn/en/api-reference/embeddings/create-embeddings </p>
</details>
<details>
<summary>Example (Volcengine embedding)</summary>
<p> Make sure you have prepared your Volcengine API KEY as an env variable <code>VOLCENGINE_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "VolcengineEmbedding", {"model": "doubao-embedding-text-240515"})</code></pre>
<p> More details about Volcengine: https://www.volcengine.com/docs/82379/1302003 </p>
</details>
<details>
<summary>Example (GLM embedding)</summary>
<p> Make sure you have prepared your GLM API KEY as an env variable <code>GLM_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "GLMEmbedding", {"model": "embedding-3"})</code></pre>
<p> You need to install zhipuai before running, execute: <code>pip install zhipuai</code>. More details about GLM: https://bigmodel.cn/dev/welcome </p>
</details>
<details>
<summary>Example (Google Gemini embedding)</summary>
<p> Make sure you have prepared your Gemini API KEY as an env variable <code>GEMINI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "GeminiEmbedding", {"model": "text-embedding-004"})</code></pre>
<p> You need to install gemini before running, execute: <code>pip install google-genai</code>. More details about Gemini: https://ai.google.dev/gemini-api/docs </p>
</details>
<details>
<summary>Example (Ollama embedding)</summary>
<pre><code>config.set_provider_config("embedding", "OllamaEmbedding", {"model": "bge-m3"})</code></pre>
<p> You need to install ollama before running, execute: <code>pip install ollama</code>. More details about Ollama Python SDK: https://github.com/ollama/ollama-python </p>
</details>
<details>
<summary>Example (PPIO embedding)</summary>
<p> Make sure you have prepared your PPIO API KEY as an env variable <code>PPIO_API_KEY</code>.</p>
<pre><code>config.set_provider_config("embedding", "PPIOEmbedding", {"model": "baai/bge-m3"})</code></pre>
<p> More details about PPIO: https://ppinfra.com/docs/get-started/quickstart.html?utm_source=github_deep-searcher </p>
</details>
<details>
<summary>Example (FastEmbed embedding)</summary>
<pre><code>config.set_provider_config("embedding", "FastEmbedEmbedding", {"model": "intfloat/multilingual-e5-large"})</code></pre>
<p> You need to install fastembed before running, execute: <code>pip install fastembed</code>. More details about fastembed: https://github.com/qdrant/fastembed </p>
</details>
<details>
<summary>Example (IBM watsonx.ai embedding)</summary>
<p> Make sure you have prepared your WatsonX credentials as env variables <code>WATSONX_APIKEY</code>, <code>WATSONX_URL</code>, and <code>WATSONX_PROJECT_ID</code>.</p>
<pre><code>config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "ibm/slate-125m-english-rtrvr-v2"})</code></pre>
<pre><code>config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "sentence-transformers/all-minilm-l6-v2"})</code></pre>
<p> You need to install ibm-watsonx-ai before running, execute: <code>pip install ibm-watsonx-ai</code>. More details about IBM watsonx.ai: https://www.ibm.com/products/watsonx-ai/foundation-models </p>
</details>
#### Vector Database Configuration
<pre><code>config.set_provider_config("vector_db", "(VectorDBName)", "(Arguments dict)")</code></pre>
<p>The "VectorDBName" can be one of the following: ["Milvus"] (Under development)</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the Vector Database class.</p>
<details>
<summary>Example (Milvus)</summary>
<pre><code>config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})</code></pre>
<p> More details about Milvus Config:</p>
<ul>
<li>
Setting the <code>uri</code> as a local file, e.g. <code>./milvus.db</code>, is the most convenient method, as it automatically utilizes <a href="https://milvus.io/docs/milvus_lite.md" target="_blank">Milvus Lite</a> to store all data in this file.
</li>
</ul>
<ul>
<li>
If you have a large-scale dataset, you can set up a more performant Milvus server using
<a href="https://milvus.io/docs/quickstart.md" target="_blank">Docker or Kubernetes</a>.
In this setup, use the server URI, e.g., <code>http://localhost:19530</code>, as your <code>uri</code>.
You can also use any other connection parameters supported by Milvus such as <code>host</code>, <code>user</code>, <code>password</code>, or <code>secure</code>.
</li>
</ul>
<ul>
<li>
If you want to use <a href="https://zilliz.com/cloud" target="_blank">Zilliz Cloud</a>,
the fully managed cloud service for Milvus, adjust the <code>uri</code> and <code>token</code>
according to the <a href="https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details"
target="_blank">Public Endpoint and API Key</a> in Zilliz Cloud.
</li>
</ul>
</details>
<details>
<summary>Example (Azure AI Search)</summary>
<pre><code>config.set_provider_config("vector_db", "AzureSearch", {
"endpoint": "https://<yourazureaisearch>.search.windows.net",
"index_name": "<yourindex>",
"api_key": "<yourkey>",
"vector_field": ""
})</code></pre>
<p> For more details, refer to the Azure AI Search documentation.</p>
</details>
#### File Loader Configuration
<pre><code>config.set_provider_config("file_loader", "(FileLoaderName)", "(Arguments dict)")</code></pre>
<p>The "FileLoaderName" can be one of the following: ["PDFLoader", "TextLoader", "UnstructuredLoader"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the File Loader class.</p>
<details>
<summary>Example (Unstructured)</summary>
<p>You can use Unstructured in two ways:</p>
<ul>
<li>With API: Set environment variables <code>UNSTRUCTURED_API_KEY</code> and <code>UNSTRUCTURED_API_URL</code></li>
<li>Without API: Use the local processing mode by simply not setting these environment variables</li>
</ul>
<pre><code>config.set_provider_config("file_loader", "UnstructuredLoader", {})</code></pre>
<ul>
<li>Currently supported file types: ["pdf"] (Under development)</li>
<li>Installation requirements:
<ul>
<li>Install ingest pipeline: <code>pip install unstructured-ingest</code></li>
<li>For all document formats: <code>pip install "unstructured[all-docs]"</code></li>
<li>For specific formats (e.g., PDF only): <code>pip install "unstructured[pdf]"</code></li>
</ul>
</li>
<li>More information:
<ul>
<li>Unstructured documentation: <a href="https://docs.unstructured.io/ingestion/overview">https://docs.unstructured.io/ingestion/overview</a></li>
<li>Installation guide: <a href="https://docs.unstructured.io/open-source/installation/full-installation">https://docs.unstructured.io/open-source/installation/full-installation</a></li>
</ul>
</li>
</ul>
</details>
<details>
<summary>Example (Docling)</summary>
<pre><code>config.set_provider_config("file_loader", "DoclingLoader", {})</code></pre>
<p> Currently supported file types: please refer to the Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats </p>
<p> You need to install docling before running, execute: <code>pip install docling</code>. More details about Docling: https://docling-project.github.io/docling/ </p>
</details>
#### Web Crawler Configuration
<pre><code>config.set_provider_config("web_crawler", "(WebCrawlerName)", "(Arguments dict)")</code></pre>
<p>The "WebCrawlerName" can be one of the following: ["FireCrawlCrawler", "Crawl4AICrawler", "JinaCrawler"]</p>
<p> The "Arguments dict" is a dictionary that contains the necessary arguments for the Web Crawler class.</p>
<details>
<summary>Example (FireCrawl)</summary>
<p> Make sure you have prepared your FireCrawl API KEY as an env variable <code>FIRECRAWL_API_KEY</code>.</p>
<pre><code>config.set_provider_config("web_crawler", "FireCrawlCrawler", {})</code></pre>
<p> More details about FireCrawl: https://docs.firecrawl.dev/introduction </p>
</details>
<details>
<summary>Example (Crawl4AI)</summary>
<p> Make sure you have run <code>crawl4ai-setup</code> in your environment.</p>
<pre><code>config.set_provider_config("web_crawler", "Crawl4AICrawler", {"browser_config": {"headless": True, "verbose": True}})</code></pre>
<p> You need to install crawl4ai before running, execute: <code>pip install crawl4ai</code>. More details about Crawl4AI: https://docs.crawl4ai.com/ </p>
</details>
<details>
<summary>Example (Jina Reader)</summary>
<p> Make sure you have prepared your Jina Reader API KEY as an env variable <code>JINA_API_TOKEN</code> or <code>JINAAI_API_KEY</code>.</p>
<pre><code>config.set_provider_config("web_crawler", "JinaCrawler", {})</code></pre>
<p> More details about Jina Reader: https://jina.ai/reader/ </p>
</details>
<details>
<summary>Example (Docling)</summary>
<pre><code>config.set_provider_config("web_crawler", "DoclingCrawler", {})</code></pre>
<p> Currently supported file types: please refer to the Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats </p>
<p> You need to install docling before running, execute: <code>pip install docling</code>. More details about Docling: https://docling-project.github.io/docling/ </p>
</details>
### Python CLI Mode
#### Load
```shell
deepsearcher load "your_local_path_or_url"
# load into a specific collection
deepsearcher load "your_local_path_or_url" --collection_name "your_collection_name" --collection_desc "your_collection_description"
```
Example loading from local file:
```shell
deepsearcher load "/path/to/your/local/file.pdf"
# or more files at once
deepsearcher load "/path/to/your/local/file1.pdf" "/path/to/your/local/file2.md"
```
Example loading from a URL (*set `FIRECRAWL_API_KEY` in your environment variables, see [FireCrawl](https://docs.firecrawl.dev/introduction) for more details*):
```shell
deepsearcher load "https://www.wikiwand.com/en/articles/DeepSeek"
```
#### Query
```shell
deepsearcher query "Write a report about xxx."
```
More help information
```shell
deepsearcher --help
```
For more help information about a specific subcommand, you can use `deepsearcher [subcommand] --help`.
```shell
deepsearcher load --help
deepsearcher query --help
```
### Deployment
#### Configure modules
You can configure all arguments by modifying [config.yaml](./config.yaml) to set up your system with default modules.
For example, set your `OPENAI_API_KEY` in the `llm` section of the YAML file.
#### Start service
The main script will run a FastAPI service with default address `localhost:8000`.
```shell
$ python main.py
```
#### Access via browser
Open the URL http://localhost:8000/docs in your browser to access the web service.
Click the "Try it out" button; it lets you fill in the parameters and interact with the API directly.
---
## ❓ Q&A
**Q1**: Why do I fail to parse the LLM output format? / How should I select the LLM?
**A1**: Small LLMs struggle to follow the prompt and generate the desired response, which usually causes format parsing problems. A better practice is to use a large reasoning model, e.g. DeepSeek-R1 671B, the OpenAI o-series, Claude 4 Sonnet, etc., as your LLM.
---
**Q2**:
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like GPTCache/paraphrase-albert-small-v2 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
**A2**: This is mainly caused by failing to reach Hugging Face, which may be a network or permission problem. You can try the following two methods:
1. If it is a network problem, set up a proxy by adding the following environment variable.
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
2. If it is a permission problem, set up a personal access token by adding the following environment variable.
```bash
export HUGGING_FACE_HUB_TOKEN=xxxx
```
---
**Q3**: DeepSearcher doesn't run in Jupyter notebook.
**A3**: Install `nest_asyncio` and then put this code block at the top of your Jupyter notebook.
```bash
pip install nest_asyncio
```
```python
import nest_asyncio
nest_asyncio.apply()
```
---
## 🔧 Module Support
### 🔹 Embedding Models
- [Open-source embedding models](https://milvus.io/docs/embeddings.md)
- [OpenAI](https://platform.openai.com/docs/guides/embeddings/use-cases) (`OPENAI_API_KEY` env variable required)
- [VoyageAI](https://docs.voyageai.com/embeddings/) (`VOYAGE_API_KEY` env variable required)
- [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/) (`AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` env variable required)
- [FastEmbed](https://qdrant.github.io/fastembed/)
- [PPIO](https://ppinfra.com/model-api/product/llm-api?utm_source=github_deep-searcher) (`PPIO_API_KEY` env variable required)
- [Novita AI](https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link) (`NOVITA_API_KEY` env variable required)
- [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmembedding) (`WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` env variables required)
### 🔹 LLM Support
- [OpenAI](https://platform.openai.com/docs/models) (`OPENAI_API_KEY` env variable required)
- [DeepSeek](https://api-docs.deepseek.com/) (`DEEPSEEK_API_KEY` env variable required)
- [XAI Grok](https://x.ai/api) (`XAI_API_KEY` env variable required)
- [Anthropic Claude](https://docs.anthropic.com/en/home) (`ANTHROPIC_API_KEY` env variable required)
- [SiliconFlow Inference Service](https://docs.siliconflow.cn/en/userguide/introduction) (`SILICONFLOW_API_KEY` env variable required)
- [PPIO](https://ppinfra.com/model-api/product/llm-api?utm_source=github_deep-searcher) (`PPIO_API_KEY` env variable required)
- [TogetherAI Inference Service](https://docs.together.ai/docs/introduction) (`TOGETHER_API_KEY` env variable required)
- [Google Gemini](https://ai.google.dev/gemini-api/docs) (`GEMINI_API_KEY` env variable required)
- [SambaNova Cloud Inference Service](https://docs.sambanova.ai/) (`SAMBANOVA_API_KEY` env variable required)
- [Ollama](https://ollama.com/)
- [Novita AI](https://novita.ai/docs/guides/introduction?utm_source=github_deep-searcher&utm_medium=github_readme&utm_campaign=link) (`NOVITA_API_KEY` env variable required)
- [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmfm) (`WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` env variable required)
### 🔹 Document Loader
- Local File
- PDF (with txt/md) loader
- [Unstructured](https://unstructured.io/) (under development) (`UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_URL` env variables required)
- Web Crawler
- [FireCrawl](https://docs.firecrawl.dev/introduction) (`FIRECRAWL_API_KEY` env variable required)
- [Jina Reader](https://jina.ai/reader/) (`JINA_API_TOKEN` env variable required)
- [Crawl4AI](https://docs.crawl4ai.com/) (You should run command `crawl4ai-setup` for the first time)
### 🔹 Vector Database Support
- [Milvus](https://milvus.io/) and [Zilliz Cloud](https://www.zilliz.com/) (fully managed Milvus)
- [Qdrant](https://qdrant.tech/)
---
## 📊 Evaluation
See the [Evaluation](./evaluation) directory for more details.
---
## 📌 Future Plans
- Enhance web crawling functionality
- Support more vector databases (e.g., FAISS...)
- Add support for additional large models
- Provide RESTful API interface (**DONE**)
We welcome contributions! Star & Fork the project and help us build a more powerful DeepSearcher! 🎯

BIN
assets/pic/deep-searcher-arch.png

Binary image, 307 KiB (not shown).

BIN
assets/pic/demo.gif

Binary image, 3.4 MiB (not shown).

BIN
assets/pic/logo.png

Binary image, 54 KiB (not shown).

5
deepsearcher/__init__.py

@@ -0,0 +1,5 @@
import os
# ignore the warnings
# None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

12
deepsearcher/agent/__init__.py

@@ -0,0 +1,12 @@
from .base import BaseAgent, RAGAgent
from .chain_of_rag import ChainOfRAG
from .deep_search import DeepSearch
from .naive_rag import NaiveRAG

__all__ = [
    "ChainOfRAG",
    "DeepSearch",
    "NaiveRAG",
    "BaseAgent",
    "RAGAgent",
]

103
deepsearcher/agent/base.py

@@ -0,0 +1,103 @@
from abc import ABC
from typing import Any, List, Tuple

from deepsearcher.vector_db import RetrievalResult


def describe_class(description):
    """
    Decorator function to add a description to a class.

    This decorator adds a __description__ attribute to the decorated class,
    which can be used for documentation or introspection.

    Args:
        description: The description to add to the class.

    Returns:
        A decorator function that adds the description to the class.
    """

    def decorator(cls):
        cls.__description__ = description
        return cls

    return decorator


class BaseAgent(ABC):
    """
    Abstract base class for all agents in the DeepSearcher system.

    This class defines the basic interface for agents, including initialization
    and invocation methods.
    """

    def __init__(self, **kwargs):
        """
        Initialize a BaseAgent object.

        Args:
            **kwargs: Arbitrary keyword arguments.
        """
        pass

    def invoke(self, query: str, **kwargs) -> Any:
        """
        Invoke the agent and return the result.

        Args:
            query: The query string.
            **kwargs: Additional keyword arguments.

        Returns:
            The result of invoking the agent.
        """


class RAGAgent(BaseAgent):
    """
    Abstract base class for Retrieval-Augmented Generation (RAG) agents.

    This class extends BaseAgent with methods specific to RAG, including
    retrieval and query methods.
    """

    def __init__(self, **kwargs):
        """
        Initialize a RAGAgent object.

        Args:
            **kwargs: Arbitrary keyword arguments.
        """
        pass

    def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
        """
        Retrieve document results from the knowledge base.

        Args:
            query: The query string.
            **kwargs: Additional keyword arguments.

        Returns:
            A tuple containing:
                - the retrieved results
                - the total number of token usages of the LLM
                - any additional metadata, which can be an empty dictionary
        """

    def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
        """
        Query the agent and return the answer.

        Args:
            query: The query string.
            **kwargs: Additional keyword arguments.

        Returns:
            A tuple containing:
                - the result generated from LLM
                - the retrieved document results
                - the total number of token usages of the LLM
        """
326
deepsearcher/agent/chain_of_rag.py

@ -0,0 +1,326 @@
from typing import List, Tuple
from deepsearcher.agent.base import RAGAgent, describe_class
from deepsearcher.agent.collection_router import CollectionRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult
from deepsearcher.vector_db.base import BaseVectorDB, deduplicate_results
FOLLOWUP_QUERY_PROMPT = """You are using a search tool to answer the main query by iteratively searching the database. Given the following intermediate queries and answers, generate a new simple follow-up question that can help answer the main query. You may rephrase or decompose the main query when previous answers are not helpful. Ask simple follow-up questions only as the search tool may not understand complex questions.
## Previous intermediate queries and answers
{intermediate_context}
## Main query to answer
{query}
Respond with a simple follow-up question that will help answer the main query, do not explain yourself or output anything else.
"""
INTERMEDIATE_ANSWER_PROMPT = """Given the following documents, generate an appropriate answer for the query. DO NOT hallucinate any information, only use the provided documents to generate the answer. Respond "No relevant information found" if the documents do not contain useful information.
## Documents
{retrieved_documents}
## Query
{sub_query}
Respond with a concise answer only, do not explain yourself or output anything else.
"""
FINAL_ANSWER_PROMPT = """Given the following intermediate queries and answers, generate a final answer for the main query by combining relevant information. Note that intermediate answers are generated by an LLM and may not always be accurate.
## Documents
{retrieved_documents}
## Intermediate queries and answers
{intermediate_context}
## Main query
{query}
Respond with an appropriate answer only, do not explain yourself or output anything else.
"""
REFLECTION_PROMPT = """Given the following intermediate queries and answers, judge whether you have enough information to answer the main query. If you believe you have enough information, respond with "Yes", otherwise respond with "No".
## Intermediate queries and answers
{intermediate_context}
## Main query
{query}
Respond with "Yes" or "No" only, do not explain yourself or output anything else.
"""
GET_SUPPORTED_DOCS_PROMPT = """Given the following documents, select the ones that support the Q-A pair.
## Documents
{retrieved_documents}
## Q-A Pair
### Question
{query}
### Answer
{answer}
Respond with a python list of indices of the selected documents.
"""
@describe_class(
"This agent can decompose complex queries and gradually find the fact information of sub-queries. "
"It is very suitable for handling concrete factual queries and multi-hop questions."
)
class ChainOfRAG(RAGAgent):
"""
Chain of Retrieval-Augmented Generation (RAG) agent implementation.
This agent implements a multi-step RAG process where each step can refine
the query and retrieval process based on previous results, creating a chain
of increasingly focused and relevant information retrieval and generation.
Inspired by: https://arxiv.org/pdf/2501.14342
"""
def __init__(
self,
llm: BaseLLM,
embedding_model: BaseEmbedding,
vector_db: BaseVectorDB,
max_iter: int = 4,
early_stopping: bool = False,
route_collection: bool = True,
text_window_splitter: bool = True,
**kwargs,
):
"""
Initialize the ChainOfRAG agent with configuration parameters.
Args:
llm (BaseLLM): The language model to use for generating answers.
embedding_model (BaseEmbedding): The embedding model to use for embedding queries.
vector_db (BaseVectorDB): The vector database to search for relevant documents.
max_iter (int, optional): The maximum number of iterations for the RAG process. Defaults to 4.
early_stopping (bool, optional): Whether to use early stopping. Defaults to False.
route_collection (bool, optional): Whether to route the query to specific collections. Defaults to True.
text_window_splitter (bool, optional): Whether to use the text window splitter. Defaults to True.
"""
self.llm = llm
self.embedding_model = embedding_model
self.vector_db = vector_db
self.max_iter = max_iter
self.early_stopping = early_stopping
self.route_collection = route_collection
self.collection_router = CollectionRouter(
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension
)
self.text_window_splitter = text_window_splitter
def _reflect_get_subquery(self, query: str, intermediate_context: List[str]) -> Tuple[str, int]:
chat_response = self.llm.chat(
[
{
"role": "user",
"content": FOLLOWUP_QUERY_PROMPT.format(
query=query,
intermediate_context="\n".join(intermediate_context),
),
}
]
)
return self.llm.remove_think(chat_response.content), chat_response.total_tokens
def _retrieve_and_answer(self, query: str) -> Tuple[str, List[RetrievalResult], int]:
consume_tokens = 0
if self.route_collection:
selected_collections, n_token_route = self.collection_router.invoke(
query=query, dim=self.embedding_model.dimension
)
else:
selected_collections = self.collection_router.all_collections
n_token_route = 0
consume_tokens += n_token_route
all_retrieved_results = []
for collection in selected_collections:
log.color_print(f"<search> Search [{query}] in [{collection}]... </search>\n")
query_vector = self.embedding_model.embed_query(query)
retrieved_results = self.vector_db.search_data(
collection=collection, vector=query_vector, query_text=query
)
all_retrieved_results.extend(retrieved_results)
all_retrieved_results = deduplicate_results(all_retrieved_results)
chat_response = self.llm.chat(
[
{
"role": "user",
"content": INTERMEDIATE_ANSWER_PROMPT.format(
retrieved_documents=self._format_retrieved_results(all_retrieved_results),
sub_query=query,
),
}
]
)
return (
self.llm.remove_think(chat_response.content),
all_retrieved_results,
consume_tokens + chat_response.total_tokens,
)
def _get_supported_docs(
self,
retrieved_results: List[RetrievalResult],
query: str,
intermediate_answer: str,
) -> Tuple[List[RetrievalResult], int]:
supported_retrieved_results = []
token_usage = 0
if "No relevant information found" not in intermediate_answer:
chat_response = self.llm.chat(
[
{
"role": "user",
"content": GET_SUPPORTED_DOCS_PROMPT.format(
retrieved_documents=self._format_retrieved_results(retrieved_results),
query=query,
answer=intermediate_answer,
),
}
]
)
supported_doc_indices = self.llm.literal_eval(chat_response.content)
supported_retrieved_results = [
retrieved_results[int(i)]
for i in supported_doc_indices
if int(i) < len(retrieved_results)
]
token_usage = chat_response.total_tokens
return supported_retrieved_results, token_usage
def _check_has_enough_info(
self, query: str, intermediate_contexts: List[str]
) -> Tuple[bool, int]:
if not intermediate_contexts:
return False, 0
chat_response = self.llm.chat(
[
{
"role": "user",
"content": REFLECTION_PROMPT.format(
query=query,
intermediate_context="\n".join(intermediate_contexts),
),
}
]
)
has_enough_info = self.llm.remove_think(chat_response.content).strip().lower() == "yes"
return has_enough_info, chat_response.total_tokens
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
"""
Retrieves relevant documents based on the input query and iteratively refines the search.
This method iteratively refines the search query based on intermediate results, retrieves documents,
and filters out supported documents. It keeps track of the intermediate contexts and token usage.
Args:
query (str): The initial search query.
**kwargs: Additional keyword arguments.
- max_iter (int, optional): The maximum number of iterations for refinement. Defaults to self.max_iter.
Returns:
Tuple[List[RetrievalResult], int, dict]: A tuple containing:
- List[RetrievalResult]: The list of all retrieved and deduplicated results.
- int: The total token usage across all iterations.
- dict: A dictionary containing additional information, including the intermediate contexts.
"""
max_iter = kwargs.pop("max_iter", self.max_iter)
intermediate_contexts = []
all_retrieved_results = []
token_usage = 0
for iter in range(max_iter):
log.color_print(f">> Iteration: {iter + 1}\n")
followup_query, n_token0 = self._reflect_get_subquery(query, intermediate_contexts)
intermediate_answer, retrieved_results, n_token1 = self._retrieve_and_answer(
followup_query
)
supported_retrieved_results, n_token2 = self._get_supported_docs(
retrieved_results, followup_query, intermediate_answer
)
all_retrieved_results.extend(supported_retrieved_results)
intermediate_idx = len(intermediate_contexts) + 1
intermediate_contexts.append(
f"Intermediate query{intermediate_idx}: {followup_query}\nIntermediate answer{intermediate_idx}: {intermediate_answer}"
)
token_usage += n_token0 + n_token1 + n_token2
if self.early_stopping:
has_enough_info, n_token_check = self._check_has_enough_info(
query, intermediate_contexts
)
token_usage += n_token_check
if has_enough_info:
log.color_print(
f"<think> Early stopping after iteration {iter + 1}: Have enough information to answer the main query. </think>\n"
)
break
all_retrieved_results = deduplicate_results(all_retrieved_results)
additional_info = {"intermediate_context": intermediate_contexts}
return all_retrieved_results, token_usage, additional_info
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
"""
Executes a query and returns the final answer along with all retrieved results and total token usage.
This method initiates a query, retrieves relevant documents, and then summarizes the answer based on the retrieved documents and intermediate contexts. It logs the final answer and returns the answer content, all retrieved results, and the total token usage including the tokens used for the final answer.
Args:
query (str): The initial query to execute.
**kwargs: Additional keyword arguments to pass to the `retrieve` method.
Returns:
Tuple[str, List[RetrievalResult], int]: A tuple containing:
- str: The final answer content.
- List[RetrievalResult]: The list of all retrieved and deduplicated results.
- int: The total token usage across all iterations, including the final answer.
"""
all_retrieved_results, n_token_retrieval, additional_info = self.retrieve(query, **kwargs)
intermediate_context = additional_info["intermediate_context"]
log.color_print(
f"<think> Summarize answer from all {len(all_retrieved_results)} retrieved chunks... </think>\n"
)
chat_response = self.llm.chat(
[
{
"role": "user",
"content": FINAL_ANSWER_PROMPT.format(
retrieved_documents=self._format_retrieved_results(all_retrieved_results),
intermediate_context="\n".join(intermediate_context),
query=query,
),
}
]
)
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(self.llm.remove_think(chat_response.content))
return (
self.llm.remove_think(chat_response.content),
all_retrieved_results,
n_token_retrieval + chat_response.total_tokens,
)
def _format_retrieved_results(self, retrieved_results: List[RetrievalResult]) -> str:
formatted_documents = []
for i, result in enumerate(retrieved_results):
if self.text_window_splitter and "wider_text" in result.metadata:
text = result.metadata["wider_text"]
else:
text = result.text
formatted_documents.append(f"<Document {i}>\n{text}\n</Document {i}>")
return "\n".join(formatted_documents)

98
deepsearcher/agent/collection_router.py

@ -0,0 +1,98 @@
from typing import List, Tuple
from deepsearcher.agent.base import BaseAgent
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB
COLLECTION_ROUTE_PROMPT = """
I provide you with collection_name(s) and corresponding collection_description(s). Please select the collection names that may be related to the question and return a python list of str. If there is no collection related to the question, you can return an empty list.
"QUESTION": {question}
"COLLECTION_INFO": {collection_info}
When you return, you can ONLY return a json convertible python list of str, WITHOUT any other additional content. Your selected collection name list is:
"""
class CollectionRouter(BaseAgent):
"""
Routes queries to appropriate collections in the vector database.
This class analyzes the content of a query and determines which collections
in the vector database are most likely to contain relevant information.
"""
def __init__(self, llm: BaseLLM, vector_db: BaseVectorDB, dim: int, **kwargs):
"""
Initialize the CollectionRouter.
Args:
llm: The language model to use for analyzing queries.
vector_db: The vector database containing the collections.
dim: The dimension of the vector space to search in.
"""
self.llm = llm
self.vector_db = vector_db
self.all_collections = [
collection_info.collection_name
for collection_info in self.vector_db.list_collections(dim=dim)
]
def invoke(self, query: str, dim: int, **kwargs) -> Tuple[List[str], int]:
"""
Determine which collections are relevant for the given query.
This method analyzes the query content and selects collections that are
most likely to contain information relevant to answering the query.
Args:
query (str): The query to analyze.
dim (int): The dimension of the vector space to search in.
Returns:
Tuple[List[str], int]: A tuple containing:
- A list of selected collection names
- The token usage for the routing operation
"""
consume_tokens = 0
collection_infos = self.vector_db.list_collections(dim=dim)
if len(collection_infos) == 0:
log.color_print(
"No collections found in the vector database. Please check the database connection."
)
return [], 0
if len(collection_infos) == 1:
the_only_collection = collection_infos[0].collection_name
log.color_print(
f"<think> Perform search [{query}] on the vector DB collection: {the_only_collection} </think>\n"
)
return [the_only_collection], 0
vector_db_search_prompt = COLLECTION_ROUTE_PROMPT.format(
question=query,
collection_info=[
{
"collection_name": collection_info.collection_name,
"collection_description": collection_info.description,
}
for collection_info in collection_infos
],
)
chat_response = self.llm.chat(
messages=[{"role": "user", "content": vector_db_search_prompt}]
)
selected_collections = self.llm.literal_eval(chat_response.content)
consume_tokens += chat_response.total_tokens
for collection_info in collection_infos:
# If a collection has no description, always include it in the selection
if not collection_info.description:
selected_collections.append(collection_info.collection_name)
# Always include the default collection if it exists
if self.vector_db.default_collection == collection_info.collection_name:
selected_collections.append(collection_info.collection_name)
selected_collections = list(set(selected_collections))
log.color_print(
f"<think> Perform search [{query}] on the vector DB collections: {selected_collections} </think>\n"
)
return selected_collections, consume_tokens

319
deepsearcher/agent/deep_search.py

@ -0,0 +1,319 @@
import asyncio
from typing import List, Tuple
from deepsearcher.agent.base import RAGAgent, describe_class
from deepsearcher.agent.collection_router import CollectionRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult
from deepsearcher.vector_db.base import BaseVectorDB, deduplicate_results
SUB_QUERY_PROMPT = """To answer this question more comprehensively, please break down the original question into up to four sub-questions. Return as list of str.
If this is a very simple question and no decomposition is necessary, then keep the only one original question in the python code list.
Original Question: {original_query}
<EXAMPLE>
Example input:
"Explain deep learning"
Example output:
[
"What is deep learning?",
"What is the difference between deep learning and machine learning?",
"What is the history of deep learning?"
]
</EXAMPLE>
Provide your response in a python code list of str format:
"""
RERANK_PROMPT = """Based on the query questions and the retrieved chunk, to determine whether the chunk is helpful in answering any of the query question, you can only return "YES" or "NO", without any other information.
Query Questions: {query}
Retrieved Chunk: {retrieved_chunk}
Is the chunk helpful in answering the any of the questions?
"""
REFLECT_PROMPT = """Determine whether additional search queries are needed based on the original query, previous sub queries, and all retrieved document chunks. If further research is required, provide a Python list of up to 3 search queries. If no further research is required, return an empty list.
If the original query is to write a report, then you should prefer to generate some further queries rather than returning an empty list.
Original Query: {question}
Previous Sub Queries: {mini_questions}
Related Chunks:
{mini_chunk_str}
Respond exclusively in valid List of str format without any other text."""
SUMMARY_PROMPT = """You are a AI content analysis expert, good at summarizing content. Please summarize a specific and detailed answer or report based on the previous queries and the retrieved document chunks.
Original Query: {question}
Previous Sub Queries: {mini_questions}
Related Chunks:
{mini_chunk_str}
"""
@describe_class(
"This agent is suitable for handling general and simple queries, such as given a topic and then writing a report, survey, or article."
)
class DeepSearch(RAGAgent):
"""
Deep Search agent implementation for comprehensive information retrieval.
This agent performs a thorough search through the knowledge base, analyzing
multiple aspects of the query to provide comprehensive and detailed answers.
"""
def __init__(
self,
llm: BaseLLM,
embedding_model: BaseEmbedding,
vector_db: BaseVectorDB,
max_iter: int = 3,
route_collection: bool = True,
text_window_splitter: bool = True,
**kwargs,
):
"""
Initialize the DeepSearch agent.
Args:
llm: The language model to use for generating answers.
embedding_model: The embedding model to use for query embedding.
vector_db: The vector database to search for relevant documents.
max_iter: The maximum number of iterations for the search process.
route_collection: Whether to use a collection router for search.
text_window_splitter: Whether to use text_window splitter.
**kwargs: Additional keyword arguments for customization.
"""
self.llm = llm
self.embedding_model = embedding_model
self.vector_db = vector_db
self.max_iter = max_iter
self.route_collection = route_collection
self.collection_router = CollectionRouter(
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension
)
self.text_window_splitter = text_window_splitter
def _generate_sub_queries(self, original_query: str) -> Tuple[List[str], int]:
chat_response = self.llm.chat(
messages=[
{"role": "user", "content": SUB_QUERY_PROMPT.format(original_query=original_query)}
]
)
response_content = self.llm.remove_think(chat_response.content)
return self.llm.literal_eval(response_content), chat_response.total_tokens
async def _search_chunks_from_vectordb(self, query: str, sub_queries: List[str]):
consume_tokens = 0
if self.route_collection:
selected_collections, n_token_route = self.collection_router.invoke(
query=query, dim=self.embedding_model.dimension
)
else:
selected_collections = self.collection_router.all_collections
n_token_route = 0
consume_tokens += n_token_route
all_retrieved_results = []
query_vector = self.embedding_model.embed_query(query)
for collection in selected_collections:
log.color_print(f"<search> Search [{query}] in [{collection}]... </search>\n")
retrieved_results = self.vector_db.search_data(
collection=collection, vector=query_vector, query_text=query
)
if not retrieved_results or len(retrieved_results) == 0:
log.color_print(
f"<search> No relevant document chunks found in '{collection}'! </search>\n"
)
continue
accepted_chunk_num = 0
references = set()
for retrieved_result in retrieved_results:
chat_response = self.llm.chat(
messages=[
{
"role": "user",
"content": RERANK_PROMPT.format(
query=[query] + sub_queries,
retrieved_chunk=f"<chunk>{retrieved_result.text}</chunk>",
),
}
]
)
consume_tokens += chat_response.total_tokens
response_content = self.llm.remove_think(chat_response.content).strip()
if "YES" in response_content and "NO" not in response_content:
all_retrieved_results.append(retrieved_result)
accepted_chunk_num += 1
references.add(retrieved_result.reference)
if accepted_chunk_num > 0:
log.color_print(
f"<search> Accept {accepted_chunk_num} document chunk(s) from references: {list(references)} </search>\n"
)
else:
log.color_print(
f"<search> No document chunk accepted from '{collection}'! </search>\n"
)
return all_retrieved_results, consume_tokens
def _generate_gap_queries(
self, original_query: str, all_sub_queries: List[str], all_chunks: List[RetrievalResult]
) -> Tuple[List[str], int]:
reflect_prompt = REFLECT_PROMPT.format(
question=original_query,
mini_questions=all_sub_queries,
mini_chunk_str=self._format_chunk_texts([chunk.text for chunk in all_chunks])
if len(all_chunks) > 0
else "NO RELATED CHUNKS FOUND.",
)
chat_response = self.llm.chat([{"role": "user", "content": reflect_prompt}])
response_content = self.llm.remove_think(chat_response.content)
return self.llm.literal_eval(response_content), chat_response.total_tokens
def retrieve(self, original_query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
"""
Retrieve relevant documents from the knowledge base for the given query.
This method performs a deep search through the vector database to find
the most relevant documents for answering the query.
Args:
original_query (str): The query to search for.
**kwargs: Additional keyword arguments for customizing the retrieval.
Returns:
Tuple[List[RetrievalResult], int, dict]: A tuple containing:
- A list of retrieved document results
- The token usage for the retrieval operation
- Additional information about the retrieval process
"""
return asyncio.run(self.async_retrieve(original_query, **kwargs))
async def async_retrieve(
self, original_query: str, **kwargs
) -> Tuple[List[RetrievalResult], int, dict]:
max_iter = kwargs.pop("max_iter", self.max_iter)
### SUB QUERIES ###
log.color_print(f"<query> {original_query} </query>\n")
all_search_res = []
all_sub_queries = []
total_tokens = 0
sub_queries, used_token = self._generate_sub_queries(original_query)
total_tokens += used_token
if not sub_queries:
log.color_print("No sub queries were generated by the LLM. Exiting.")
return [], total_tokens, {}
else:
log.color_print(
f"<think> Break down the original query into new sub queries: {sub_queries}</think>\n"
)
all_sub_queries.extend(sub_queries)
sub_gap_queries = sub_queries
for iter in range(max_iter):
log.color_print(f">> Iteration: {iter + 1}\n")
search_res_from_vectordb = []
search_res_from_internet = [] # TODO
# Create all search tasks
search_tasks = [
self._search_chunks_from_vectordb(query, sub_gap_queries)
for query in sub_gap_queries
]
# Execute all tasks in parallel and wait for results
search_results = await asyncio.gather(*search_tasks)
# Merge all results
for result in search_results:
search_res, consumed_token = result
total_tokens += consumed_token
search_res_from_vectordb.extend(search_res)
search_res_from_vectordb = deduplicate_results(search_res_from_vectordb)
# search_res_from_internet = deduplicate_results(search_res_from_internet)
all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
if iter == max_iter - 1:
log.color_print("<think> Exceeded maximum iterations. Exiting. </think>\n")
break
### REFLECTION & GET GAP QUERIES ###
log.color_print("<think> Reflecting on the search results... </think>\n")
sub_gap_queries, consumed_token = self._generate_gap_queries(
original_query, all_sub_queries, all_search_res
)
total_tokens += consumed_token
if not sub_gap_queries or len(sub_gap_queries) == 0:
log.color_print("<think> No new search queries were generated. Exiting. </think>\n")
break
else:
log.color_print(
f"<think> New search queries for next iteration: {sub_gap_queries} </think>\n"
)
all_sub_queries.extend(sub_gap_queries)
all_search_res = deduplicate_results(all_search_res)
additional_info = {"all_sub_queries": all_sub_queries}
return all_search_res, total_tokens, additional_info
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
"""
Query the agent and generate an answer based on retrieved documents.
This method retrieves relevant documents and uses the language model
to generate a comprehensive answer to the query.
Args:
query (str): The query to answer.
**kwargs: Additional keyword arguments for customizing the query process.
Returns:
Tuple[str, List[RetrievalResult], int]: A tuple containing:
- The generated answer
- A list of retrieved document results
- The total token usage
"""
all_retrieved_results, n_token_retrieval, additional_info = self.retrieve(query, **kwargs)
if not all_retrieved_results or len(all_retrieved_results) == 0:
return f"No relevant information found for query '{query}'.", [], n_token_retrieval
all_sub_queries = additional_info["all_sub_queries"]
chunk_texts = []
for chunk in all_retrieved_results:
if self.text_window_splitter and "wider_text" in chunk.metadata:
chunk_texts.append(chunk.metadata["wider_text"])
else:
chunk_texts.append(chunk.text)
log.color_print(
f"<think> Summarize answer from all {len(all_retrieved_results)} retrieved chunks... </think>\n"
)
summary_prompt = SUMMARY_PROMPT.format(
question=query,
mini_questions=all_sub_queries,
mini_chunk_str=self._format_chunk_texts(chunk_texts),
)
chat_response = self.llm.chat([{"role": "user", "content": summary_prompt}])
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(self.llm.remove_think(chat_response.content))
return (
self.llm.remove_think(chat_response.content),
all_retrieved_results,
n_token_retrieval + chat_response.total_tokens,
)
def _format_chunk_texts(self, chunk_texts: List[str]) -> str:
chunk_str = ""
for i, chunk in enumerate(chunk_texts):
chunk_str += f"""<chunk_{i}>\n{chunk}\n</chunk_{i}>\n"""
return chunk_str

128
deepsearcher/agent/naive_rag.py

@ -0,0 +1,128 @@
from typing import List, Tuple
from deepsearcher.agent.base import RAGAgent
from deepsearcher.agent.collection_router import CollectionRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, RetrievalResult, deduplicate_results
SUMMARY_PROMPT = """You are a AI content analysis expert, good at summarizing content. Please summarize a specific and detailed answer or report based on the previous queries and the retrieved document chunks.
Original Query: {query}
Related Chunks:
{mini_chunk_str}
"""
class NaiveRAG(RAGAgent):
"""
Naive Retrieval-Augmented Generation agent implementation.
This agent implements a straightforward RAG approach, retrieving relevant
documents and generating answers without complex processing or refinement steps.
"""
def __init__(
self,
llm: BaseLLM,
embedding_model: BaseEmbedding,
vector_db: BaseVectorDB,
top_k: int = 10,
route_collection: bool = True,
text_window_splitter: bool = True,
**kwargs,
):
"""
Initialize the NaiveRAG agent.
Args:
llm: The language model to use for generating answers.
embedding_model: The embedding model to use for query embedding.
vector_db: The vector database to search for relevant documents.
top_k: The total number of chunks to retrieve, split across the selected collections.
route_collection: Whether to route the query to specific collections.
text_window_splitter: Whether to use the text window splitter.
**kwargs: Additional keyword arguments for customization.
"""
self.llm = llm
self.embedding_model = embedding_model
self.vector_db = vector_db
self.top_k = top_k
self.route_collection = route_collection
# Always build the router so that all_collections is available even when routing is disabled
self.collection_router = CollectionRouter(
llm=self.llm, vector_db=self.vector_db, dim=embedding_model.dimension
)
self.text_window_splitter = text_window_splitter
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
"""
Retrieve relevant documents from the knowledge base for the given query.
This method performs a basic search through the vector database to find
documents relevant to the query.
Args:
query (str): The query to search for.
**kwargs: Additional keyword arguments for customizing the retrieval.
Returns:
Tuple[List[RetrievalResult], int, dict]: A tuple containing:
- A list of retrieved document results
- The token usage for the retrieval operation
- Additional information about the retrieval process
"""
consume_tokens = 0
if self.route_collection:
selected_collections, n_token_route = self.collection_router.invoke(
query=query, dim=self.embedding_model.dimension
)
else:
selected_collections = self.collection_router.all_collections
n_token_route = 0
consume_tokens += n_token_route
all_retrieved_results = []
for collection in selected_collections:
retrieval_res = self.vector_db.search_data(
collection=collection,
vector=self.embedding_model.embed_query(query),
top_k=max(self.top_k // len(selected_collections), 1),
query_text=query,
)
all_retrieved_results.extend(retrieval_res)
all_retrieved_results = deduplicate_results(all_retrieved_results)
return all_retrieved_results, consume_tokens, {}
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
"""
Query the agent and generate an answer based on retrieved documents.
This method retrieves relevant documents and uses the language model
to generate a simple answer to the query.
Args:
query (str): The query to answer.
**kwargs: Additional keyword arguments for customizing the query process.
Returns:
Tuple[str, List[RetrievalResult], int]: A tuple containing:
- The generated answer
- A list of retrieved document results
- The total token usage
"""
all_retrieved_results, n_token_retrieval, _ = self.retrieve(query)
chunk_texts = []
for chunk in all_retrieved_results:
if self.text_window_splitter and "wider_text" in chunk.metadata:
chunk_texts.append(chunk.metadata["wider_text"])
else:
chunk_texts.append(chunk.text)
mini_chunk_str = ""
for i, chunk in enumerate(chunk_texts):
mini_chunk_str += f"""<chunk_{i}>\n{chunk}\n</chunk_{i}>\n"""
summary_prompt = SUMMARY_PROMPT.format(query=query, mini_chunk_str=mini_chunk_str)
chat_response = self.llm.chat([{"role": "user", "content": summary_prompt}])
final_answer = chat_response.content
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(final_answer)
return final_answer, all_retrieved_results, n_token_retrieval + chat_response.total_tokens

93
deepsearcher/agent/rag_router.py

@ -0,0 +1,93 @@
from typing import List, Optional, Tuple
from deepsearcher.agent import RAGAgent
from deepsearcher.llm.base import BaseLLM
from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult
RAG_ROUTER_PROMPT = """Given a list of agent indexes and corresponding descriptions, each agent has a specific function.
Given a query, select only one agent that best matches the agent handling the query, and return the index without any other information.
## Question
{query}
## Agent Indexes and Descriptions
{description_str}
Only return one agent index number that best matches the agent handling the query:
"""
class RAGRouter(RAGAgent):
"""
Routes queries to the most appropriate RAG agent implementation.
This class analyzes the content and requirements of a query and determines
which RAG agent implementation is best suited to handle it.
"""
def __init__(
self,
llm: BaseLLM,
rag_agents: List[RAGAgent],
agent_descriptions: Optional[List[str]] = None,
):
"""
Initialize the RAGRouter.
Args:
llm: The language model to use for analyzing queries.
rag_agents: A list of RAGAgent instances.
agent_descriptions (list, optional): A list of descriptions for each agent.
"""
self.llm = llm
self.rag_agents = rag_agents
self.agent_descriptions = agent_descriptions
if not self.agent_descriptions:
try:
self.agent_descriptions = [
agent.__class__.__description__ for agent in self.rag_agents
]
except Exception:
raise AttributeError(
"Please provide agent descriptions or set __description__ attribute for each agent class."
)
def _route(self, query: str) -> Tuple[RAGAgent, int]:
description_str = "\n".join(
[f"[{i + 1}]: {description}" for i, description in enumerate(self.agent_descriptions)]
)
prompt = RAG_ROUTER_PROMPT.format(query=query, description_str=description_str)
chat_response = self.llm.chat(messages=[{"role": "user", "content": prompt}])
try:
selected_agent_index = int(self.llm.remove_think(chat_response.content)) - 1
except ValueError:
# Some reasoning LLMs do not output a bare number, but an explanation string with a number at the end.
log.warning(
"Parse int failed in RAGRouter, but will try to find the last digit as fallback."
)
selected_agent_index = (
int(self.find_last_digit(self.llm.remove_think(chat_response.content))) - 1
)
selected_agent = self.rag_agents[selected_agent_index]
log.color_print(
f"<think> Select agent [{selected_agent.__class__.__name__}] to answer the query [{query}] </think>\n"
)
return self.rag_agents[selected_agent_index], chat_response.total_tokens
def retrieve(self, query: str, **kwargs) -> Tuple[List[RetrievalResult], int, dict]:
agent, n_token_router = self._route(query)
retrieved_results, n_token_retrieval, metadata = agent.retrieve(query, **kwargs)
return retrieved_results, n_token_router + n_token_retrieval, metadata
def query(self, query: str, **kwargs) -> Tuple[str, List[RetrievalResult], int]:
agent, n_token_router = self._route(query)
answer, retrieved_results, n_token_retrieval = agent.query(query, **kwargs)
return answer, retrieved_results, n_token_router + n_token_retrieval
def find_last_digit(self, string):
for char in reversed(string):
if char.isdigit():
return char
raise ValueError("No digit found in the string")

118
deepsearcher/cli.py

@ -0,0 +1,118 @@
import argparse
import logging
import sys
import warnings
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.utils import log
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
warnings.simplefilter(action="ignore", category=FutureWarning) # disable warning output
def main():
"""
Main entry point for the DeepSearcher CLI.
This function parses command line arguments and executes the appropriate action
based on the subcommand provided (query or load). It handles the deprecated
command line format and provides helpful error messages.
Returns:
None
"""
if "--query" in sys.argv or "--load" in sys.argv:
print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.")
print("Please use:")
print(" deepsearcher query <your_query> --max_iter 3")
print(
" deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>"
)
sys.exit(1)
config = Configuration() # Customize your config here
init_config(config=config)
parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.")
subparsers = parser.add_subparsers(dest="subcommand", title="subcommands")
## Arguments of query
query_parser = subparsers.add_parser("query", help="Query a question or search topic.")
query_parser.add_argument("query", type=str, default="", help="query question or search topic.")
query_parser.add_argument(
"--max_iter",
type=int,
default=3,
help="Max iterations of reflection. Default is 3.",
)
## Arguments of loading
load_parser = subparsers.add_parser(
"load", help="Load knowledge from local files or from URLs."
)
load_parser.add_argument(
"load_path",
type=str,
nargs="+", # 1 or more files or urls
help="Load knowledge from local files or from URLs.",
)
load_parser.add_argument(
"--batch_size",
type=int,
default=256,
help="Batch size for loading knowledge.",
)
load_parser.add_argument(
"--collection_name",
type=str,
default=None,
help="Destination collection name of loaded knowledge.",
)
load_parser.add_argument(
"--collection_desc",
type=str,
default=None,
help="Description of the collection.",
)
load_parser.add_argument(
"--force_new_collection",
action="store_true",
help="If set, drop the original collection and create a new collection on every load.",
)
args = parser.parse_args()
if args.subcommand == "query":
final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter)
log.color_print("\n==== FINAL ANSWER====\n")
log.color_print(final_answer)
log.color_print("\n### References\n")
for i, ref in enumerate(refs):
log.color_print(f"{i + 1}. {ref.text[:60]}{ref.reference}")
elif args.subcommand == "load":
urls = [url for url in args.load_path if url.startswith("http")]
local_files = [file for file in args.load_path if not file.startswith("http")]
kwargs = {}
if args.collection_name:
kwargs["collection_name"] = args.collection_name
if args.collection_desc:
kwargs["collection_description"] = args.collection_desc
if args.force_new_collection:
kwargs["force_new_collection"] = args.force_new_collection
if args.batch_size:
kwargs["batch_size"] = args.batch_size
if len(urls) > 0:
load_from_website(urls, **kwargs)
if len(local_files) > 0:
load_from_local_files(local_files, **kwargs)
else:
print("Please provide a query or a load argument.")
if __name__ == "__main__":
main()

87
deepsearcher/config.yaml

@ -0,0 +1,87 @@
provide_settings:
llm:
provider: "OpenAILLM"
config:
model: "Qwen/Qwen3-8B-FP8"
api_key: "empty"
base_url: "http://localhost:8000/v1"
embedding:
provider: "OpenAIEmbedding"
config:
model: "Qwen/Qwen3-Embedding-0.6B"
api_key: "empty"
base_url: "http://localhost:8001/v1"
dimension: 1024
dim_change: false
file_loader:
provider: "PDFLoader"
config: {}
# provider: "JsonFileLoader"
# config:
# text_key: ""
# provider: "TextLoader"
# config: {}
# provider: "UnstructuredLoader"
# config: {}
# provider: "DoclingLoader"
# config: {}
web_crawler:
provider: "FireCrawlCrawler"
config: {}
# provider: "Crawl4AICrawler"
# config: # Uncomment to customize the browser configuration for Crawl4AI
# browser_config:
# headless: false
# proxy: "http://127.0.0.1:7890"
# chrome_channel: "chrome"
# verbose: true
# viewport_width: 800
# viewport_height: 600
# provider: "JinaCrawler"
# config: {}
# provider: "DoclingCrawler"
# config: {}
vector_db:
provider: "Milvus"
config:
default_collection: "deepsearcher"
uri: "http://localhost:19530"
token: "root:Milvus"
db: "default"
# vector_db:
# provider: "OracleDB"
# config:
# default_collection: "deepsearcher"
# user: ""
# password: ""
# dsn: ""
# config_dir: ""
# wallet_location: ""
# wallet_password: ""
# vector_db:
# provider: "Qdrant"
# config:
# default_collection: "deepsearcher"
# host: "localhost"
# port: 6333
query_settings:
max_iter: 2
load_settings:
chunk_size: 1024
chunk_overlap: 128

240
deepsearcher/configuration.py

@ -0,0 +1,240 @@
import os
from typing import Literal
import yaml
from deepsearcher.agent import ChainOfRAG, DeepSearch, NaiveRAG
from deepsearcher.agent.rag_router import RAGRouter
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.loader.file_loader.base import BaseLoader
from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.vector_db.base import BaseVectorDB
current_dir = os.path.dirname(os.path.abspath(__file__))
DEFAULT_CONFIG_YAML_PATH = os.path.join(current_dir, "config.yaml")
FeatureType = Literal["llm", "embedding", "file_loader", "web_crawler", "vector_db"]
class Configuration:
"""
Configuration class for DeepSearcher.
This class manages the configuration settings for various components of the DeepSearcher system,
including LLM providers, embedding models, file loaders, web crawlers, and vector databases.
It loads configurations from a YAML file and provides methods to get and set provider configurations.
"""
def __init__(self, config_path: str = DEFAULT_CONFIG_YAML_PATH):
"""
Initialize the Configuration object.
Args:
config_path: Path to the configuration YAML file. Defaults to the config.yaml bundled with the deepsearcher package.
"""
# Initialize default configurations
config_data = self.load_config_from_yaml(config_path)
self.provide_settings = config_data["provide_settings"]
self.query_settings = config_data["query_settings"]
self.load_settings = config_data["load_settings"]
def load_config_from_yaml(self, config_path: str):
"""
Load configuration from a YAML file.
Args:
config_path: Path to the configuration YAML file.
Returns:
The loaded configuration data as a dictionary.
"""
with open(config_path, "r") as file:
return yaml.safe_load(file)
def set_provider_config(self, feature: FeatureType, provider: str, provider_configs: dict):
"""
Set the provider and its configurations for a given feature.
Args:
feature: The feature to configure (e.g., 'llm', 'file_loader', 'web_crawler').
provider: The provider name (e.g., 'openai', 'deepseek').
provider_configs: A dictionary with configurations specific to the provider.
Raises:
ValueError: If the feature is not supported.
"""
if feature not in self.provide_settings:
raise ValueError(f"Unsupported feature: {feature}")
self.provide_settings[feature]["provider"] = provider
self.provide_settings[feature]["config"] = provider_configs
def get_provider_config(self, feature: FeatureType):
"""
Get the current provider and configuration for a given feature.
Args:
feature: The feature to retrieve (e.g., 'llm', 'file_loader', 'web_crawler').
Returns:
A dictionary with provider and its configurations.
Raises:
ValueError: If the feature is not supported.
"""
if feature not in self.provide_settings:
raise ValueError(f"Unsupported feature: {feature}")
return self.provide_settings[feature]
class ModuleFactory:
"""
Factory class for creating instances of various modules in the DeepSearcher system.
This class creates instances of LLMs, embedding models, file loaders, web crawlers,
and vector databases based on the configuration settings.
"""
def __init__(self, config: Configuration):
"""
Initialize the ModuleFactory.
Args:
config: The Configuration object containing provider settings.
"""
self.config = config
def _create_module_instance(self, feature: FeatureType, module_name: str):
"""
Create an instance of a module based on the feature and module name.
Args:
feature: The feature type (e.g., 'llm', 'embedding').
module_name: The module name to import from.
Returns:
An instance of the specified module.
"""
# e.g.
# feature = "file_loader"
# module_name = "deepsearcher.loader.file_loader"
class_name = self.config.provide_settings[feature]["provider"]
module = __import__(module_name, fromlist=[class_name])
class_ = getattr(module, class_name)
return class_(**self.config.provide_settings[feature]["config"])
def create_llm(self) -> BaseLLM:
"""
Create an instance of a language model.
Returns:
An instance of a BaseLLM implementation.
"""
return self._create_module_instance("llm", "deepsearcher.llm")
def create_embedding(self) -> BaseEmbedding:
"""
Create an instance of an embedding model.
Returns:
An instance of a BaseEmbedding implementation.
"""
return self._create_module_instance("embedding", "deepsearcher.embedding")
def create_file_loader(self) -> BaseLoader:
"""
Create an instance of a file loader.
Returns:
An instance of a BaseLoader implementation.
"""
return self._create_module_instance("file_loader", "deepsearcher.loader.file_loader")
def create_web_crawler(self) -> BaseCrawler:
"""
Create an instance of a web crawler.
Returns:
An instance of a BaseCrawler implementation.
"""
return self._create_module_instance("web_crawler", "deepsearcher.loader.web_crawler")
def create_vector_db(self) -> BaseVectorDB:
"""
Create an instance of a vector database.
Returns:
An instance of a BaseVectorDB implementation.
"""
return self._create_module_instance("vector_db", "deepsearcher.vector_db")
config = Configuration()
module_factory: ModuleFactory = None
llm: BaseLLM = None
embedding_model: BaseEmbedding = None
file_loader: BaseLoader = None
vector_db: BaseVectorDB = None
web_crawler: BaseCrawler = None
default_searcher: RAGRouter = None
naive_rag: NaiveRAG = None
def init_config(config: Configuration):
"""
Initialize the global configuration and create instances of all required modules.
This function initializes the global variables for the LLM, embedding model,
file loader, web crawler, vector database, and RAG agents.
Args:
config: The Configuration object to use for initialization.
"""
global \
module_factory, \
llm, \
embedding_model, \
file_loader, \
vector_db, \
web_crawler, \
default_searcher, \
naive_rag
module_factory = ModuleFactory(config)
llm = module_factory.create_llm()
embedding_model = module_factory.create_embedding()
file_loader = module_factory.create_file_loader()
web_crawler = module_factory.create_web_crawler()
vector_db = module_factory.create_vector_db()
default_searcher = RAGRouter(
llm=llm,
rag_agents=[
DeepSearch(
llm=llm,
embedding_model=embedding_model,
vector_db=vector_db,
max_iter=config.query_settings["max_iter"],
route_collection=True,
text_window_splitter=True,
),
ChainOfRAG(
llm=llm,
embedding_model=embedding_model,
vector_db=vector_db,
max_iter=config.query_settings["max_iter"],
route_collection=True,
text_window_splitter=True,
),
],
)
naive_rag = NaiveRAG(
llm=llm,
embedding_model=embedding_model,
vector_db=vector_db,
top_k=10,
route_collection=True,
text_window_splitter=True,
)
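# Typical initialization flow (see deepsearcher/cli.py); the query text below is a placeholder:
#
#   from deepsearcher.configuration import Configuration, init_config
#   from deepsearcher.online_query import query
#
#   init_config(config=Configuration())
#   answer, refs, tokens = query("What is DeepSearcher?", max_iter=2)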

5
deepsearcher/embedding/__init__.py

@ -0,0 +1,5 @@
from .openai_embedding import OpenAIEmbedding
__all__ = [
"OpenAIEmbedding",
]

76
deepsearcher/embedding/base.py

@ -0,0 +1,76 @@
from typing import List
from tqdm import tqdm
from deepsearcher.loader.splitter import Chunk
class BaseEmbedding:
"""
Abstract base class for embedding model implementations.
This class defines the interface for embedding model implementations,
including methods for embedding queries and documents, and a property
for the dimensionality of the embeddings.
"""
def embed_query(self, text: str) -> List[float]:
"""
Embed a single query text.
Args:
text: The query text to embed.
Returns:
A list of floats representing the embedding vector.
"""
pass
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""
Embed a list of document texts.
This default implementation calls embed_query for each text,
but implementations may override this with a more efficient batch method.
Args:
texts: A list of document texts to embed.
Returns:
A list of embedding vectors, one for each input text.
"""
return [self.embed_query(text) for text in texts]
def embed_chunks(self, chunks: List[Chunk], batch_size: int = 256) -> List[Chunk]:
"""
Embed a list of Chunk objects.
This method extracts the text from each chunk, embeds it in batches,
and updates the chunks with their embeddings.
Args:
chunks: A list of Chunk objects to embed.
batch_size: The number of chunks to process in each batch.
Returns:
The input list of Chunk objects, updated with embeddings.
"""
texts = [chunk.text for chunk in chunks]
batch_texts = [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)]
embeddings = []
for batch_text in tqdm(batch_texts, desc="Embedding chunks"):
batch_embeddings = self.embed_documents(batch_text)
embeddings.extend(batch_embeddings)
for chunk, embedding in zip(chunks, embeddings):
chunk.embedding = embedding
return chunks
@property
def dimension(self) -> int:
"""
Get the dimensionality of the embeddings.
Returns:
The number of dimensions in the embedding vectors.
"""
pass

103
deepsearcher/embedding/openai_embedding.py

@ -0,0 +1,103 @@
import os
from typing import List
from openai import OpenAI
from openai._types import NOT_GIVEN
from deepsearcher.embedding.base import BaseEmbedding
class OpenAIEmbedding(BaseEmbedding):
"""
OpenAI embedding model implementation.
This class provides an interface to the OpenAI embedding API, which offers
various embedding models for text processing.
For more information, see:
https://platform.openai.com/docs/guides/embeddings/use-cases
"""
def __init__(self, model: str, **kwargs):
"""
Initialize the OpenAI embedding model.
Args:
model (str): The model identifier to use for embeddings.
**kwargs: Additional keyword arguments.
- api_key (str): The API key.
- base_url (str): The base URL.
- model_name (str): Alternative way to specify the model.
- dimension (int): The dimension of the embedding vectors.
- dim_change (bool): Whether the model supports changing the dimension of the generated embeddings.
"""
# Extract standard parameters (keep original behavior)
if "api_key" in kwargs:
api_key = kwargs.pop("api_key")
if "base_url" in kwargs:
base_url = kwargs.pop("base_url")
else:
base_url = os.getenv("OPENAI_BASE_URL")
if "model_name" in kwargs:
model = kwargs.pop("model_name")
if "dimension" in kwargs:
dimension = kwargs.pop("dimension")
else:
dimension = NOT_GIVEN
if "dim_change" in kwargs:
dim_change = kwargs.pop("dim_change")
self.dim = dimension
self.dim_change = dim_change
self.model = model
self.client = OpenAI(api_key=api_key, base_url=base_url, **kwargs)
def embed_query(self, text: str) -> List[float]:
"""
Embed a single query text.
Args:
text (str): The query text to embed.
Returns:
List[float]: A list of floats representing the embedding vector.
"""
response = self.client.embeddings.create(
input=[text], model=self.model, dimensions=self.dimension if self.dim_change is True else NOT_GIVEN
)
return response.data[0].embedding
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""
Embed a list of document texts.
Args:
texts (List[str]): A list of document texts to embed.
Returns:
List[List[float]]: A list of embedding vectors, one for each input text.
"""
response = self.client.embeddings.create(
input=texts, model=self.model, dimensions=self.dimension if self.dim_change is True else NOT_GIVEN
)
return [r.embedding for r in response.data]
@property
def dimension(self) -> int:
"""
Get the dimensionality of the embeddings for the current model.
Returns:
int: The number of dimensions in the embedding vectors.
"""
return self.dim
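# Illustrative usage sketch; the values mirror the defaults in deepsearcher/config.yaml:
#
#   embedding = OpenAIEmbedding(
#       model="Qwen/Qwen3-Embedding-0.6B",
#       api_key="empty",
#       base_url="http://localhost:8001/v1",
#       dimension=1024,
#       dim_change=False,
#   )
#   vector = embedding.embed_query("hello world")  # -> list of floats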

5
deepsearcher/llm/__init__.py

@ -0,0 +1,5 @@
from .openai_llm import OpenAILLM
__all__ = [
"OpenAILLM",
]

120
deepsearcher/llm/base.py

@ -0,0 +1,120 @@
import ast
import re
from abc import ABC
from typing import Dict, List
class ChatResponse(ABC):
"""
Represents a response from a chat model.
This class encapsulates the content of a response from a chat model
along with information about token usage.
Attributes:
content: The text content of the response.
total_tokens: The total number of tokens used in the request and response.
"""
def __init__(self, content: str, total_tokens: int) -> None:
"""
Initialize a ChatResponse object.
Args:
content: The text content of the response.
total_tokens: The total number of tokens used in the request and response.
"""
self.content = content
self.total_tokens = total_tokens
def __repr__(self) -> str:
"""
Return a string representation of the ChatResponse.
Returns:
A string representation of the ChatResponse object.
"""
return f"ChatResponse(content={self.content}, total_tokens={self.total_tokens})"
class BaseLLM(ABC):
"""
Abstract base class for language model implementations.
This class defines the interface for language model implementations,
including methods for chat-based interactions and parsing responses.
"""
def __init__(self):
"""
Initialize a BaseLLM object.
"""
pass
def chat(self, messages: List[Dict]) -> ChatResponse:
"""
Send a chat message to the language model and get a response.
Args:
messages: A list of message dictionaries, typically in the format
[{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
Returns:
A ChatResponse object containing the model's response.
"""
pass
@staticmethod
def literal_eval(response_content: str):
"""
Parse a string response into a Python object using ast.literal_eval.
This method attempts to extract and parse JSON or Python literals from the response content,
handling various formats like code blocks and special tags.
Args:
response_content: The string content to parse.
Returns:
The parsed Python object.
Raises:
ValueError: If the response content cannot be parsed.
"""
response_content = response_content.strip()
response_content = BaseLLM.remove_think(response_content)
try:
if response_content.startswith("```") and response_content.endswith("```"):
if response_content.startswith("```python"):
response_content = response_content[9:-3]
elif response_content.startswith("```json"):
response_content = response_content[7:-3]
elif response_content.startswith("```str"):
response_content = response_content[6:-3]
elif response_content.startswith("```\n"):
response_content = response_content[4:-3]
else:
raise ValueError("Invalid code block format")
result = ast.literal_eval(response_content.strip())
except Exception:
matches = re.findall(r"(\[.*?\]|\{.*?\})", response_content, re.DOTALL)
if len(matches) != 1:
raise ValueError(
f"Invalid JSON/List format for response content:\n{response_content}"
)
json_part = matches[0]
return ast.literal_eval(json_part)
return result
@staticmethod
def remove_think(response_content: str) -> str:
# remove content between <think> and </think>, especial for reasoning model
if "<think>" in response_content and "</think>" in response_content:
end_of_think = response_content.find("</think>") + len("</think>")
response_content = response_content[end_of_think:]
return response_content.strip()
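# Illustrative example: literal_eval strips <think> blocks and code fences before parsing
# the remaining Python/JSON literal.
#
#   BaseLLM.literal_eval('```json\n["collection_a", "collection_b"]\n```')
#   # -> ["collection_a", "collection_b"]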

61
deepsearcher/llm/openai_llm.py

@ -0,0 +1,61 @@
import os
from typing import Dict, List
from deepsearcher.llm.base import BaseLLM, ChatResponse
class OpenAILLM(BaseLLM):
"""
OpenAI language model implementation.
This class provides an interface to interact with OpenAI's language models
through their API.
Attributes:
model (str): The OpenAI model identifier to use.
client: The OpenAI client instance.
"""
def __init__(self, model: str = "o1-mini", **kwargs):
"""
Initialize an OpenAI language model client.
Args:
model (str, optional): The model identifier to use. Defaults to "o1-mini".
**kwargs: Additional keyword arguments to pass to the OpenAI client.
- api_key: OpenAI API key. If not provided, uses OPENAI_API_KEY environment variable.
- base_url: OpenAI API base URL. If not provided, uses OPENAI_BASE_URL environment variable.
"""
from openai import OpenAI
self.model = model
if "api_key" in kwargs:
api_key = kwargs.pop("api_key")
else:
api_key = os.getenv("OPENAI_API_KEY")
if "base_url" in kwargs:
base_url = kwargs.pop("base_url")
else:
base_url = os.getenv("OPENAI_BASE_URL")
self.client = OpenAI(api_key=api_key, base_url=base_url, **kwargs)
def chat(self, messages: List[Dict]) -> ChatResponse:
"""
Send a chat message to the OpenAI model and get a response.
Args:
messages (List[Dict]): A list of message dictionaries, typically in the format
[{"role": "system", "content": "..."},
{"role": "user", "content": "..."}]
Returns:
ChatResponse: An object containing the model's response and token usage information.
"""
completion = self.client.chat.completions.create(
model=self.model,
messages=messages,
)
return ChatResponse(
content=completion.choices[0].message.content,
total_tokens=completion.usage.total_tokens,
)

0
deepsearcher/loader/__init__.py

7
deepsearcher/loader/file_loader/__init__.py

@ -0,0 +1,7 @@
from deepsearcher.loader.file_loader.docling_loader import DoclingLoader
from deepsearcher.loader.file_loader.json_loader import JsonFileLoader
from deepsearcher.loader.file_loader.pdf_loader import PDFLoader
from deepsearcher.loader.file_loader.text_loader import TextLoader
from deepsearcher.loader.file_loader.unstructured_loader import UnstructuredLoader
__all__ = ["PDFLoader", "TextLoader", "UnstructuredLoader", "JsonFileLoader", "DoclingLoader"]

70
deepsearcher/loader/file_loader/base.py

@ -0,0 +1,70 @@
import os
from abc import ABC
from typing import List
from langchain_core.documents import Document
class BaseLoader(ABC):
"""
Abstract base class for file loaders.
This class defines the interface for loading documents from files and directories.
All specific file loaders should inherit from this class and implement the required methods.
"""
def __init__(self, **kwargs):
"""
Initialize the loader with optional keyword arguments.
Args:
**kwargs: Optional keyword arguments for specific loader implementations.
"""
pass
def load_file(self, file_path: str) -> List[Document]:
"""
Load a single file and convert it to Document objects.
Args:
file_path: Path to the file to be loaded.
Returns:
A list of Document objects containing the text and metadata.
Note:
Return a list of Document objects which contain the text and metadata.
In the metadata, it's recommended to include the reference to the file.
e.g. return [Document(page_content=..., metadata={"reference": file_path})]
"""
pass
def load_directory(self, directory: str) -> List[Document]:
"""
Load all supported files from a directory and its subdirectories recursively.
Args:
directory: Path to the directory containing files to be loaded.
Returns:
A list of Document objects from all supported files in the directory and subdirectories.
"""
documents = []
for root, _, files in os.walk(directory):
for file in files:
for suffix in self.supported_file_types:
if file.endswith(suffix):
full_path = os.path.join(root, file)
documents.extend(self.load_file(full_path))
break
return documents
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions (without the dot).
"""
pass

117
deepsearcher/loader/file_loader/docling_loader.py

@ -0,0 +1,117 @@
import os
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
from deepsearcher.utils import log
class DoclingLoader(BaseLoader):
"""
Loader that utilizes Docling's DocumentConverter and HierarchicalChunker
to convert and chunk files (e.g. Markdown or HTML) into Document objects.
"""
def __init__(self):
"""
Initialize the DoclingLoader with DocumentConverter and HierarchicalChunker instances.
"""
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker
self.converter = DocumentConverter()
self.chunker = HierarchicalChunker()
def load_file(self, file_path: str) -> List[Document]:
"""
Load a local file (or URL) using docling's conversion and perform hierarchical chunking.
Args:
file_path: Path or URL of the file to be loaded.
Returns:
A list of Document objects, each representing a chunk.
Raises:
FileNotFoundError: If the file does not exist.
ValueError: If the file type is not supported.
IOError: If there is an error reading the file.
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"Error: File '{file_path}' does not exist.")
# Check if the file has a supported extension
file_extension = os.path.splitext(file_path)[1].lower().lstrip(".")
if file_extension not in self.supported_file_types:
supported_formats = ", ".join(self.supported_file_types)
raise ValueError(
f"Unsupported file type: '{file_extension}'. "
f"Supported file types are: {supported_formats}"
)
try:
conversion_result = self.converter.convert(file_path)
docling_document = conversion_result.document
chunks = list(self.chunker.chunk(docling_document))
documents = []
for chunk in chunks:
metadata = {"reference": file_path, "text": chunk.text}
documents.append(Document(page_content=chunk.text, metadata=metadata))
return documents
except Exception as e:
log.color_print(f"Error processing file {file_path}: {str(e)}")
raise IOError(f"Failed to process file {file_path}: {str(e)}")
def load_directory(self, directory: str) -> List[Document]:
"""
Load all supported files from a directory.
Args:
directory: Path to the directory containing files to be loaded.
Returns:
A list of Document objects from all supported files in the directory.
Raises:
NotADirectoryError: If the specified path is not a directory.
"""
if not os.path.isdir(directory):
raise NotADirectoryError(f"Error: '{directory}' is not a directory.")
return super().load_directory(directory)
@property
def supported_file_types(self) -> List[str]:
"""
Return the list of file extensions supported by this loader.
Supported formats (refer to the official website: https://docling-project.github.io/docling/usage/supported_formats/):
- PDF
- Office formats: DOCX, XLSX, PPTX
- Markdown
- AsciiDoc
- HTML, XHTML
- CSV
- Images: PNG, JPEG, TIFF, BMP
"""
return [
"pdf",
"docx",
"xlsx",
"pptx",
"md",
"adoc",
"asciidoc",
"html",
"xhtml",
"csv",
"png",
"jpg",
"jpeg",
"tif",
"tiff",
"bmp",
]
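A rough usage sketch (assumes the docling packages are installed; the file path is hypothetical):

# Sketch: convert and chunk a local file with DoclingLoader.
from deepsearcher.loader.file_loader import DoclingLoader

loader = DoclingLoader()
chunks = loader.load_file("./docs/example_report.pdf")  # hypothetical path
print(len(chunks), "chunks; first reference:", chunks[0].metadata["reference"] if chunks else None)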

94
deepsearcher/loader/file_loader/json_loader.py

@ -0,0 +1,94 @@
import json
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
class JsonFileLoader(BaseLoader):
"""
Loader for JSON and JSONL files.
This loader handles JSON and JSONL files, extracting text content from a specified key
and converting each entry into Document objects for further processing.
"""
def __init__(self, text_key: str):
"""
Initialize the JsonFileLoader.
Args:
text_key: The key in the JSON data that contains the text content to be extracted.
"""
self.text_key = text_key
def load_file(self, file_path: str) -> List[Document]:
"""
Load a JSON or JSONL file and convert it to Document objects.
Args:
file_path: Path to the JSON or JSONL file to be loaded.
Returns:
A list of Document objects, one for each entry in the JSON/JSONL file.
"""
if file_path.endswith(".jsonl"):
data_list: list[dict] = self._read_jsonl_file(file_path)
else:
data_list: list[dict] = self._read_json_file(file_path)
documents = []
for data_dict in data_list:
page_content = data_dict.pop(self.text_key)
data_dict.update({"reference": file_path})
document = Document(page_content=page_content, metadata=data_dict)
documents.append(document)
return documents
def _read_json_file(self, file_path: str) -> list[dict]:
"""
Read and parse a JSON file.
Args:
file_path: Path to the JSON file.
Returns:
A list of dictionaries parsed from the JSON file.
Raises:
ValueError: If the JSON file does not contain a list of dictionaries.
"""
with open(file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)
if not isinstance(json_data, list):
raise ValueError("JSON file must contain a list of dictionaries.")
return json_data
def _read_jsonl_file(self, file_path: str) -> List[dict]:
"""
Read and parse a JSONL file (JSON Lines format).
Args:
file_path: Path to the JSONL file.
Returns:
A list of dictionaries parsed from the JSONL file.
"""
data_list = []
with open(file_path, "r", encoding="utf-8") as file:
for line in file:
try:
json_data = json.loads(line)
data_list.append(json_data)
except json.JSONDecodeError:
print(f"Failed to decode line: {line}")
return data_list
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions: ["json", "jsonl"].
"""
return ["json", "jsonl"]
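A short usage sketch, assuming each JSONL record keeps its body under a "content" key (both the key name and the path are illustrative):

# Sketch: load a JSONL file whose text lives under the "content" key.
from deepsearcher.loader.file_loader import JsonFileLoader

loader = JsonFileLoader(text_key="content")  # assumed key name
docs = loader.load_file("./data/articles.jsonl")  # hypothetical path
for doc in docs[:3]:
    print(doc.metadata["reference"], len(doc.page_content))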

54
deepsearcher/loader/file_loader/pdf_loader.py

@ -0,0 +1,54 @@
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
class PDFLoader(BaseLoader):
"""
Loader for PDF files.
This loader handles PDF files and also supports text files with extensions like .txt and .md,
converting them into Document objects for further processing.
"""
def __init__(self):
"""
Initialize the PDFLoader.
"""
pass
def load_file(self, file_path: str) -> List[Document]:
"""
Load a PDF file and convert it to a Document object.
Args:
file_path: Path to the PDF file to be loaded.
Returns:
A list containing a single Document object with the file content and reference.
Note:
This loader also supports .txt and .md files for convenience.
"""
import pdfplumber
if file_path.endswith(".pdf"):
with pdfplumber.open(file_path) as file:
page_content = "\n\n".join([(page.extract_text() or "") for page in file.pages])
return [Document(page_content=page_content, metadata={"reference": file_path})]
elif file_path.endswith(".txt") or file_path.endswith(".md"):
with open(file_path, "r", encoding="utf-8") as file:
page_content = file.read()
return [Document(page_content=page_content, metadata={"reference": file_path})]
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions: ["pdf", "md", "txt"].
"""
return ["pdf", "md", "txt"]
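A minimal usage sketch (requires pdfplumber; the path is illustrative):

# Sketch: extract the text of a PDF into a single Document.
from deepsearcher.loader.file_loader import PDFLoader

loader = PDFLoader()
docs = loader.load_file("./papers/sample.pdf")  # hypothetical path
print(docs[0].metadata["reference"], docs[0].page_content[:200])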

43
deepsearcher/loader/file_loader/text_loader.py

@ -0,0 +1,43 @@
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
class TextLoader(BaseLoader):
"""
Loader for plain text files.
This loader handles text files with extensions like .txt and .md,
converting them into Document objects for further processing.
"""
def __init__(self):
"""
Initialize the TextLoader.
"""
pass
def load_file(self, file_path: str) -> List[Document]:
"""
Load a text file and convert it to a Document object.
Args:
file_path: Path to the text file to be loaded.
Returns:
A list containing a single Document object with the file content and reference.
"""
with open(file_path, "r", encoding="utf-8") as f:
return [Document(page_content=f.read(), metadata={"reference": file_path})]
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by this loader.
Returns:
A list of supported file extensions: ["txt", "md"].
"""
return ["txt", "md"]

201
deepsearcher/loader/file_loader/unstructured_loader.py

@ -0,0 +1,201 @@
import os
import shutil
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
from deepsearcher.utils import log
class UnstructuredLoader(BaseLoader):
"""
Loader for unstructured documents using the unstructured-io library.
This loader processes various document formats using the unstructured-io library's
processing pipeline, extracting text and metadata from complex document formats.
"""
def __init__(self):
"""
Initialize the UnstructuredLoader.
Creates a temporary directory for processed outputs and cleans up any existing ones.
"""
self.directory_with_results = "./pdf_processed_outputs"
if os.path.exists(self.directory_with_results):
shutil.rmtree(self.directory_with_results)
os.makedirs(self.directory_with_results)
def load_pipeline(self, input_path: str) -> List[Document]:
"""
Process documents using the unstructured-io pipeline.
Args:
input_path: Path to the file or directory to be processed.
Returns:
A list of Document objects extracted from the processed files.
Note:
If UNSTRUCTURED_API_KEY and UNSTRUCTURED_API_URL environment variables are set,
the API-based partitioning will be used. Otherwise, local partitioning will be used.
"""
from unstructured_ingest.interfaces import ProcessorConfig
from unstructured_ingest.pipeline.pipeline import Pipeline
from unstructured_ingest.processes.connectors.local import (
LocalConnectionConfig,
LocalDownloaderConfig,
LocalIndexerConfig,
LocalUploaderConfig,
)
from unstructured_ingest.processes.partitioner import PartitionerConfig
# Check if API environment variables are set
api_key = os.getenv("UNSTRUCTURED_API_KEY")
api_url = os.getenv("UNSTRUCTURED_API_URL")
use_api = api_key is not None and api_url is not None
if use_api:
log.color_print("Using Unstructured API for document processing")
else:
log.color_print(
"Using local processing for documents (UNSTRUCTURED_API_KEY or UNSTRUCTURED_API_URL not set)"
)
Pipeline.from_configs(
context=ProcessorConfig(),
indexer_config=LocalIndexerConfig(input_path=input_path),
downloader_config=LocalDownloaderConfig(),
source_connection_config=LocalConnectionConfig(),
partitioner_config=PartitionerConfig(
partition_by_api=use_api,
api_key=api_key,
partition_endpoint=api_url,
strategy="hi_res",
),
uploader_config=LocalUploaderConfig(output_dir=self.directory_with_results),
).run()
from unstructured.staging.base import elements_from_json
elements = []
for filename in os.listdir(self.directory_with_results):
if filename.endswith(".json"):
file_path = os.path.join(self.directory_with_results, filename)
try:
elements.extend(elements_from_json(filename=file_path))
except IOError:
log.color_print(f"Error: Could not read file {filename}.")
documents = []
for element in elements:
metadata = element.metadata.to_dict()
metadata["reference"] = input_path # TODO test it
documents.append(
Document(
page_content=element.text,
metadata=metadata,
)
)
return documents
def load_file(self, file_path: str) -> List[Document]:
"""
Load a single file using the unstructured-io pipeline.
Args:
file_path: Path to the file to be processed.
Returns:
A list of Document objects extracted from the processed file.
"""
return self.load_pipeline(file_path)
def load_directory(self, directory: str) -> List[Document]:
"""
Load all supported files from a directory using the unstructured-io pipeline.
Args:
directory: Path to the directory containing files to be processed.
Returns:
A list of Document objects extracted from all processed files.
"""
return self.load_pipeline(directory)
@property
def supported_file_types(self) -> List[str]:
"""
Get the list of file extensions supported by the unstructured-io library. Please refer to the Unstructured documentation for more details: https://docs.unstructured.io/ui/supported-file-types.
Returns:
A comprehensive list of supported file extensions.
Note:
The unstructured-io library supports a wide range of document formats
including office documents, images, emails, and more.
"""
return [
"abw",
"bmp",
"csv",
"cwk",
"dbf",
"dif",
"doc",
"docm",
"docx",
"dot",
"dotm",
"eml",
"epub",
"et",
"eth",
"fods",
"gif",
"heic",
"htm",
"html",
"hwp",
"jpeg",
"jpg",
"md",
"mcw",
"mw",
"odt",
"org",
"p7s",
"pages",
"pbd",
"pdf",
"png",
"pot",
"potm",
"ppt",
"pptm",
"pptx",
"prn",
"rst",
"rtf",
"sdp",
"sgl",
"svg",
"sxg",
"tiff",
"txt",
"tsv",
"uof",
"uos1",
"uos2",
"web",
"webp",
"wk2",
"xls",
"xlsb",
"xlsm",
"xlsx",
"xlw",
"xml",
"zabw",
]
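A usage sketch (the unstructured-ingest dependencies must be installed; the directory is hypothetical). Setting both UNSTRUCTURED_API_KEY and UNSTRUCTURED_API_URL routes partitioning through the hosted API; otherwise processing stays local, as in the pipeline above:

# Sketch: run the unstructured-io pipeline over a local directory.
from deepsearcher.loader.file_loader import UnstructuredLoader

loader = UnstructuredLoader()  # creates ./pdf_processed_outputs for intermediate JSON
docs = loader.load_directory("./raw_documents")  # hypothetical directory
print(len(docs), "elements extracted")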

105
deepsearcher/loader/splitter.py

@ -0,0 +1,105 @@
## Sentence Window splitting strategy, ref:
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
class Chunk:
"""
Represents a chunk of text with associated metadata and embedding.
A chunk is a segment of text extracted from a document, along with its reference
information, metadata, and optional embedding vector.
Attributes:
text: The text content of the chunk.
reference: A reference to the source of the chunk (e.g., file path, URL).
metadata: Additional metadata associated with the chunk.
embedding: The vector embedding of the chunk, if available.
"""
def __init__(
self,
text: str,
reference: str,
metadata: dict = None,
embedding: List[float] = None,
):
"""
Initialize a Chunk object.
Args:
text: The text content of the chunk.
reference: A reference to the source of the chunk.
metadata: Additional metadata associated with the chunk. Defaults to an empty dict.
embedding: The vector embedding of the chunk. Defaults to None.
"""
self.text = text
self.reference = reference
self.metadata = metadata or {}
self.embedding = embedding or None
def _sentence_window_split(
split_docs: List[Document], original_document: Document, offset: int = 200
) -> List[Chunk]:
"""
Create chunks with context windows from split documents.
This function takes documents that have been split into smaller pieces and
adds context from the original document by including text before and after
each split piece, up to the specified offset.
Args:
split_docs: List of documents that have been split.
original_document: The original document before splitting.
offset: Number of characters to include before and after each split piece.
Returns:
A list of Chunk objects with context windows.
"""
chunks = []
original_text = original_document.page_content
for doc in split_docs:
doc_text = doc.page_content
start_index = original_text.index(doc_text)
end_index = start_index + len(doc_text) - 1
wider_text = original_text[
max(0, start_index - offset) : min(len(original_text), end_index + offset)
]
reference = doc.metadata.pop("reference", "")
doc.metadata["wider_text"] = wider_text
chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata)
chunks.append(chunk)
return chunks
def split_docs_to_chunks(
documents: List[Document], chunk_size: int = 1500, chunk_overlap=100
) -> List[Chunk]:
"""
Split documents into chunks with context windows.
This function splits a list of documents into smaller chunks with overlapping text,
and adds context windows to each chunk by including text before and after the chunk.
Args:
documents: List of documents to split.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
Returns:
A list of Chunk objects with context windows.
"""
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
all_chunks = []
for doc in documents:
split_docs = text_splitter.split_documents([doc])
split_chunks = _sentence_window_split(split_docs, doc, offset=300)
all_chunks.extend(split_chunks)
return all_chunks
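A small sketch of the splitter on an in-memory document (the reference value is illustrative); each resulting Chunk carries a "wider_text" context window in its metadata:

# Sketch: split one Document and inspect the sentence-window metadata.
from langchain_core.documents import Document
from deepsearcher.loader.splitter import split_docs_to_chunks

doc = Document(page_content="Sentence one. " * 300, metadata={"reference": "notes.md"})
chunks = split_docs_to_chunks([doc], chunk_size=500, chunk_overlap=50)
print(len(chunks), "chunks; first window length:", len(chunks[0].metadata["wider_text"]))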

11
deepsearcher/loader/web_crawler/__init__.py

@ -0,0 +1,11 @@
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler
__all__ = [
"FireCrawlCrawler",
"JinaCrawler",
"Crawl4AICrawler",
"DoclingCrawler",
]

55
deepsearcher/loader/web_crawler/base.py

@ -0,0 +1,55 @@
from abc import ABC
from typing import List
from langchain_core.documents import Document
class BaseCrawler(ABC):
"""
Abstract base class for web crawlers.
This class defines the interface for crawling web pages and converting them
into Document objects for further processing.
"""
def __init__(self, **kwargs):
"""
Initialize the crawler with optional keyword arguments.
Args:
**kwargs: Optional keyword arguments for specific crawler implementations.
"""
pass
def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
"""
Crawl a single URL and convert it to Document objects.
Args:
url: The URL to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects containing the content and metadata from the URL.
Note:
Implementations should include the URL reference in the metadata.
e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
"""
pass
def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
"""
Crawl multiple URLs and return a list of Document objects.
Args:
urls: A list of URLs to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects containing the content and metadata from all URLs.
"""
documents = []
for url in urls:
documents.extend(self.crawl_url(url, **crawl_kwargs))
return documents

140
deepsearcher/loader/web_crawler/crawl4ai_crawler.py

@ -0,0 +1,140 @@
import asyncio
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log
class Crawl4AICrawler(BaseCrawler):
"""
Web crawler using the Crawl4AI library.
This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
into markdown format for further processing. It supports both single-page crawling
and batch crawling of multiple pages.
"""
def __init__(self, **kwargs):
"""
Initialize the Crawl4AICrawler.
Args:
**kwargs: Optional keyword arguments.
browser_config: Configuration for the browser used by Crawl4AI.
"""
super().__init__(**kwargs)
self.crawler = None # Lazy init
self.browser_config = kwargs.get("browser_config", None)
def _lazy_init(self):
"""
Initialize the crawler lazily when needed.
This method creates the AsyncWebCrawler instance with the provided browser configuration
only when it's first needed, to avoid unnecessary initialization.
"""
from crawl4ai import AsyncWebCrawler, BrowserConfig
if self.crawler is None:
config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
self.crawler = AsyncWebCrawler(config=config)
async def _async_crawl(self, url: str) -> Document:
"""
Asynchronously crawl a single URL.
Args:
url: The URL to crawl.
Returns:
A Document object with the markdown content and metadata from the URL.
"""
if self.crawler is None:
self._lazy_init()
async with self.crawler as crawler:
result = await crawler.arun(url)
markdown_content = result.markdown or ""
metadata = {
"reference": url,
"success": result.success,
"status_code": result.status_code,
"media": result.media,
"links": result.links,
}
if hasattr(result, "metadata") and result.metadata:
metadata["title"] = result.metadata.get("title", "")
metadata["author"] = result.metadata.get("author", "")
return Document(page_content=markdown_content, metadata=metadata)
def crawl_url(self, url: str) -> List[Document]:
"""
Crawl a single URL.
Args:
url: The URL to crawl.
Returns:
A list containing a single Document object with the markdown content and metadata,
or an empty list if an error occurs.
"""
try:
document = asyncio.run(self._async_crawl(url))
return [document]
except Exception as e:
log.error(f"Error during crawling {url}: {e}")
return []
async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
"""
Asynchronously crawl multiple URLs.
Args:
urls: A list of URLs to crawl.
Returns:
A list of Document objects with the markdown content and metadata from all URLs.
"""
if self.crawler is None:
self._lazy_init()
async with self.crawler as crawler:
results = await crawler.arun_many(urls)
documents = []
for result in results:
markdown_content = result.markdown or ""
metadata = {
"reference": result.url,
"success": result.success,
"status_code": result.status_code,
"media": result.media,
"links": result.links,
}
if hasattr(result, "metadata") and result.metadata:
metadata["title"] = result.metadata.get("title", "")
metadata["author"] = result.metadata.get("author", "")
documents.append(Document(page_content=markdown_content, metadata=metadata))
return documents
def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
"""
Crawl multiple URLs.
Args:
urls: A list of URLs to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects with the markdown content and metadata from all URLs,
or an empty list if an error occurs.
"""
try:
return asyncio.run(self._async_crawl_many(urls))
except Exception as e:
log.error(f"Error during crawling {urls}: {e}")
return []
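A usage sketch (requires the crawl4ai package and a working browser backend; the URL is illustrative):

# Sketch: crawl a page and inspect the crawl metadata.
from deepsearcher.loader.web_crawler import Crawl4AICrawler

crawler = Crawl4AICrawler()
docs = crawler.crawl_urls(["https://example.com"])  # illustrative URL
for d in docs:
    print(d.metadata["reference"], d.metadata.get("status_code"), len(d.page_content))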

98
deepsearcher/loader/web_crawler/docling_crawler.py

@ -0,0 +1,98 @@
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log
class DoclingCrawler(BaseCrawler):
"""
Web crawler using Docling's DocumentConverter and HierarchicalChunker.
This crawler leverages Docling's capabilities to convert web pages into structured
documents and chunk them appropriately for further processing.
"""
def __init__(self, **kwargs):
"""
Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.
Args:
**kwargs: Optional keyword arguments.
"""
super().__init__(**kwargs)
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker
self.converter = DocumentConverter()
self.chunker = HierarchicalChunker()
def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
"""
Crawl a single URL using Docling's conversion and perform hierarchical chunking.
Args:
url: The URL to crawl.
**crawl_kwargs: Optional keyword arguments for the crawling process.
Returns:
A list of Document objects, each representing a chunk from the crawled URL.
Raises:
IOError: If there is an error processing the URL.
"""
try:
# Use Docling to convert the URL to a document
conversion_result = self.converter.convert(url)
docling_document = conversion_result.document
# Chunk the document using hierarchical chunking
chunks = list(self.chunker.chunk(docling_document))
documents = []
for chunk in chunks:
metadata = {"reference": url, "text": chunk.text}
documents.append(Document(page_content=chunk.text, metadata=metadata))
return documents
except Exception as e:
log.color_print(f"Error processing URL {url}: {str(e)}")
raise IOError(f"Failed to process URL {url}: {str(e)}")
@property
def supported_file_types(self) -> List[str]:
"""
Return the list of file types and formats supported by Docling.
Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
- PDF
- Office formats: DOCX, XLSX, PPTX
- Markdown
- AsciiDoc
- HTML, XHTML
- CSV
- Images: PNG, JPEG, TIFF, BMP
Returns:
A list of file extensions supported by this crawler.
"""
return [
"pdf",
"docx",
"xlsx",
"pptx",
"md",
"adoc",
"asciidoc",
"html",
"xhtml",
"csv",
"png",
"jpg",
"jpeg",
"tif",
"tiff",
"bmp",
]

88
deepsearcher/loader/web_crawler/firecrawl_crawler.py

@ -0,0 +1,88 @@
import os
from typing import List, Optional
from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
class FireCrawlCrawler(BaseCrawler):
"""
Web crawler using the FireCrawl service.
This crawler uses the FireCrawl service to crawl web pages and convert them
into markdown format for further processing. It supports both single-page scraping
and recursive crawling of multiple pages.
"""
def __init__(self, **kwargs):
"""
Initialize the FireCrawlCrawler.
Args:
**kwargs: Optional keyword arguments.
"""
super().__init__(**kwargs)
self.app = None
def crawl_url(
self,
url: str,
max_depth: Optional[int] = None,
limit: Optional[int] = None,
allow_backward_links: Optional[bool] = None,
) -> List[Document]:
"""
Dynamically crawls a URL using either scrape_url or crawl_url:
- Uses scrape_url for single-page extraction if no params are provided.
- Uses crawl_url to recursively gather pages when any param is provided.
Args:
url (str): The starting URL to crawl.
max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
limit (Optional[int]): Maximum number of pages to crawl (default: 20).
allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).
Returns:
List[Document]: List of Document objects with page content and metadata.
"""
# Lazy init
self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
# If the user only provides the URL (no crawl params), scrape a single page.
if max_depth is None and limit is None and allow_backward_links is None:
# Call the new Firecrawl API, passing formats directly
scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
data = scrape_response.model_dump()
return [
Document(
page_content=data.get("markdown", ""),
metadata={"reference": url, **data.get("metadata", {})},
)
]
# Otherwise, crawl multiple pages based on the provided params, applying defaults where unset.
crawl_response = self.app.crawl_url(
url=url,
limit=limit or 20,
max_depth=max_depth or 2,
allow_backward_links=allow_backward_links or False,
scrape_options=ScrapeOptions(formats=["markdown"]),
poll_interval=5,
)
items = crawl_response.model_dump().get("data", [])
documents: List[Document] = []
for item in items:
# Support items that are either dicts or Pydantic sub-models
item_dict = item.model_dump() if hasattr(item, "model_dump") else item
md = item_dict.get("markdown", "")
meta = item_dict.get("metadata", {})
meta["reference"] = meta.get("url", url)
documents.append(Document(page_content=md, metadata=meta))
return documents
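A usage sketch, assuming FIRECRAWL_API_KEY is exported and the firecrawl package is installed (URL and limit are illustrative):

# Sketch: single-page scrape vs. recursive crawl with FireCrawlCrawler.
from deepsearcher.loader.web_crawler import FireCrawlCrawler

crawler = FireCrawlCrawler()
single = crawler.crawl_url("https://example.com")            # no params -> scrape one page
site = crawler.crawl_url("https://example.com", limit=5)     # any param -> recursive crawl
print(len(single), "page;", len(site), "pages")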

62
deepsearcher/loader/web_crawler/jina_crawler.py

@ -0,0 +1,62 @@
import os
from typing import List
import requests
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
class JinaCrawler(BaseCrawler):
"""
Web crawler using Jina AI's rendering service.
This crawler uses Jina AI's rendering service to crawl web pages and convert them
into markdown format for further processing.
"""
def __init__(self, **kwargs):
"""
Initialize the JinaCrawler.
Args:
**kwargs: Optional keyword arguments.
Raises:
ValueError: If the JINA_API_TOKEN environment variable is not set.
"""
super().__init__(**kwargs)
self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
if not self.jina_api_token:
raise ValueError("Missing JINA_API_TOKEN environment variable")
def crawl_url(self, url: str) -> List[Document]:
"""
Crawl a single URL using Jina AI's rendering service.
Args:
url: The URL to crawl.
Returns:
A list containing a single Document object with the markdown content and metadata.
Raises:
HTTPError: If the request to Jina AI's service fails.
"""
jina_url = f"https://r.jina.ai/{url}"
headers = {
"Authorization": f"Bearer {self.jina_api_token}",
"X-Return-Format": "markdown",
}
response = requests.get(jina_url, headers=headers)
response.raise_for_status()
markdown_content = response.text
metadata = {
"reference": url,
"status_code": response.status_code,
"headers": dict(response.headers),
}
return [Document(page_content=markdown_content, metadata=metadata)]
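A usage sketch, assuming JINA_API_TOKEN (or JINAAI_API_KEY) is exported; the URL is illustrative:

# Sketch: render a page to markdown through r.jina.ai.
from deepsearcher.loader.web_crawler import JinaCrawler

crawler = JinaCrawler()
docs = crawler.crawl_url("https://example.com")  # illustrative URL
print(docs[0].metadata["status_code"], docs[0].page_content[:120])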

119
deepsearcher/offline_loading.py

@ -0,0 +1,119 @@
import os
from typing import List, Union
from tqdm import tqdm
# from deepsearcher.configuration import embedding_model, vector_db, file_loader
from deepsearcher import configuration
from deepsearcher.loader.splitter import split_docs_to_chunks
def load_from_local_files(
paths_or_directory: Union[str, List[str]],
collection_name: str = None,
collection_description: str = None,
force_new_collection: bool = False,
chunk_size: int = 1500,
chunk_overlap: int = 100,
batch_size: int = 256,
):
"""
Load knowledge from local files or directories into the vector database.
This function processes files from the specified paths or directories,
splits them into chunks, embeds the chunks, and stores them in the vector database.
Args:
paths_or_directory: A single path or a list of paths to files or directories to load.
collection_name: Name of the collection to store the data in. If None, uses the default collection.
collection_description: Description of the collection. If None, no description is set.
force_new_collection: If True, drops the existing collection and creates a new one.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
batch_size: Number of chunks to process at once during embedding.
Raises:
FileNotFoundError: If any of the specified paths do not exist.
"""
vector_db = configuration.vector_db
if collection_name is None:
collection_name = vector_db.default_collection
collection_name = collection_name.replace(" ", "_").replace("-", "_")
embedding_model = configuration.embedding_model
file_loader = configuration.file_loader
vector_db.init_collection(
dim=embedding_model.dimension,
collection=collection_name,
description=collection_description,
force_new_collection=force_new_collection,
)
if isinstance(paths_or_directory, str):
paths_or_directory = [paths_or_directory]
all_docs = []
for path in tqdm(paths_or_directory, desc="Loading files"):
if not os.path.exists(path):
raise FileNotFoundError(f"Error: File or directory '{path}' does not exist.")
if os.path.isdir(path):
docs = file_loader.load_directory(path)
else:
docs = file_loader.load_file(path)
all_docs.extend(docs)
# print("Splitting docs to chunks...")
chunks = split_docs_to_chunks(
all_docs,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
vector_db.insert_data(collection=collection_name, chunks=chunks)
def load_from_website(
urls: Union[str, List[str]],
collection_name: str = None,
collection_description: str = None,
force_new_collection: bool = False,
chunk_size: int = 1500,
chunk_overlap: int = 100,
batch_size: int = 256,
**crawl_kwargs,
):
"""
Load knowledge from websites into the vector database.
This function crawls the specified URLs, processes the content,
splits it into chunks, embeds the chunks, and stores them in the vector database.
Args:
urls: A single URL or a list of URLs to crawl.
collection_name: Name of the collection to store the data in. If None, uses the default collection.
collection_description: Description of the collection. If None, no description is set.
force_new_collection: If True, drops the existing collection and creates a new one.
chunk_size: Size of each chunk in characters.
chunk_overlap: Number of characters to overlap between chunks.
batch_size: Number of chunks to process at once during embedding.
**crawl_kwargs: Additional keyword arguments to pass to the web crawler.
"""
if isinstance(urls, str):
urls = [urls]
vector_db = configuration.vector_db
embedding_model = configuration.embedding_model
web_crawler = configuration.web_crawler
vector_db.init_collection(
dim=embedding_model.dimension,
collection=collection_name,
description=collection_description,
force_new_collection=force_new_collection,
)
all_docs = web_crawler.crawl_urls(urls, **crawl_kwargs)
chunks = split_docs_to_chunks(
all_docs,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
chunks = embedding_model.embed_chunks(chunks, batch_size=batch_size)
vector_db.insert_data(collection=collection_name, chunks=chunks)
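A usage sketch, assuming the global configuration (embedding model, vector DB, loaders, crawler) has already been initialized via deepsearcher.configuration; the path, URL, and collection names are illustrative:

# Sketch: ingest local files and a website into named collections.
from deepsearcher.offline_loading import load_from_local_files, load_from_website

load_from_local_files("./knowledge_base", collection_name="project_docs")  # hypothetical path
load_from_website("https://example.com/docs", collection_name="web_docs")  # illustrative URL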

96
deepsearcher/online_query.py

@ -0,0 +1,96 @@
from typing import List, Tuple
# from deepsearcher.configuration import vector_db, embedding_model, llm
from deepsearcher import configuration
from deepsearcher.vector_db.base import RetrievalResult
def query(original_query: str, max_iter: int = 3) -> Tuple[str, List[RetrievalResult], int]:
"""
Query the knowledge base with a question and get an answer.
This function uses the default searcher to query the knowledge base and generate
an answer based on the retrieved information.
Args:
original_query: The question or query to search for.
max_iter: Maximum number of iterations for the search process.
Returns:
A tuple containing:
- The generated answer as a string
- A list of retrieval results that were used to generate the answer
- The number of tokens consumed during the process
"""
default_searcher = configuration.default_searcher
return default_searcher.query(original_query, max_iter=max_iter)
def retrieve(
original_query: str, max_iter: int = 3
) -> Tuple[List[RetrievalResult], List[str], int]:
"""
Retrieve relevant information from the knowledge base without generating an answer.
This function uses the default searcher to retrieve information from the knowledge base
that is relevant to the query.
Args:
original_query: The question or query to search for.
max_iter: Maximum number of iterations for the search process.
Returns:
A tuple containing:
- A list of retrieval results
- An empty list (placeholder for future use)
- The number of tokens consumed during the process
"""
default_searcher = configuration.default_searcher
retrieved_results, consume_tokens, metadata = default_searcher.retrieve(
original_query, max_iter=max_iter
)
return retrieved_results, [], consume_tokens
def naive_retrieve(query: str, collection: str = None, top_k=10) -> List[RetrievalResult]:
"""
Perform a simple retrieval from the knowledge base using the naive RAG approach.
This function uses the naive RAG agent to retrieve information from the knowledge base
without any advanced techniques like iterative refinement.
Args:
query: The question or query to search for.
collection: The name of the collection to search in. If None, searches in all collections.
top_k: The maximum number of results to return.
Returns:
A list of retrieval results.
"""
naive_rag = configuration.naive_rag
all_retrieved_results, consume_tokens, _ = naive_rag.retrieve(query)
return all_retrieved_results
def naive_rag_query(
query: str, collection: str = None, top_k=10
) -> Tuple[str, List[RetrievalResult]]:
"""
Query the knowledge base using the naive RAG approach and get an answer.
This function uses the naive RAG agent to query the knowledge base and generate
an answer based on the retrieved information, without any advanced techniques.
Args:
query: The question or query to search for.
collection: The name of the collection to search in. If None, searches in all collections.
top_k: The maximum number of results to consider.
Returns:
A tuple containing:
- The generated answer as a string
- A list of retrieval results that were used to generate the answer
"""
naive_rag = configuration.naive_rag
answer, retrieved_results, consume_tokens = naive_rag.query(query)
return answer, retrieved_results
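A usage sketch, again assuming the configuration module has been initialized beforehand (the exact initialization call lives in deepsearcher/configuration.py and is not shown here); the question is illustrative:

# Sketch: ask a question against the loaded knowledge base.
from deepsearcher.online_query import query

answer, results, tokens = query("Which file formats does the Docling loader support?")
print(answer)
print(len(results), "supporting chunks;", tokens, "tokens consumed")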

0
deepsearcher/utils/__init__.py

160
deepsearcher/utils/log.py

@ -0,0 +1,160 @@
import logging
from termcolor import colored
class ColoredFormatter(logging.Formatter):
"""
A custom formatter for logging that adds colors to log messages.
This formatter adds colors to log messages based on their level,
making it easier to distinguish between different types of logs.
Attributes:
COLORS: A dictionary mapping log levels to colors.
"""
COLORS = {
"DEBUG": "cyan",
"INFO": "green",
"WARNING": "yellow",
"ERROR": "red",
"CRITICAL": "magenta",
}
def format(self, record):
"""
Format a log record with colors.
Args:
record: The log record to format.
Returns:
The formatted log message with colors.
"""
# the whole log line will be colored
log_message = super().format(record)
return colored(log_message, self.COLORS.get(record.levelname, "white"))
# only log level will be colored
# levelname_colored = colored(record.levelname, self.COLORS.get(record.levelname, 'white'))
# record.levelname = levelname_colored
# return super().format(record)
# only keywords will be colored
# message = record.msg
# for word, color in self.KEYWORDS.items():
# if word in message:
# message = message.replace(word, colored(word, color))
# record.msg = message
# return super().format(record)
# config log
dev_logger = logging.getLogger("dev")
dev_formatter = ColoredFormatter("%(asctime)s - %(levelname)s - %(message)s")
dev_handler = logging.StreamHandler()
dev_handler.setFormatter(dev_formatter)
dev_logger.addHandler(dev_handler)
dev_logger.setLevel(logging.INFO)
progress_logger = logging.getLogger("progress")
progress_handler = logging.StreamHandler()
progress_handler.setFormatter(ColoredFormatter("%(message)s"))
progress_logger.addHandler(progress_handler)
progress_logger.setLevel(logging.INFO)
dev_mode = False
def set_dev_mode(mode: bool):
"""
Set the development mode.
When in development mode, debug, info, and warning logs are displayed.
When not in development mode, only error and critical logs are displayed.
Args:
mode: True to enable development mode, False to disable it.
"""
global dev_mode
dev_mode = mode
def set_level(level):
"""
Set the logging level for the development logger.
Args:
level: The logging level to set (e.g., logging.DEBUG, logging.INFO).
"""
dev_logger.setLevel(level)
def debug(message):
"""
Log a debug message.
Args:
message: The message to log.
"""
if dev_mode:
dev_logger.debug(message)
def info(message):
"""
Log an info message.
Args:
message: The message to log.
"""
if dev_mode:
dev_logger.info(message)
def warning(message):
"""
Log a warning message.
Args:
message: The message to log.
"""
if dev_mode:
dev_logger.warning(message)
def error(message):
"""
Log an error message.
Args:
message: The message to log.
"""
dev_logger.error(message)
def critical(message):
"""
Log a critical message and raise a RuntimeError.
Args:
message: The message to log.
Raises:
RuntimeError: Always raised with the provided message.
"""
dev_logger.critical(message)
raise RuntimeError(message)
def color_print(message, **kwargs):
"""
Print a colored message to the progress logger.
Args:
message: The message to print.
**kwargs: Additional keyword arguments to pass to the logger.
"""
progress_logger.info(message)
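A usage sketch of the two loggers: developer logs are gated by dev mode, while color_print always goes to the progress logger:

# Sketch: developer vs. progress logging.
from deepsearcher.utils import log

log.set_dev_mode(True)                        # enable debug/info/warning output
log.info("loading collection 'project_docs'")
log.color_print("==> progress messages are shown regardless of dev mode")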

6
deepsearcher/vector_db/__init__.py

@ -0,0 +1,6 @@
from .azure_search import AzureSearch
from .milvus import Milvus, RetrievalResult
from .oracle import OracleDB
from .qdrant import Qdrant
__all__ = ["Milvus", "RetrievalResult", "OracleDB", "Qdrant", "AzureSearch"]

279
deepsearcher/vector_db/azure_search.py

@ -0,0 +1,279 @@
import uuid
from typing import Any, Dict, List, Optional
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
class AzureSearch(BaseVectorDB):
def __init__(self, endpoint, index_name, api_key, vector_field):
super().__init__(default_collection=index_name)
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
self.client = SearchClient(
endpoint=endpoint,
index_name=index_name,
credential=AzureKeyCredential(api_key),
)
self.vector_field = vector_field
self.endpoint = endpoint
self.index_name = index_name
self.api_key = api_key
def init_collection(self):
"""Initialize Azure Search index with proper schema"""
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceNotFoundError
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
SearchableField,
SearchField,
SearchIndex,
SimpleField,
)
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
# Create the index (simplified for compatibility with older SDK versions)
fields = [
SimpleField(name="id", type="Edm.String", key=True),
SearchableField(name="content", type="Edm.String"),
SearchField(
name="content_vector",
type="Collection(Edm.Single)",
searchable=True,
vector_search_dimensions=1536,
),
]
# Create index with fields
index = SearchIndex(name=self.index_name, fields=fields)
try:
# Try to delete existing index
try:
index_client.delete_index(self.index_name)
except ResourceNotFoundError:
pass
# Create the index
index_client.create_index(index)
except Exception as e:
print(f"Error creating index: {str(e)}")
def insert_data(self, documents: List[dict]):
"""Batch insert documents with vector embeddings"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
search_client = SearchClient(
endpoint=self.endpoint,
index_name=self.index_name,
credential=AzureKeyCredential(self.api_key),
)
actions = [
{
"@search.action": "upload" if doc.get("id") else "merge",
"id": doc.get("id", str(uuid.uuid4())),
"content": doc["text"],
"content_vector": doc["vector"],
}
for doc in documents
]
result = search_client.upload_documents(actions)
return [x.succeeded for x in result]
def search_data(
self, collection: Optional[str], vector: List[float], top_k: int = 50
) -> List[RetrievalResult]:
"""Azure Cognitive Search implementation with compatibility for older SDK versions"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
search_client = SearchClient(
endpoint=self.endpoint,
index_name=collection or self.index_name,
credential=AzureKeyCredential(self.api_key),
)
# Validate that vector is not empty
if not vector or len(vector) == 0:
print("Error: Empty vector provided for search. Vector must have 1536 dimensions.")
return []
# Debug vector and field info
print(f"Vector length for search: {len(vector)}")
print(f"Vector field name: {self.vector_field}")
# Ensure vector has the right dimensions
if len(vector) != 1536:
print(f"Warning: Vector length {len(vector)} does not match expected 1536 dimensions")
return []
# Execute search with direct parameters - simpler approach
try:
print(f"Executing search with top_k={top_k}")
# Directly use the search_by_vector method for compatibility
body = {
"search": "*",
"select": "id,content",
"top": top_k,
"vectorQueries": [
{
"vector": vector,
"fields": self.vector_field,
"k": top_k,
"kind": "vector",
}
],
}
# Print the search request body for debugging
print(f"Search request body: {body}")
# Use the REST API directly
result = search_client._client.documents.search_post(
search_request=body, headers={"api-key": self.api_key}
)
# Format results
search_results = []
if hasattr(result, "results"):
for doc in result.results:
try:
doc_dict = doc.as_dict() if hasattr(doc, "as_dict") else doc
content = doc_dict.get("content", "")
doc_id = doc_dict.get("id", "")
score = doc_dict.get("@search.score", 0.0)
retrieval_result = RetrievalResult(
embedding=[],  # We don't get the vectors back
text=content,
reference=doc_id,
metadata={"source": doc_id},
score=score,
)
search_results.append(retrieval_result)
except Exception as e:
print(f"Error processing result: {str(e)}")
return search_results
except Exception as e:
print(f"Search error: {str(e)}")
# Try another approach if the first one fails
try:
print("Trying alternative search method...")
results = search_client.search(search_text="*", select=["id", "content"], top=top_k)
# Process results
alt_results = []
for doc in results:
try:
# Handle different result formats
if isinstance(doc, dict):
content = doc.get("content", "")
doc_id = doc.get("id", "")
score = doc.get("@search.score", 0.0)
else:
content = getattr(doc, "content", "")
doc_id = getattr(doc, "id", "")
score = getattr(doc, "@search.score", 0.0)
result = RetrievalResult(
embedding=[],
text=content,
reference=doc_id,
metadata={"source": doc_id},
score=score,
)
alt_results.append(result)
except Exception as e:
print(f"Error processing result: {str(e)}")
return alt_results
except Exception as e:
print(f"Alternative search failed: {str(e)}")
return []
def clear_db(self):
"""Delete all documents in the index"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
search_client = SearchClient(
endpoint=self.endpoint,
index_name=self.index_name,
credential=AzureKeyCredential(self.api_key),
)
docs = search_client.search(search_text="*", include_total_count=True, select=["id"])
ids = [doc["id"] for doc in docs]
if ids:
search_client.delete_documents([{"id": id} for id in ids])
return len(ids)
def get_all_collections(self) -> List[str]:
"""List all search indices in Azure Cognitive Search"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
try:
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
return [index.name for index in index_client.list_indexes()]
except Exception as e:
print(f"Failed to list indices: {str(e)}")
return []
def get_collection_info(self, name: str) -> Dict[str, Any]:
"""Retrieve index metadata"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
return index_client.get_index(name).__dict__
def collection_exists(self, name: str) -> bool:
"""Check index existence"""
from azure.core.exceptions import ResourceNotFoundError
try:
self.get_collection_info(name)
return True
except ResourceNotFoundError:
return False
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""List all Azure Search indices with metadata"""
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
try:
index_client = SearchIndexClient(
endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key)
)
collections = []
for index in index_client.list_indexes():
collections.append(
CollectionInfo(
collection_name=index.name,
description=f"Azure Search Index with {len(index.fields) if hasattr(index, 'fields') else 0} fields",
)
)
return collections
except Exception as e:
print(f"Collection listing failed: {str(e)}")
return []

207
deepsearcher/vector_db/base.py

@ -0,0 +1,207 @@
from abc import ABC, abstractmethod
from typing import List, Union
import numpy as np
from deepsearcher.loader.splitter import Chunk
class RetrievalResult:
"""
Represents a result retrieved from the vector database.
This class encapsulates the information about a retrieved document,
including its embedding, text content, reference, metadata, and similarity score.
Attributes:
embedding: The vector embedding of the document.
text: The text content of the document.
reference: A reference to the source of the document.
metadata: Additional metadata associated with the document.
score: The similarity score of the document to the query.
"""
def __init__(
self,
embedding: np.array,
text: str,
reference: str,
metadata: dict,
score: float = 0.0,
):
"""
Initialize a RetrievalResult object.
Args:
embedding: The vector embedding of the document.
text: The text content of the document.
reference: A reference to the source of the document.
metadata: Additional metadata associated with the document.
score: The similarity score of the document to the query. Defaults to 0.0.
"""
self.embedding = embedding
self.text = text
self.reference = reference
self.metadata = metadata
self.score: float = score
def __repr__(self):
"""
Return a string representation of the RetrievalResult.
Returns:
A string representation of the RetrievalResult object.
"""
return f"RetrievalResult(score={self.score}, embedding={self.embedding}, text={self.text}, reference={self.reference}, metadata={self.metadata})"
def deduplicate_results(results: List[RetrievalResult]) -> List[RetrievalResult]:
"""
Remove duplicate results based on text content.
This function removes duplicate results from a list of RetrievalResult objects
by keeping only the first occurrence of each unique text content.
Args:
results: A list of RetrievalResult objects to deduplicate.
Returns:
A list of deduplicated RetrievalResult objects.
"""
all_text_set = set()
deduplicated_results = []
for result in results:
if result.text not in all_text_set:
all_text_set.add(result.text)
deduplicated_results.append(result)
return deduplicated_results
class CollectionInfo:
"""
Represents information about a collection in the vector database.
This class encapsulates the name and description of a collection.
Attributes:
collection_name: The name of the collection.
description: The description of the collection.
"""
def __init__(self, collection_name: str, description: str):
"""
Initialize a CollectionInfo object.
Args:
collection_name: The name of the collection.
description: The description of the collection.
"""
self.collection_name = collection_name
self.description = description
class BaseVectorDB(ABC):
"""
Abstract base class for vector database implementations.
This class defines the interface for vector database implementations,
including methods for initializing collections, inserting data, searching,
listing collections, and clearing the database.
Attributes:
default_collection: The name of the default collection.
"""
def __init__(
self,
default_collection: str = "deepsearcher",
*args,
**kwargs,
):
"""
Initialize a BaseVectorDB object.
Args:
default_collection: The name of the default collection. Defaults to "deepsearcher".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
self.default_collection = default_collection
@abstractmethod
def init_collection(
self,
dim: int,
collection: str,
description: str,
force_new_collection=False,
*args,
**kwargs,
):
"""
Initialize a collection in the vector database.
Args:
dim: The dimensionality of the vectors in the collection.
collection: The name of the collection.
description: The description of the collection.
force_new_collection: If True, drop the existing collection and create a new one.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
pass
@abstractmethod
def insert_data(self, collection: str, chunks: List[Chunk], *args, **kwargs):
"""
Insert data into a collection in the vector database.
Args:
collection: The name of the collection.
chunks: A list of Chunk objects to insert.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
pass
@abstractmethod
def search_data(
self, collection: str, vector: Union[np.array, List[float]], *args, **kwargs
) -> List[RetrievalResult]:
"""
Search for similar vectors in a collection.
Args:
collection: The name of the collection.
vector: The query vector to search for.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
A list of RetrievalResult objects representing the search results.
"""
pass
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the vector database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
A list of CollectionInfo objects representing the collections.
"""
pass
@abstractmethod
def clear_db(self, *args, **kwargs):
"""
Clear the vector database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
pass

305
deepsearcher/vector_db/milvus.py

@ -0,0 +1,305 @@
from typing import List, Optional, Union
import numpy as np
from pymilvus import AnnSearchRequest, DataType, Function, FunctionType, MilvusClient, RRFRanker
from deepsearcher.loader.splitter import Chunk
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
class Milvus(BaseVectorDB):
"""Milvus class is a subclass of DB class."""
client: MilvusClient = None
def __init__(
self,
default_collection: str = "deepsearcher",
uri: str = "http://localhost:19530",
token: str = "root:Milvus",
user: str = "",
password: str = "",
db: str = "default",
hybrid: bool = False,
**kwargs,
):
"""
Initialize the Milvus client.
Args:
default_collection (str, optional): Default collection name. Defaults to "deepsearcher".
uri (str, optional): URI for connecting to Milvus server. Defaults to "http://localhost:19530".
token (str, optional): Authentication token for Milvus. Defaults to "root:Milvus".
user (str, optional): Username for authentication. Defaults to "".
password (str, optional): Password for authentication. Defaults to "".
db (str, optional): Database name. Defaults to "default".
hybrid (bool, optional): Whether to enable hybrid search. Defaults to False.
**kwargs: Additional keyword arguments to pass to the MilvusClient.
"""
super().__init__(default_collection)
self.default_collection = default_collection
self.client = MilvusClient(
uri=uri, user=user, password=password, token=token, db_name=db, timeout=30, **kwargs
)
self.hybrid = hybrid
def init_collection(
self,
dim: int,
collection: Optional[str] = "deepsearcher",
description: Optional[str] = "",
force_new_collection: bool = False,
text_max_length: int = 65_535,
reference_max_length: int = 2048,
metric_type: str = "L2",
*args,
**kwargs,
):
"""
Initialize a collection in Milvus.
Args:
dim (int): Dimension of the vector embeddings.
collection (Optional[str], optional): Collection name. Defaults to "deepsearcher".
description (Optional[str], optional): Collection description. Defaults to "".
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False.
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535.
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048.
metric_type (str, optional): Metric type for vector similarity search. Defaults to "L2".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
if description is None:
description = ""
self.metric_type = metric_type
try:
has_collection = self.client.has_collection(collection, timeout=5)
if force_new_collection and has_collection:
self.client.drop_collection(collection)
elif has_collection:
return
schema = self.client.create_schema(
enable_dynamic_field=False, auto_id=True, description=description
)
schema.add_field("id", DataType.INT64, is_primary=True)
schema.add_field("embedding", DataType.FLOAT_VECTOR, dim=dim)
if self.hybrid:
analyzer_params = {"tokenizer": "standard", "filter": ["lowercase"]}
schema.add_field(
"text",
DataType.VARCHAR,
max_length=text_max_length,
analyzer_params=analyzer_params,
enable_match=True,
enable_analyzer=True,
)
else:
schema.add_field("text", DataType.VARCHAR, max_length=text_max_length)
schema.add_field("reference", DataType.VARCHAR, max_length=reference_max_length)
schema.add_field("metadata", DataType.JSON)
if self.hybrid:
schema.add_field("sparse_vector", DataType.SPARSE_FLOAT_VECTOR)
bm25_function = Function(
name="bm25",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names="sparse_vector",
)
schema.add_function(bm25_function)
index_params = self.client.prepare_index_params()
index_params.add_index(field_name="embedding", metric_type=metric_type)
if self.hybrid:
index_params.add_index(
field_name="sparse_vector",
index_type="SPARSE_INVERTED_INDEX",
metric_type="BM25",
)
self.client.create_collection(
collection,
schema=schema,
index_params=index_params,
consistency_level="Strong",
)
log.color_print(f"create collection [{collection}] successfully")
except Exception as e:
log.critical(f"fail to init db for milvus, error info: {e}")
def insert_data(
self,
collection: Optional[str],
chunks: List[Chunk],
batch_size: int = 256,
*args,
**kwargs,
):
"""
Insert data into a Milvus collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
chunks (List[Chunk]): List of Chunk objects to insert.
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
texts = [chunk.text for chunk in chunks]
references = [chunk.reference for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
embeddings = [chunk.embedding for chunk in chunks]
datas = [
{
"embedding": embedding,
"text": text,
"reference": reference,
"metadata": metadata,
}
for embedding, text, reference, metadata in zip(
embeddings, texts, references, metadatas
)
]
batch_datas = [datas[i : i + batch_size] for i in range(0, len(datas), batch_size)]
try:
for batch_data in batch_datas:
self.client.insert(collection_name=collection, data=batch_data)
except Exception as e:
log.critical(f"fail to insert data, error info: {e}")
def search_data(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
query_text: Optional[str] = None,
*args,
**kwargs,
) -> List[RetrievalResult]:
"""
Search for similar vectors in a Milvus collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
query_text (Optional[str], optional): Original query text for hybrid search. Defaults to None.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[RetrievalResult]: List of retrieval results containing similar vectors.
"""
if not collection:
collection = self.default_collection
try:
use_hybrid = self.hybrid and query_text
if use_hybrid:
sparse_search_params = {"metric_type": "BM25"}
sparse_request = AnnSearchRequest(
[query_text], "sparse_vector", sparse_search_params, limit=top_k
)
dense_search_params = {"metric_type": self.metric_type}
dense_request = AnnSearchRequest(
[vector], "embedding", dense_search_params, limit=top_k
)
search_results = self.client.hybrid_search(
collection_name=collection,
reqs=[sparse_request, dense_request],
ranker=RRFRanker(),
limit=top_k,
output_fields=["embedding", "text", "reference", "metadata"],
timeout=10,
)
else:
search_results = self.client.search(
collection_name=collection,
data=[vector],
limit=top_k,
output_fields=["embedding", "text", "reference", "metadata"],
timeout=10,
)
return [
RetrievalResult(
embedding=b["entity"]["embedding"],
text=b["entity"]["text"],
reference=b["entity"]["reference"],
score=b["distance"],
metadata=b["entity"]["metadata"],
)
for a in search_results
for b in a
]
except Exception as e:
log.critical(f"fail to search data, error info: {e}")
return []
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the Milvus database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[CollectionInfo]: List of collection information objects.
"""
collection_infos = []
dim = kwargs.pop("dim", 0)
try:
collections = self.client.list_collections()
for collection in collections:
description = self.client.describe_collection(collection)
if dim != 0:
skip = False
for field_dict in description["fields"]:
if (
field_dict["name"] == "embedding"
and field_dict["type"] == DataType.FLOAT_VECTOR
):
if field_dict["params"]["dim"] != dim:
skip = True
if skip:
continue
collection_infos.append(
CollectionInfo(
collection_name=collection,
description=description["description"],
)
)
except Exception as e:
log.critical(f"fail to list collections, error info: {e}")
return collection_infos
def clear_db(self, collection: str = "deepsearcher", *args, **kwargs):
"""
Clear (drop) a collection from the Milvus database.
Args:
collection (str, optional): Collection name to drop. Defaults to "deepsearcher".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
try:
self.client.drop_collection(collection)
except Exception as e:
log.warning(f"fail to clear db, error info: {e}")

536
deepsearcher/vector_db/oracle.py

@ -0,0 +1,536 @@
import array
import json
from typing import List, Optional, Union
import numpy as np
from deepsearcher.loader.splitter import Chunk
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
class OracleDB(BaseVectorDB):
"""OracleDB class is a subclass of DB class."""
client = None
def __init__(
self,
user: str,
password: str,
dsn: str,
config_dir: str,
wallet_location: str,
wallet_password: str,
min: int = 1,
max: int = 10,
increment: int = 1,
default_collection: str = "deepsearcher",
):
"""
Initialize the Oracle database connection.
Args:
user (str): Oracle database username.
password (str): Oracle database password.
dsn (str): Oracle database connection string.
config_dir (str): Directory containing Oracle configuration files.
wallet_location (str): Location of the Oracle wallet.
wallet_password (str): Password for the Oracle wallet.
min (int, optional): Minimum number of connections in the pool. Defaults to 1.
max (int, optional): Maximum number of connections in the pool. Defaults to 10.
increment (int, optional): Increment for adding new connections. Defaults to 1.
default_collection (str, optional): Default collection name. Defaults to "deepsearcher".
"""
super().__init__(default_collection)
self.default_collection = default_collection
import oracledb
oracledb.defaults.fetch_lobs = False
self.DB_TYPE_VECTOR = oracledb.DB_TYPE_VECTOR
try:
self.client = oracledb.create_pool(
user=user,
password=password,
dsn=dsn,
config_dir=config_dir,
wallet_location=wallet_location,
wallet_password=wallet_password,
min=min,
max=max,
increment=increment,
)
log.color_print(f"Connected to Oracle database at {dsn}")
self.check_table()
except Exception as e:
log.critical(f"Failed to connect to Oracle database at {dsn}")
log.critical(f"Oracle database error in init: {e}")
raise
def numpy_converter_in(self, value):
"""Convert numpy array to array.array"""
if value.dtype == np.float64:
dtype = "d"
elif value.dtype == np.float32:
dtype = "f"
else:
dtype = "b"
return array.array(dtype, value)
def input_type_handler(self, cursor, value, arraysize):
"""Set the type handler for the input data"""
if isinstance(value, np.ndarray):
return cursor.var(
self.DB_TYPE_VECTOR,
arraysize=arraysize,
inconverter=self.numpy_converter_in,
)
def numpy_converter_out(self, value):
"""Convert array.array to numpy array"""
if value.typecode == "b":
dtype = np.int8
elif value.typecode == "f":
dtype = np.float32
else:
dtype = np.float64
return np.array(value, copy=False, dtype=dtype)
def output_type_handler(self, cursor, metadata):
"""Set the type handler for the output data"""
if metadata.type_code is self.DB_TYPE_VECTOR:
return cursor.var(
metadata.type_code,
arraysize=cursor.arraysize,
outconverter=self.numpy_converter_out,
)
def query(self, sql: str, params: dict = None) -> Union[dict, None]:
"""
Execute a SQL query and return the results.
Args:
sql (str): SQL query to execute.
params (dict, optional): Parameters for the SQL query. Defaults to None.
Returns:
Union[dict, None]: Query results as a dictionary or None if no results.
Raises:
Exception: If there's an error executing the query.
"""
with self.client.acquire() as connection:
connection.inputtypehandler = self.input_type_handler
connection.outputtypehandler = self.output_type_handler
with connection.cursor() as cursor:
try:
if log.dev_mode:
print("sql:\n", sql)
# log.debug("def query:"+params)
# print("sql:\n",sql)
# print("params:\n",params)
cursor.execute(sql, params)
except Exception as e:
log.critical(f"Oracle database error in query: {e}")
raise
columns = [column[0].lower() for column in cursor.description]
rows = cursor.fetchall()
if rows:
data = [dict(zip(columns, row)) for row in rows]
else:
data = []
if log.dev_mode:
print("data:\n", data)
return data
def execute(self, sql: str, data: Union[list, dict] = None):
"""
Execute a SQL statement without returning results.
Args:
sql (str): SQL statement to execute.
data (Union[list, dict], optional): Data for the SQL statement. Defaults to None.
Raises:
Exception: If there's an error executing the statement.
"""
try:
with self.client.acquire() as connection:
connection.inputtypehandler = self.input_type_handler
connection.outputtypehandler = self.output_type_handler
with connection.cursor() as cursor:
# print("sql:\n",sql)
# print("data:\n",data)
if data is None:
cursor.execute(sql)
else:
cursor.execute(sql, data)
connection.commit()
except Exception as e:
log.critical(f"Oracle database error in execute: {e}")
log.error("ERROR sql:\n" + sql)
log.error("ERROR data:\n" + data)
raise
def has_collection(self, collection: str = "deepsearcher"):
"""
Check if a collection exists in the database.
Args:
collection (str, optional): Collection name to check. Defaults to "deepsearcher".
Returns:
bool: True if the collection exists, False otherwise.
"""
SQL = SQL_TEMPLATES["has_collection"]
params = {"collection": collection}
res = self.query(SQL, params)
if res:
if res[0]["rowcnt"] > 0:
return True
else:
return False
else:
return False
def check_table(self):
"""
Check if required tables exist and create them if they don't.
Raises:
Exception: If there's an error checking or creating tables.
"""
SQL = SQL_TEMPLATES["has_table"]
try:
res = self.query(SQL)
            if len(res) < len(TABLES):
missing_table = TABLES.keys() - set([i["table_name"] for i in res])
for table in missing_table:
self.create_tables(table)
except Exception as e:
log.critical(f"Failed to check table in Oracle database, error info: {e}")
raise
def create_tables(self, table_name):
"""
Create a table in the database.
Args:
table_name: Name of the table to create.
Raises:
Exception: If there's an error creating the table.
"""
SQL = TABLES[table_name]
try:
self.execute(SQL)
log.color_print(f"Created table {table_name} in Oracle database")
except Exception as e:
log.critical(f"Failed to create table {table_name} in Oracle database, error info: {e}")
raise
def drop_collection(self, collection: str = "deepsearcher"):
"""
Drop a collection from the database.
Args:
collection (str, optional): Collection name to drop. Defaults to "deepsearcher".
Raises:
Exception: If there's an error dropping the collection.
"""
try:
params = {"collection": collection}
SQL = SQL_TEMPLATES["drop_collection"]
self.execute(SQL, params)
SQL = SQL_TEMPLATES["drop_collection_item"]
self.execute(SQL, params)
log.color_print(f"Collection {collection} dropped")
except Exception as e:
log.critical(f"fail to drop collection, error info: {e}")
raise
def insertone(self, data):
"""
Insert a single record into the database.
Args:
data: Data to insert.
"""
SQL = SQL_TEMPLATES["insert"]
self.execute(SQL, data)
log.debug("insert done!")
def searchone(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
):
"""
Search for similar vectors in a collection.
Args:
collection (Optional[str]): Collection name to search in.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
Returns:
list: List of search results.
Raises:
Exception: If there's an error during search.
"""
log.debug("def searchone:" + collection)
try:
if isinstance(vector, List):
vector = np.array(vector)
embedding_string = "[" + ", ".join(map(str, vector.tolist())) + "]"
dimension = vector.shape[0]
dtype = str(vector.dtype).upper()
SQL = SQL_TEMPLATES["search"].format(dimension=dimension, dtype=dtype)
max_distance = 0.8
params = {
"collection": collection,
"embedding_string": embedding_string,
"top_k": top_k,
"max_distance": max_distance,
}
res = self.query(SQL, params)
if res:
return res
else:
return []
except Exception as e:
log.critical(f"fail to search data, error info: {e}")
raise
def init_collection(
self,
dim: int,
collection: Optional[str] = "deepsearcher",
description: Optional[str] = "",
force_new_collection: bool = False,
text_max_length: int = 65_535,
reference_max_length: int = 2048,
metric_type: str = "L2",
*args,
**kwargs,
):
"""
Initialize a collection in the database.
Args:
dim (int): Dimension of the vector embeddings.
collection (Optional[str], optional): Collection name. Defaults to "deepsearcher".
description (Optional[str], optional): Collection description. Defaults to "".
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False.
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535.
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048.
metric_type (str, optional): Metric type for vector similarity search. Defaults to "L2".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Raises:
Exception: If there's an error initializing the collection.
"""
if not collection:
collection = self.default_collection
if description is None:
description = ""
try:
has_collection = self.has_collection(collection)
if force_new_collection and has_collection:
self.drop_collection(collection)
elif has_collection:
return
# insert collection info
SQL = SQL_TEMPLATES["insert_collection"]
params = {"collection": collection, "description": description}
self.execute(SQL, params)
except Exception as e:
log.critical(f"fail to init_collection for oracle, error info: {e}")
def insert_data(
self,
collection: Optional[str],
chunks: List[Chunk],
batch_size: int = 256,
*args,
**kwargs,
):
"""
Insert data into a collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
chunks (List[Chunk]): List of Chunk objects to insert.
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Raises:
Exception: If there's an error inserting data.
"""
if not collection:
collection = self.default_collection
datas = []
for chunk in chunks:
_data = {
"embedding": self.numpy_converter_in(np.array(chunk.embedding)),
"text": chunk.text,
"reference": chunk.reference,
"metadata": json.dumps(chunk.metadata),
"collection": collection,
}
datas.append(_data)
batch_datas = [datas[i : i + batch_size] for i in range(0, len(datas), batch_size)]
try:
for batch_data in batch_datas:
for _data in batch_data:
self.insertone(data=_data)
log.color_print(f"Successfully insert {len(datas)} data")
except Exception as e:
log.critical(f"fail to insert data, error info: {e}")
raise
def search_data(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
*args,
**kwargs,
) -> List[RetrievalResult]:
"""
Search for similar vectors in a collection.
Args:
collection (Optional[str]): Collection name. If None, uses default_collection.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[RetrievalResult]: List of retrieval results containing similar vectors.
Raises:
Exception: If there's an error during search.
"""
if not collection:
collection = self.default_collection
try:
# print("def search_data:",collection)
# print("def search_data:",type(vector))
search_results = self.searchone(collection=collection, vector=vector, top_k=top_k)
# print("def search_data: search_results",search_results)
return [
RetrievalResult(
embedding=b["embedding"],
text=b["text"],
reference=b["reference"],
score=b["distance"],
metadata=json.loads(b["metadata"]),
)
for b in search_results
]
except Exception as e:
log.critical(f"fail to search data, error info: {e}")
raise
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[CollectionInfo]: List of collection information objects.
"""
collection_infos = []
try:
SQL = SQL_TEMPLATES["list_collections"]
log.debug("def list_collections:" + SQL)
collections = self.query(SQL)
if collections:
for collection in collections:
collection_infos.append(
CollectionInfo(
collection_name=collection["collection"],
description=collection["description"],
)
)
return collection_infos
except Exception as e:
log.critical(f"fail to list collections, error info: {e}")
raise
def clear_db(self, collection: str = "deepsearcher", *args, **kwargs):
"""
Clear (drop) a collection from the database.
Args:
collection (str, optional): Collection name to drop. Defaults to "deepsearcher".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
if not collection:
collection = self.default_collection
try:
            self.drop_collection(collection)
except Exception as e:
log.warning(f"fail to clear db, error info: {e}")
raise
TABLES = {
"DEEPSEARCHER_COLLECTION_INFO": """CREATE TABLE DEEPSEARCHER_COLLECTION_INFO (
id INT generated by default as identity primary key,
collection varchar(256),
description CLOB,
status NUMBER DEFAULT 1,
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updatetime TIMESTAMP DEFAULT NULL)""",
"DEEPSEARCHER_COLLECTION_ITEM": """CREATE TABLE DEEPSEARCHER_COLLECTION_ITEM (
id INT generated by default as identity primary key,
collection varchar(256),
embedding VECTOR,
text CLOB,
reference varchar(4000),
metadata CLOB,
status NUMBER DEFAULT 1,
createtime TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updatetime TIMESTAMP DEFAULT NULL)""",
}
SQL_TEMPLATES = {
"has_table": f"""SELECT table_name FROM all_tables
WHERE table_name in ({",".join([f"'{k}'" for k in TABLES.keys()])})""",
"has_collection": "select count(*) as rowcnt from DEEPSEARCHER_COLLECTION_INFO where collection=:collection and status=1",
"list_collections": "select collection,description from DEEPSEARCHER_COLLECTION_INFO where status=1",
"drop_collection": "update DEEPSEARCHER_COLLECTION_INFO set status=0 where collection=:collection and status=1",
"drop_collection_item": "update DEEPSEARCHER_COLLECTION_ITEM set status=0 where collection=:collection and status=1",
"insert_collection": """INSERT INTO DEEPSEARCHER_COLLECTION_INFO (collection,description)
values (:collection,:description)""",
"insert": """INSERT INTO DEEPSEARCHER_COLLECTION_ITEM (collection,embedding,text,reference,metadata)
values (:collection,:embedding,:text,:reference,:metadata)""",
"search": """SELECT * FROM
(SELECT t.*,
VECTOR_DISTANCE(t.embedding,vector(:embedding_string,{dimension},{dtype}),COSINE) as distance
FROM DEEPSEARCHER_COLLECTION_ITEM t
JOIN DEEPSEARCHER_COLLECTION_INFO c ON t.collection=c.collection
WHERE t.collection=:collection AND t.status=1 AND c.status=1)
WHERE distance<:max_distance ORDER BY distance ASC FETCH FIRST :top_k ROWS ONLY""",
}
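A brief, hedged sketch of the Oracle backend above; every connection value below is a placeholder for your own Autonomous Database wallet settings. Note that `drop_collection` here is a soft delete: rows in both tables are flagged with `status=0` rather than removed.

```python
from deepsearcher.vector_db.oracle import OracleDB

# Placeholder credentials and wallet paths; requires the python-oracledb package (pip install oracledb)
db = OracleDB(
    user="your_user",
    password="your_password",
    dsn="your_db_low",
    config_dir="/path/to/wallet",
    wallet_location="/path/to/wallet",
    wallet_password="your_wallet_password",
)
db.init_collection(dim=4, collection="demo")

# Cosine VECTOR_DISTANCE search over DEEPSEARCHER_COLLECTION_ITEM, filtered by the 0.8 max_distance cutoff
results = db.search_data(collection="demo", vector=[0.1, 0.2, 0.3, 0.4], top_k=3)
```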

290
deepsearcher/vector_db/qdrant.py

@ -0,0 +1,290 @@
import uuid
from typing import List, Optional, Union
import numpy as np
from deepsearcher.loader.splitter import Chunk
from deepsearcher.utils import log
from deepsearcher.vector_db.base import BaseVectorDB, CollectionInfo, RetrievalResult
DEFAULT_COLLECTION_NAME = "deepsearcher"
TEXT_PAYLOAD_KEY = "text"
REFERENCE_PAYLOAD_KEY = "reference"
METADATA_PAYLOAD_KEY = "metadata"
class Qdrant(BaseVectorDB):
"""Vector DB implementation powered by [Qdrant](https://qdrant.tech/)"""
def __init__(
self,
location: Optional[str] = None,
url: Optional[str] = None,
port: Optional[int] = 6333,
grpc_port: int = 6334,
prefer_grpc: bool = False,
https: Optional[bool] = None,
api_key: Optional[str] = None,
prefix: Optional[str] = None,
timeout: Optional[int] = None,
host: Optional[str] = None,
path: Optional[str] = None,
default_collection: str = DEFAULT_COLLECTION_NAME,
):
"""
Initialize the Qdrant client with flexible connection options.
Args:
location (Optional[str], optional):
- If ":memory:" - use in-memory Qdrant instance.
- If str - use it as a URL parameter.
- If None - use default values for host and port.
Defaults to None.
url (Optional[str], optional):
URL for Qdrant service, can include scheme, host, port, and prefix.
Allows flexible connection string specification.
Defaults to None.
port (Optional[int], optional):
Port of the REST API interface.
Defaults to 6333.
grpc_port (int, optional):
Port of the gRPC interface.
Defaults to 6334.
prefer_grpc (bool, optional):
If True, use gRPC interface whenever possible in custom methods.
Defaults to False.
https (Optional[bool], optional):
If True, use HTTPS (SSL) protocol.
Defaults to None.
api_key (Optional[str], optional):
API key for authentication in Qdrant Cloud.
Defaults to None.
prefix (Optional[str], optional):
If not None, add prefix to the REST URL path.
Example: 'service/v1' results in 'http://localhost:6333/service/v1/{qdrant-endpoint}'
Defaults to None.
timeout (Optional[int], optional):
Timeout for REST and gRPC API requests.
Default is 5 seconds for REST and unlimited for gRPC.
Defaults to None.
host (Optional[str], optional):
Host name of Qdrant service.
If url and host are None, defaults to 'localhost'.
Defaults to None.
path (Optional[str], optional):
Persistence path for QdrantLocal.
Defaults to None.
default_collection (str, optional):
Default collection name to be used.
"""
try:
from qdrant_client import QdrantClient
except ImportError as original_error:
raise ImportError(
"Qdrant client is not installed. Install it using: pip install qdrant-client\n"
) from original_error
super().__init__(default_collection)
self.client = QdrantClient(
location=location,
url=url,
port=port,
grpc_port=grpc_port,
prefer_grpc=prefer_grpc,
https=https,
api_key=api_key,
prefix=prefix,
timeout=timeout,
host=host,
path=path,
)
def init_collection(
self,
dim: int,
collection: Optional[str] = None,
description: Optional[str] = "",
force_new_collection: bool = False,
text_max_length: int = 65_535,
reference_max_length: int = 2048,
distance_metric: str = "Cosine",
*args,
**kwargs,
):
"""
Initialize a collection in Qdrant.
Args:
dim (int): Dimension of the vector embeddings.
collection (Optional[str], optional): Collection name.
description (Optional[str], optional): Collection description. Defaults to "".
force_new_collection (bool, optional): Whether to force create a new collection if it already exists. Defaults to False.
text_max_length (int, optional): Maximum length for text field. Defaults to 65_535.
reference_max_length (int, optional): Maximum length for reference field. Defaults to 2048.
distance_metric (str, optional): Metric type for vector similarity search. Defaults to "Cosine".
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
from qdrant_client import models
collection = collection or self.default_collection
try:
collection_exists = self.client.collection_exists(collection_name=collection)
if force_new_collection and collection_exists:
self.client.delete_collection(collection_name=collection)
collection_exists = False
if not collection_exists:
self.client.create_collection(
collection_name=collection,
vectors_config=models.VectorParams(size=dim, distance=distance_metric),
*args,
**kwargs,
)
log.color_print(f"Created collection [{collection}] successfully")
except Exception as e:
log.critical(f"Failed to init Qdrant collection, error info: {e}")
def insert_data(
self,
collection: Optional[str],
chunks: List[Chunk],
batch_size: int = 256,
*args,
**kwargs,
):
"""
Insert data into a Qdrant collection.
Args:
collection (Optional[str]): Collection name.
chunks (List[Chunk]): List of Chunk objects to insert.
batch_size (int, optional): Number of chunks to insert in each batch. Defaults to 256.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
from qdrant_client import models
try:
for i in range(0, len(chunks), batch_size):
batch_chunks = chunks[i : i + batch_size]
points = [
models.PointStruct(
id=uuid.uuid4().hex,
vector=chunk.embedding,
payload={
TEXT_PAYLOAD_KEY: chunk.text,
REFERENCE_PAYLOAD_KEY: chunk.reference,
METADATA_PAYLOAD_KEY: chunk.metadata,
},
)
for chunk in batch_chunks
]
self.client.upsert(
collection_name=collection or self.default_collection, points=points
)
except Exception as e:
log.critical(f"Failed to insert data, error info: {e}")
def search_data(
self,
collection: Optional[str],
vector: Union[np.array, List[float]],
top_k: int = 5,
*args,
**kwargs,
) -> List[RetrievalResult]:
"""
Search for similar vectors in a Qdrant collection.
Args:
            collection (Optional[str]): Collection name. If None, uses default_collection.
vector (Union[np.array, List[float]]): Query vector for similarity search.
top_k (int, optional): Number of results to return. Defaults to 5.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[RetrievalResult]: List of retrieval results containing similar vectors.
"""
try:
results = self.client.query_points(
collection_name=collection or self.default_collection,
query=vector,
limit=top_k,
with_payload=True,
with_vectors=True,
).points
return [
RetrievalResult(
embedding=result.vector,
text=result.payload.get(TEXT_PAYLOAD_KEY, ""),
reference=result.payload.get(REFERENCE_PAYLOAD_KEY, ""),
score=result.score,
metadata=result.payload.get(METADATA_PAYLOAD_KEY, {}),
)
for result in results
]
except Exception as e:
log.critical(f"Failed to search data, error info: {e}")
return []
def list_collections(self, *args, **kwargs) -> List[CollectionInfo]:
"""
List all collections in the Qdrant database.
Args:
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
Returns:
List[CollectionInfo]: List of collection information objects.
"""
collection_infos = []
try:
collections = self.client.get_collections().collections
for collection in collections:
collection_infos.append(
CollectionInfo(
collection_name=collection.name,
# Qdrant doesn't have a native description field
description=collection.name,
)
)
except Exception as e:
log.critical(f"Failed to list collections, error info: {e}")
return collection_infos
def clear_db(self, collection: Optional[str] = None, *args, **kwargs):
"""
Clear (drop) a collection from the Qdrant database.
Args:
            collection (Optional[str], optional): Collection name to drop. If None, uses default_collection.
*args: Variable length argument list.
**kwargs: Arbitrary keyword arguments.
"""
try:
self.client.delete_collection(collection_name=collection or self.default_collection)
except Exception as e:
log.warning(f"Failed to drop collection, error info: {e}")

42
docs/README.md

@ -0,0 +1,42 @@
# DeepSearcher Documentation
This directory contains the documentation for DeepSearcher, powered by MkDocs.
## Setup
1. Install MkDocs and required plugins:
```bash
pip install mkdocs mkdocs-material mkdocs-jupyter pymdown-extensions
```
2. Clone the repository:
```bash
git clone https://github.com/zilliztech/deep-searcher.git
cd deep-searcher
```
## Development
To serve the documentation locally:
```bash
mkdocs serve
```
This will start a local server at http://127.0.0.1:8000/ where you can preview the documentation.
## Building
To build the static site:
```bash
mkdocs build
```
This will generate the static site in the `site` directory.
## Deployment
The documentation is automatically deployed when changes are pushed to the main branch using GitHub Actions.

BIN
docs/assets/pic/deep-searcher-arch.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 307 KiB

BIN
docs/assets/pic/demo.gif

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 MiB

BIN
docs/assets/pic/logo-badge.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

BIN
docs/assets/pic/logo.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

126
docs/configuration/embedding.md

@ -0,0 +1,126 @@
# Embedding Model Configuration
DeepSearcher supports various embedding models to convert text into vector representations for semantic search.
## 📝 Basic Configuration
```python
config.set_provider_config("embedding", "(EmbeddingModelName)", "(Arguments dict)")
```
## 📋 Available Embedding Providers
| Provider | Description | Key Features |
|----------|-------------|--------------|
| **OpenAIEmbedding** | OpenAI's text embedding models | High quality, production-ready |
| **MilvusEmbedding** | Built-in embedding models via Pymilvus | Multiple model options |
| **VoyageEmbedding** | VoyageAI embedding models | Specialized for search |
| **BedrockEmbedding** | Amazon Bedrock embedding | AWS integration |
| **GeminiEmbedding** | Google's Gemini embedding | High performance |
| **GLMEmbedding** | ChatGLM embeddings | Chinese language support |
| **OllamaEmbedding** | Local embedding with Ollama | Self-hosted option |
| **PPIOEmbedding** | PPIO cloud embedding | Scalable solution |
| **SiliconflowEmbedding** | Siliconflow's models | Enterprise support |
| **VolcengineEmbedding** | Volcengine embedding | High throughput |
| **NovitaEmbedding** | Novita AI embedding | Cost-effective |
| **SentenceTransformerEmbedding** | Sentence Transformer embedding models | Self-hosted option |
| **IBM watsonx.ai** | Various options | IBM's Enterprise AI platform |
## 🔍 Provider Examples
### OpenAI Embedding
```python
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})
```
*Requires `OPENAI_API_KEY` environment variable*
### Milvus Built-in Embedding
```python
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "BAAI/bge-base-en-v1.5"})
```
```python
config.set_provider_config("embedding", "MilvusEmbedding", {"model": "jina-embeddings-v3"})
```
*For Jina's embedding model, requires `JINAAI_API_KEY` environment variable*
### VoyageAI Embedding
```python
config.set_provider_config("embedding", "VoyageEmbedding", {"model": "voyage-3"})
```
*Requires `VOYAGE_API_KEY` environment variable and `pip install voyageai`*
## 📚 Additional Providers
??? example "Amazon Bedrock"
```python
config.set_provider_config("embedding", "BedrockEmbedding", {"model": "amazon.titan-embed-text-v2:0"})
```
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`*
??? example "Novita AI"
```python
config.set_provider_config("embedding", "NovitaEmbedding", {"model": "baai/bge-m3"})
```
*Requires `NOVITA_API_KEY` environment variable*
??? example "Siliconflow"
```python
config.set_provider_config("embedding", "SiliconflowEmbedding", {"model": "BAAI/bge-m3"})
```
*Requires `SILICONFLOW_API_KEY` environment variable*
??? example "Volcengine"
```python
config.set_provider_config("embedding", "VolcengineEmbedding", {"model": "doubao-embedding-text-240515"})
```
*Requires `VOLCENGINE_API_KEY` environment variable*
??? example "GLM"
```python
config.set_provider_config("embedding", "GLMEmbedding", {"model": "embedding-3"})
```
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`*
??? example "Google Gemini"
```python
config.set_provider_config("embedding", "GeminiEmbedding", {"model": "text-embedding-004"})
```
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`*
??? example "Ollama"
```python
config.set_provider_config("embedding", "OllamaEmbedding", {"model": "bge-m3"})
```
*Requires local Ollama installation and `pip install ollama`*
??? example "PPIO"
```python
config.set_provider_config("embedding", "PPIOEmbedding", {"model": "baai/bge-m3"})
```
*Requires `PPIO_API_KEY` environment variable*
??? example "SentenceTransformer"
```python
config.set_provider_config("embedding", "SentenceTransformerEmbedding", {"model": "BAAI/bge-large-zh-v1.5"})
```
*Requires `pip install sentence-transformers`*
??? example "IBM WatsonX"
```python
config.set_provider_config("embedding", "WatsonXEmbedding", {"model": "ibm/slate-125m-english-rtrvr-v2"})
```
*Requires `pip install ibm-watsonx-ai`*

70
docs/configuration/file_loader.md

@ -0,0 +1,70 @@
# File Loader Configuration
DeepSearcher supports various file loaders to extract and process content from different file formats.
## 📝 Basic Configuration
```python
config.set_provider_config("file_loader", "(FileLoaderName)", "(Arguments dict)")
```
## 📋 Available File Loaders
| Loader | Description | Supported Formats |
|--------|-------------|-------------------|
| **UnstructuredLoader** | General purpose document loader with broad format support | PDF, DOCX, PPT, HTML, etc. |
| **DoclingLoader** | Document processing library with extraction capabilities | See [documentation](https://docling-project.github.io/docling/usage/supported_formats/) |
## 🔍 File Loader Options
### Unstructured
[Unstructured](https://unstructured.io/) is a powerful library for extracting content from various document formats.
```python
config.set_provider_config("file_loader", "UnstructuredLoader", {})
```
??? tip "Setup Instructions"
You can use Unstructured in two ways:
1. **With API** (recommended for production)
- Set environment variables:
- `UNSTRUCTURED_API_KEY`
- `UNSTRUCTURED_API_URL`
2. **Local Processing**
- Simply don't set the API environment variables
- Install required dependencies:
```bash
# Install core dependencies
pip install unstructured-ingest
# For all document formats
pip install "unstructured[all-docs]"
# For specific formats (e.g., PDF only)
pip install "unstructured[pdf]"
```
For more information:
- [Unstructured Documentation](https://docs.unstructured.io/ingestion/overview)
- [Installation Guide](https://docs.unstructured.io/open-source/installation/full-installation)
### Docling
[Docling](https://docling-project.github.io/docling/) provides document processing capabilities with support for multiple formats.
```python
config.set_provider_config("file_loader", "DoclingLoader", {})
```
??? tip "Setup Instructions"
1. Install Docling:
```bash
pip install docling
```
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats).

33
docs/configuration/index.md

@ -0,0 +1,33 @@
# Configuration Overview
DeepSearcher provides flexible configuration options for all its components. You can customize the following aspects of the system:
## 📋 Components
| Component | Purpose | Documentation |
|-----------|---------|---------------|
| **LLM** | Large Language Models for query processing | [LLM Configuration](llm.md) |
| **Embedding Models** | Text embedding for vector retrieval | [Embedding Models](embedding.md) |
| **Vector Database** | Storage and retrieval of vector embeddings | [Vector Database](vector_db.md) |
| **File Loader** | Loading and processing various file formats | [File Loader](file_loader.md) |
| **Web Crawler** | Gathering information from web sources | [Web Crawler](web_crawler.md) |
## 🔄 Configuration Method
DeepSearcher uses a consistent configuration approach for all components:
```python
from deepsearcher.configuration import Configuration, init_config
# Create configuration
config = Configuration()
# Set provider configurations
config.set_provider_config("[component]", "[provider]", {"option": "value"})
# Initialize with configuration
init_config(config=config)
```
For detailed configuration options for each component, please visit the corresponding documentation pages linked in the table above.
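For instance, a minimal end-to-end configuration combining providers documented on those pages (the OpenAI LLM, OpenAI embeddings, and a local Milvus Lite database) could look like this; treat the argument values as illustrative:

```python
from deepsearcher.configuration import Configuration, init_config

config = Configuration()

# Providers and arguments are illustrative; see the component pages for the full option lists
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-3-small"})
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})

init_config(config=config)
```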

192
docs/configuration/llm.md

@ -0,0 +1,192 @@
# LLM Configuration
DeepSearcher supports various Large Language Models (LLMs) for processing queries and generating responses.
## 📝 Basic Configuration
```python
config.set_provider_config("llm", "(LLMName)", "(Arguments dict)")
```
## 📋 Available LLM Providers
| Provider | Description | Key Models |
|----------|-------------|------------|
| **OpenAI** | OpenAI's API for GPT models | o1-mini, GPT-4 |
| **DeepSeek** | DeepSeek AI offering | deepseek-reasoner, coder |
| **Anthropic** | Anthropic's Claude models | claude-sonnet-4-0 |
| **Gemini** | Google's Gemini models | gemini-1.5-pro, gemini-2.0-flash |
| **XAI** | X.AI's Grok models | grok-2-latest |
| **Ollama** | Local LLM deployment | llama3, qwq, etc. |
| **SiliconFlow** | Enterprise AI platform | deepseek-r1 |
| **TogetherAI** | Multiple model options | llama-4, deepseek |
| **PPIO** | Cloud AI infrastructure | deepseek, llama |
| **Volcengine** | ByteDance LLM platform | deepseek-r1 |
| **GLM** | ChatGLM models | glm-4-plus |
| **Bedrock** | Amazon Bedrock LLMs | anthropic.claude, ai21.j2 |
| **Novita** | Novita AI models | Various options |
| **IBM watsonx.ai** | IBM Enterprise AI platform | Various options |
## 🔍 Provider Examples
### OpenAI
```python
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
```
*Requires `OPENAI_API_KEY` environment variable*
### DeepSeek
```python
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})
```
*Requires `DEEPSEEK_API_KEY` environment variable*
### IBM WatsonX
```python
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"})
```
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables*
## 📚 Additional Providers
??? example "DeepSeek from SiliconFlow"
```python
config.set_provider_config("llm", "SiliconFlow", {"model": "deepseek-ai/DeepSeek-R1"})
```
*Requires `SILICONFLOW_API_KEY` environment variable*
More details about SiliconFlow: [https://docs.siliconflow.cn/quickstart](https://docs.siliconflow.cn/quickstart)
??? example "DeepSeek from TogetherAI"
*Requires `TOGETHER_API_KEY` environment variable and `pip install together`*
For DeepSeek R1:
```python
config.set_provider_config("llm", "TogetherAI", {"model": "deepseek-ai/DeepSeek-R1"})
```
For Llama 4:
```python
config.set_provider_config("llm", "TogetherAI", {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"})
```
More details about TogetherAI: [https://www.together.ai/](https://www.together.ai/)
??? example "XAI Grok"
```python
config.set_provider_config("llm", "XAI", {"model": "grok-2-latest"})
```
*Requires `XAI_API_KEY` environment variable*
More details about XAI Grok: [https://docs.x.ai/docs/overview#featured-models](https://docs.x.ai/docs/overview#featured-models)
??? example "Claude"
```python
config.set_provider_config("llm", "Anthropic", {"model": "claude-sonnet-4-0"})
```
*Requires `ANTHROPIC_API_KEY` environment variable*
More details about Anthropic Claude: [https://docs.anthropic.com/en/home](https://docs.anthropic.com/en/home)
??? example "Google Gemini"
```python
config.set_provider_config('llm', 'Gemini', { 'model': 'gemini-2.0-flash' })
```
*Requires `GEMINI_API_KEY` environment variable and `pip install google-genai`*
More details about Gemini: [https://ai.google.dev/gemini-api/docs](https://ai.google.dev/gemini-api/docs)
??? example "DeepSeek from PPIO"
```python
config.set_provider_config("llm", "PPIO", {"model": "deepseek/deepseek-r1-turbo"})
```
*Requires `PPIO_API_KEY` environment variable*
More details about PPIO: [https://ppinfra.com/docs/get-started/quickstart.html](https://ppinfra.com/docs/get-started/quickstart.html)
??? example "Ollama"
```python
config.set_provider_config("llm", "Ollama", {"model": "qwq"})
```
Follow [these instructions](https://github.com/jmorganca/ollama) to set up and run a local Ollama instance:
1. [Download](https://ollama.ai/download) and install Ollama
2. View available models via the [model library](https://ollama.ai/library)
3. Pull models with `ollama pull <name-of-model>`
4. By default, Ollama has a REST API on [http://localhost:11434](http://localhost:11434)
??? example "Volcengine"
```python
config.set_provider_config("llm", "Volcengine", {"model": "deepseek-r1-250120"})
```
*Requires `VOLCENGINE_API_KEY` environment variable*
More details about Volcengine: [https://www.volcengine.com/docs/82379/1099455](https://www.volcengine.com/docs/82379/1099455)
??? example "GLM"
```python
config.set_provider_config("llm", "GLM", {"model": "glm-4-plus"})
```
*Requires `GLM_API_KEY` environment variable and `pip install zhipuai`*
More details about GLM: [https://bigmodel.cn/dev/welcome](https://bigmodel.cn/dev/welcome)
??? example "Amazon Bedrock"
```python
config.set_provider_config("llm", "Bedrock", {"model": "us.deepseek.r1-v1:0"})
```
*Requires `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables and `pip install boto3`*
More details about Amazon Bedrock: [https://docs.aws.amazon.com/bedrock/](https://docs.aws.amazon.com/bedrock/)
??? example "Aliyun Bailian"
```python
config.set_provider_config("llm", "OpenAI", {"model": "deepseek-r1", "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1"})
```
*Requires `OPENAI_API_KEY` environment variable*
More details about Aliyun Bailian models: [https://bailian.console.aliyun.com](https://bailian.console.aliyun.com)
??? example "IBM watsonx.ai LLM"
```python
config.set_provider_config("llm", "WatsonX", {"model": "ibm/granite-3-3-8b-instruct"})
```
With custom parameters:
```python
config.set_provider_config("llm", "WatsonX", {
"model": "ibm/granite-3-3-8b-instruct",
"max_new_tokens": 1000,
"temperature": 0.7,
"top_p": 0.9,
"top_k": 50
})
```
With space_id instead of project_id:
```python
config.set_provider_config("llm", "WatsonX", {
"model": "ibm/granite-3-3-8b-instruct""
})
```
*Requires `WATSONX_APIKEY`, `WATSONX_URL`, and `WATSONX_PROJECT_ID` environment variables and `pip install ibm-watsonx-ai`*
More details about WatsonX: [https://www.ibm.com/products/watsonx-ai/foundation-models](https://www.ibm.com/products/watsonx-ai/foundation-models)

52
docs/configuration/vector_db.md

@ -0,0 +1,52 @@
# Vector Database Configuration
DeepSearcher uses vector databases to store and retrieve document embeddings for efficient semantic search.
## 📝 Basic Configuration
```python
config.set_provider_config("vector_db", "(VectorDBName)", "(Arguments dict)")
```
Currently supported vector databases:
- Milvus (including Milvus Lite and Zilliz Cloud)
## 🔍 Milvus Configuration
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})
```
### Deployment Options
??? example "Local Storage with Milvus Lite"
Setting the `uri` as a local file (e.g., `./milvus.db`) automatically utilizes [Milvus Lite](https://milvus.io/docs/milvus_lite.md) to store all data in this file. This is the most convenient method for development and smaller datasets.
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": ""})
```
??? example "Standalone Milvus Server"
For larger datasets, you can set up a more performant Milvus server using [Docker or Kubernetes](https://milvus.io/docs/quickstart.md). In this setup, use the server URI as your `uri` parameter:
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "token": ""})
```
    You can also specify other connection parameters supported by Milvus, such as `user`, `password`, or `secure`.
```python
config.set_provider_config("vector_db", "Milvus", {"uri": "http://localhost:19530", "user": "<username>", "password":"<password>", "secure": True, "token": ""})
```
??? example "Zilliz Cloud (Managed Service)"
[Zilliz Cloud](https://zilliz.com/cloud) provides a fully managed cloud service for Milvus. To use Zilliz Cloud, adjust the `uri` and `token` according to the [Public Endpoint and API Key](https://docs.zilliz.com/docs/on-zilliz-cloud-console#free-cluster-details):
```python
config.set_provider_config("vector_db", "Milvus", {
"uri": "https://your-instance-id.api.gcp-us-west1.zillizcloud.com",
"token": "your_api_key"
})
```

97
docs/configuration/web_crawler.md

@ -0,0 +1,97 @@
# Web Crawler Configuration
DeepSearcher supports various web crawlers to collect data from websites for processing and indexing.
## 📝 Basic Configuration
```python
config.set_provider_config("web_crawler", "(WebCrawlerName)", "(Arguments dict)")
```
## 📋 Available Web Crawlers
| Crawler | Description | Key Feature |
|---------|-------------|-------------|
| **FireCrawlCrawler** | Cloud-based web crawling service | Simple API, managed service |
| **Crawl4AICrawler** | Browser automation crawler | Full JavaScript support |
| **JinaCrawler** | Content extraction service | High accuracy parsing |
| **DoclingCrawler** | Doc processing with crawling | Multiple format support |
## 🔍 Web Crawler Options
### FireCrawl
[FireCrawl](https://docs.firecrawl.dev/introduction) is a cloud-based web crawling service designed for AI applications.
**Key features:**
- Simple API
- Managed Service
- Advanced Parsing
```python
config.set_provider_config("web_crawler", "FireCrawlCrawler", {})
```
??? tip "Setup Instructions"
1. Sign up for FireCrawl and get an API key
2. Set the API key as an environment variable:
```bash
export FIRECRAWL_API_KEY="your_api_key"
```
3. For more information, see the [FireCrawl documentation](https://docs.firecrawl.dev/introduction)
### Crawl4AI
[Crawl4AI](https://docs.crawl4ai.com/) is a Python package for web crawling with browser automation capabilities.
```python
config.set_provider_config("web_crawler", "Crawl4AICrawler", {"browser_config": {"headless": True, "verbose": True}})
```
??? tip "Setup Instructions"
1. Install Crawl4AI:
```bash
pip install crawl4ai
```
2. Run the setup command:
```bash
crawl4ai-setup
```
3. For more information, see the [Crawl4AI documentation](https://docs.crawl4ai.com/)
### Jina Reader
[Jina Reader](https://jina.ai/reader/) is a service for extracting content from web pages with high accuracy.
```python
config.set_provider_config("web_crawler", "JinaCrawler", {})
```
??? tip "Setup Instructions"
1. Get a Jina API key
2. Set the API key as an environment variable:
```bash
export JINA_API_TOKEN="your_api_key"
# or
export JINAAI_API_KEY="your_api_key"
```
3. For more information, see the [Jina Reader documentation](https://jina.ai/reader/)
### Docling Crawler
[Docling](https://docling-project.github.io/docling/) provides web crawling capabilities alongside its document processing features.
```python
config.set_provider_config("web_crawler", "DoclingCrawler", {})
```
??? tip "Setup Instructions"
1. Install Docling:
```bash
pip install docling
```
2. For information on supported formats, see the [Docling documentation](https://docling-project.github.io/docling/usage/supported_formats/#supported-output-formats)

159
docs/contributing/index.md

@ -0,0 +1,159 @@
# Contributing to DeepSearcher
We welcome contributions from everyone. This document provides guidelines to make the contribution process straightforward.
## Pull Request Process
1. Fork the repository and create your branch from `master`.
2. Make your changes.
3. Run tests and linting to ensure your code meets the project's standards.
4. Update documentation if necessary.
5. Submit a pull request.
## Linting and Formatting
Keeping a consistent style for code, code comments, commit messages, and PR descriptions will greatly accelerate your PR review process.
We require you to run code linter and formatter before submitting your pull requests:
To check the coding styles:
```shell
make lint
```
To fix the coding styles:
```shell
make format
```
Our CI pipeline also runs these checks automatically on all pull requests to ensure code quality and consistency.
## Development Environment Setup with uv
DeepSearcher uses [uv](https://github.com/astral-sh/uv) as the recommended package manager. uv is a fast, reliable Python package manager and installer. The project's `pyproject.toml` is configured to work with uv, which will provide faster dependency resolution and package installation compared to traditional tools.
### Install Project in Development Mode (aka Editable Installation)
1. Install uv if you haven't already:
    Follow the [official installation instructions](https://docs.astral.sh/uv/getting-started/installation/).
2. Clone the repository and navigate to the project directory:
```shell
git clone https://github.com/zilliztech/deep-searcher.git && cd deep-searcher
```
3. Synchronize and install dependencies:
```shell
uv sync
source .venv/bin/activate
```
    `uv sync` installs all dependencies specified in the `uv.lock` file, and `source .venv/bin/activate` activates the virtual environment.
- (Optional) To install all optional dependencies:
```shell
uv sync --all-extras --dev
```
- (Optional) To install specific optional dependencies:
```shell
# Take optional `ollama` dependency for example
uv sync --extra ollama
```
For more optional dependencies, refer to the `[project.optional-dependencies]` part of `pyproject.toml` file.
### Adding Dependencies
When you need to add new dependencies to the `pyproject.toml` file, you can use the following commands:
```shell
uv add <package_name>
```
DeepSearcher uses optional dependencies to keep the default installation lightweight. Optional features can be installed using the syntax `deepsearcher[<extra>]`. To add a dependency to an optional extra, use the following command:
```shell
uv add <package_name> --optional <extra>
```
For more details, refer to the [official Managing dependencies documentation](https://docs.astral.sh/uv/concepts/projects/dependencies/).
### Dependencies Locking
For development, we use lockfiles to ensure consistent dependencies. You can use
```shell
uv lock --check
```
to verify if your lockfile is up-to-date with your project dependencies.
When you modify or add dependencies in the project, the lockfile will be automatically updated the next time you run a uv command. You can also explicitly update the lockfile using:
```shell
uv lock
```
While the environment is synced automatically, it may also be explicitly synced using uv sync:
```shell
uv sync
```
Syncing the environment manually is especially useful for ensuring your editor has the correct versions of dependencies.
For more detailed information about dependency locking and syncing, refer to the [official Locking and syncing documentation](https://docs.astral.sh/uv/concepts/projects/sync/).
## Running Tests
Before submitting your pull request, make sure to run the test suite to ensure your changes haven't introduced any regressions.
### Installing Test Dependencies
First, ensure you have pytest installed. If you haven't installed the development dependencies yet, you can do so with:
```shell
uv sync --all-extras --dev
```
This will install all development dependencies and optional dependencies including pytest and other testing tools.
### Running the Tests
To run all tests in the `tests` directory:
```shell
uv run pytest tests
```
For more verbose output that shows individual test results:
```shell
uv run pytest tests -v
```
You can also run tests for specific directories or files. For example:
```shell
# Run tests in a specific directory
uv run pytest tests/embedding
# Run tests in a specific file
uv run pytest tests/embedding/test_bedrock_embedding.py
# Run a specific test class
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding
# Run a specific test method
uv run pytest tests/embedding/test_bedrock_embedding.py::TestBedrockEmbedding::test_init_default
```
The `-v` flag (verbose mode) provides more detailed output, showing each test case and its result individually. This is particularly useful when you want to see which specific tests are passing or failing.
## Developer Certificate of Origin (DCO)
All contributions require a sign-off, acknowledging the [Developer Certificate of Origin](https://developercertificate.org/).
Add a `Signed-off-by` line to your commit message:
```text
Signed-off-by: Your Name <your.email@example.com>
```

65
docs/examples/basic_example.md

@ -0,0 +1,65 @@
# Basic Example
This example demonstrates the core functionality of DeepSearcher - loading documents and performing semantic search.
## Overview
The script performs these steps:
1. Configures DeepSearcher with default settings
2. Loads a PDF document about Milvus
3. Asks a question about Milvus and vector databases
4. Displays token usage information
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
config = Configuration() # Customize your config here
init_config(config=config)
# You should clone the milvus docs repo to your local machine first, execute:
# git clone https://github.com/milvus-io/milvus-docs.git
# Then replace the path below with the path to the milvus-docs repo on your local machine
# import glob
# all_md_files = glob.glob('xxx/milvus-docs/site/en/**/*.md', recursive=True)
# load_from_local_files(paths_or_directory=all_md_files, collection_name="milvus_docs", collection_description="All Milvus Documents")
# Hint: You can also load a single file, please execute it in the root directory of the deep searcher project
load_from_local_files(
paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
collection_name="milvus_docs",
collection_description="All Milvus Documents",
# force_new_collection=True, # If you want to drop origin collection and create a new collection every time, set force_new_collection to True
)
question = "Write a report comparing Milvus with other vector databases."
_, _, consumed_token = query(question, max_iter=1)
print(f"Consumed tokens: {consumed_token}")
```
## Running the Example
1. Make sure you have installed DeepSearcher: `pip install deepsearcher`
2. Create a data directory and add a PDF about Milvus (or use your own data)
3. Run the script: `python basic_example.py`
## Key Concepts
- **Configuration**: Using the default configuration
- **Document Loading**: Loading a single PDF file
- **Querying**: Asking a complex question requiring synthesis of information
- **Token Tracking**: Monitoring token usage from the LLM

101
docs/examples/docling.md

@ -0,0 +1,101 @@
# Docling Integration Example
This example shows how to use Docling for loading local files and crawling web content.
## Overview
The script demonstrates:
1. Configuring DeepSearcher to use Docling for both file loading and web crawling
2. Loading data from local files using Docling's document parser
3. Crawling web content from multiple sources including Markdown and PDF files
4. Querying the loaded data
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
# Suppress unnecessary logging from third-party libraries
logging.getLogger("httpx").setLevel(logging.WARNING)
def main():
    # Step 1: Initialize configuration
    config = Configuration()

    # Configure Vector Database and Docling providers
    config.set_provider_config("vector_db", "Milvus", {})
    config.set_provider_config("file_loader", "DoclingLoader", {})
    config.set_provider_config("web_crawler", "DoclingCrawler", {})

    # Apply the configuration
    init_config(config)

    # Step 2a: Load data from a local file using DoclingLoader
    local_file = "your_local_file_or_directory"
    local_collection_name = "DoclingLocalFiles"
    local_collection_description = "Milvus Documents loaded using DoclingLoader"

    print("\n=== Loading local files using DoclingLoader ===")
    try:
        load_from_local_files(
            paths_or_directory=local_file,
            collection_name=local_collection_name,
            collection_description=local_collection_description,
            force_new_collection=True,
        )
        print(f"Successfully loaded: {local_file}")
    except ValueError as e:
        print(f"Validation error: {str(e)}")
    except Exception as e:
        print(f"Error: {str(e)}")
    print("Successfully loaded all local files")

    # Step 2b: Crawl URLs using DoclingCrawler
    urls = [
        # Markdown documentation files
        "https://milvus.io/docs/quickstart.md",
        "https://milvus.io/docs/overview.md",
        # PDF example - can handle various URL formats
        "https://arxiv.org/pdf/2408.09869",
    ]
    web_collection_name = "DoclingWebCrawl"
    web_collection_description = "Milvus Documentation crawled using DoclingCrawler"

    print("\n=== Crawling web pages using DoclingCrawler ===")
    load_from_website(
        urls=urls,
        collection_name=web_collection_name,
        collection_description=web_collection_description,
        force_new_collection=True,
    )
    print("Successfully crawled all URLs")

    # Step 3: Query the loaded data
    question = "What is Milvus?"
    result = query(question)


if __name__ == "__main__":
    main()
```
## Running the Example
1. Install DeepSearcher and Docling: `pip install deepsearcher docling`
2. Replace `your_local_file_or_directory` with your actual file/directory path
3. Run the script: `python load_and_crawl_using_docling.py`
## Key Concepts
- **Multiple Providers**: Configuring both file loader and web crawler to use Docling
- **Local Files**: Loading documents from your local filesystem
- **Web Crawling**: Retrieving content from multiple web URLs with different formats
- **Error Handling**: Graceful error handling for loading operations

82
docs/examples/firecrawl.md

@ -0,0 +1,82 @@
# FireCrawl Integration Example
This example demonstrates how to use FireCrawl with DeepSearcher to crawl and extract content from websites.
## Overview
FireCrawl is a specialized web crawling service designed for AI applications. This example shows:
1. Setting up FireCrawl with DeepSearcher
2. Configuring API keys for the service
3. Crawling a website and extracting content
4. Querying the extracted content
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_website
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
# Suppress unnecessary logging from third-party libraries
logging.getLogger("httpx").setLevel(logging.WARNING)
# Set API keys (ensure these are set securely in real applications)
os.environ['OPENAI_API_KEY'] = 'sk-***************'
os.environ['FIRECRAWL_API_KEY'] = 'fc-***************'
def main():
    # Step 1: Initialize configuration
    config = Configuration()

    # Set up Vector Database (Milvus) and Web Crawler (FireCrawlCrawler)
    config.set_provider_config("vector_db", "Milvus", {})
    config.set_provider_config("web_crawler", "FireCrawlCrawler", {})

    # Apply the configuration
    init_config(config)

    # Step 2: Load data from a website into Milvus
    website_url = "https://example.com"  # Replace with your target website
    collection_name = "FireCrawl"
    collection_description = "All Milvus Documents"

    # Crawl a single webpage
    load_from_website(urls=website_url, collection_name=collection_name, collection_description=collection_description)
    # FireCrawl only: crawl multiple webpages by setting max_depth, limit, and allow_backward_links
    # load_from_website(urls=website_url, max_depth=2, limit=20, allow_backward_links=True, collection_name=collection_name, collection_description=collection_description)

    # Step 3: Query the loaded data
    question = "What is Milvus?"  # Replace with your actual question
    result = query(question)


if __name__ == "__main__":
    main()
```
## Running the Example
1. Install DeepSearcher: `pip install deepsearcher`
2. Sign up for a FireCrawl API key at [firecrawl.dev](https://docs.firecrawl.dev/introduction)
3. Replace the placeholder API keys with your actual keys
4. Change the `website_url` to the website you want to crawl
5. Run the script: `python load_website_using_firecrawl.py`
## Advanced Crawling Options
FireCrawl provides several advanced options for crawling:
- `max_depth`: Control how many links deep the crawler should go
- `limit`: Set a maximum number of pages to crawl
- `allow_backward_links`: Allow the crawler to navigate to parent/sibling pages
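For instance, a minimal sketch that combines these options in a single call (the URL, depth, and page limit below are illustrative values only):

```python
from deepsearcher.offline_loading import load_from_website

# Crawl up to 20 pages, follow links up to 2 levels deep,
# and allow navigation back to parent/sibling pages
load_from_website(
    urls="https://example.com",
    max_depth=2,
    limit=20,
    allow_backward_links=True,
    collection_name="FireCrawl",
    collection_description="All Milvus Documents",
)
```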
## Key Concepts
- **Web Crawling**: Extracting content from websites
- **Depth Control**: Managing how deep the crawler navigates
- **URL Processing**: Handling multiple pages from a single starting point
- **Vector Storage**: Storing the crawled content in a vector database for search

15
docs/examples/index.md

@ -0,0 +1,15 @@
# Usage Examples
DeepSearcher provides several example scripts to help you get started quickly. These examples demonstrate different ways to use DeepSearcher for various use cases.
## 📋 Available Examples
| Example | Description | Key Features |
|---------|-------------|--------------|
| [Basic Example](basic_example.md) | Simple example showing core functionality | Loading PDFs, querying |
| [Docling Integration](docling.md) | Using Docling for file loading and web crawling | Multiple sources, local and web |
| [Unstructured Integration](unstructured.md) | Using Unstructured for parsing documents | API and local processing |
| [FireCrawl Integration](firecrawl.md) | Web crawling with FireCrawl | Website data extraction |
| [Oracle Setup](oracle.md) | Advanced configuration with Oracle | Path setup, token tracking |
Click on any example to see detailed code and explanations.

70
docs/examples/oracle.md

@ -0,0 +1,70 @@
# Oracle Example
This example demonstrates an advanced setup using path manipulation and detailed token tracking.
## Overview
This example shows:
1. Setting up Python path for importing from the parent directory
2. Initializing DeepSearcher with default configuration
3. Loading a PDF document and creating a vector database
4. Performing a complex query with full result and token tracking
5. Optional token consumption monitoring
## Code Example
```python
import sys, os
from pathlib import Path
script_directory = Path(__file__).resolve().parent.parent
sys.path.append(os.path.abspath(script_directory))
import logging
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
# Customize your config here
from deepsearcher.configuration import Configuration, init_config
config = Configuration()
init_config(config=config)
# Load your local data
# Hint: You can load from a directory or a single file; run this script from the root directory of the deep-searcher project
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(
    paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
    collection_name="milvus_docs",
    collection_description="All Milvus Documents",
    # force_new_collection=True,  # Set to True to drop the existing collection and create a new one on every run
)

# Query
from deepsearcher.online_query import query
question = "Write a report comparing Milvus with other vector databases."
answer, retrieved_results, consumed_token = query(question)
print(answer)

# Get consumed tokens: roughly 25k-30k tokens when using the OpenAI gpt-4o model
# print(f"Consumed tokens: {consumed_token}")
```
## Running the Example
1. Install DeepSearcher: `pip install deepsearcher`
2. Make sure you have the data directory with "WhatisMilvus.pdf" (or change the path)
3. Run the script: `python basic_example_oracle.py`
## Key Concepts
- **Path Management**: Setting up Python path to import from parent directory
- **Query Unpacking**: Getting full result details (answer, retrieved context, and tokens)
- **Complex Querying**: Asking for a comparative analysis that requires synthesis
- **Token Economy**: Monitoring token usage for cost optimization

76
docs/examples/unstructured.md

@ -0,0 +1,76 @@
# Unstructured Integration Example
This example demonstrates how to use the Unstructured library with DeepSearcher for advanced document parsing.
## Overview
Unstructured is a powerful document processing library that can extract content from various document formats. This example shows:
1. Setting up Unstructured with DeepSearcher
2. Configuring the Unstructured API keys (optional)
3. Loading documents with Unstructured's parser
4. Querying the extracted content
## Code Example
```python
import logging
import os
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
# Suppress unnecessary logging from third-party libraries
logging.getLogger("httpx").setLevel(logging.WARNING)
# (Optional) Set API keys (ensure these are set securely in real applications)
os.environ['UNSTRUCTURED_API_KEY'] = '***************'
os.environ['UNSTRUCTURED_API_URL'] = '***************'
def main():
    # Step 1: Initialize configuration
    config = Configuration()

    # Configure Vector Database (Milvus) and File Loader (UnstructuredLoader)
    config.set_provider_config("vector_db", "Milvus", {})
    config.set_provider_config("file_loader", "UnstructuredLoader", {})

    # Apply the configuration
    init_config(config)

    # Step 2: Load data from a local file or directory into Milvus
    input_file = "your_local_file_or_directory"  # Replace with your actual file path
    collection_name = "Unstructured"
    collection_description = "All Milvus Documents"
    load_from_local_files(paths_or_directory=input_file, collection_name=collection_name, collection_description=collection_description)

    # Step 3: Query the loaded data
    question = "What is Milvus?"  # Replace with your actual question
    result = query(question)


if __name__ == "__main__":
    main()
```
## Running the Example
1. Install DeepSearcher with Unstructured support: `pip install deepsearcher "unstructured[all-docs]"`
2. (Optional) Sign up for the Unstructured API at [unstructured.io](https://unstructured.io) if you want to use their cloud service
3. Replace `your_local_file_or_directory` with your own document file path or directory
4. Run the script: `python load_local_file_using_unstructured.py`
## Unstructured Options
You can use Unstructured in two modes:
1. **API Mode**: Set the environment variables `UNSTRUCTURED_API_KEY` and `UNSTRUCTURED_API_URL` to use their cloud service
2. **Local Mode**: Don't set the environment variables, and Unstructured will process documents locally on your machine
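A minimal sketch of switching between the two modes; the values shown are placeholders, and for local mode you simply leave both variables unset:

```python
import os

# API mode: route parsing through the Unstructured cloud service
os.environ["UNSTRUCTURED_API_KEY"] = "your-api-key"           # placeholder
os.environ["UNSTRUCTURED_API_URL"] = "your-api-endpoint-url"  # placeholder

# Local mode: remove (or never set) both variables so documents are parsed on your machine
# os.environ.pop("UNSTRUCTURED_API_KEY", None)
# os.environ.pop("UNSTRUCTURED_API_URL", None)
```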
## Key Concepts
- **Document Processing**: Advanced document parsing for various formats
- **API/Local Options**: Flexibility in deployment based on your needs
- **Integration**: Seamless integration with DeepSearcher's vector database and query capabilities

73
docs/faq/index.md

@ -0,0 +1,73 @@
# Frequently Asked Questions
## 🔍 Common Issues and Solutions
---
### 💬 Q1: Why does parsing the LLM output format fail / How do I select the right LLM?
<div class="faq-answer">
<p><strong>Solution:</strong> Small language models often struggle to follow prompts and generate responses in the expected format. For better results, we recommend using large reasoning models such as:</p>
<ul>
<li>DeepSeek-R1 671B</li>
<li>OpenAI o-series models</li>
<li>Claude 3.7 Sonnet</li>
</ul>
<p>These models provide superior reasoning capabilities and are more likely to produce correctly formatted outputs.</p>
</div>
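For example, a minimal sketch that switches the LLM to a larger reasoning model (the provider and model names mirror the configuration examples elsewhere in these docs; substitute any model you have access to):

```python
from deepsearcher.configuration import Configuration, init_config

config = Configuration()
# Use a large reasoning model instead of a small instruct model
config.set_provider_config("llm", "DeepSeek", {"model": "deepseek-reasoner"})
init_config(config=config)
```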
---
### 🌐 Q2: "We couldn't connect to 'https://huggingface.co'" error
<div class="faq-answer">
<p><strong>Error Message:</strong></p>
<div class="error-message">
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like GPTCache/paraphrase-albert-small-v2 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
</div>
<p><strong>Solution:</strong> This issue is typically caused by network access problems to Hugging Face. Try these solutions:</p>
<details>
<summary><strong>Network Issue? Try Using a Mirror</strong></summary>
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
</details>
<details>
<summary><strong>Permission Issue? Set Up a Personal Token</strong></summary>
```bash
export HUGGING_FACE_HUB_TOKEN=xxxx
```
</details>
</div>
---
### 📓 Q3: DeepSearcher doesn't run in Jupyter notebook
<div class="faq-answer">
<p><strong>Solution:</strong> This is a common issue with asyncio in Jupyter notebooks. Install <code>nest_asyncio</code> and add the following code to the top of your notebook:</p>
<div class="code-steps">
<p><strong>Step 1:</strong> Install the required package</p>
```bash
pip install nest_asyncio
```
<p><strong>Step 2:</strong> Add these lines to the beginning of your notebook</p>
```python
import nest_asyncio
nest_asyncio.apply()
```
</div>
</div>

8
docs/future_plans.md

@ -0,0 +1,8 @@
# Future Plans
- Enhance web crawling functionality
- Support more vector databases (e.g., FAISS...)
- Add support for additional large models
- Provide RESTful API interface (**DONE**)
We welcome contributions! Star & Fork the project and help us build a more powerful DeepSearcher! 🎯

45
docs/index.md

@ -0,0 +1,45 @@
# 🔍 DeepSearcher
![DeepSearcher](./assets/pic/logo.png)
<div align="center">
<a href="https://opensource.org/licenses/Apache-2.0">
<img height="28" src="https://img.shields.io/badge/License-Apache%202.0-blue.svg?style=flat" alt="License">
</a>
<a href="https://twitter.com/zilliz_universe">
<img height="28" src="https://img.shields.io/badge/Follow-%40Zilliz-1DA1F2?style=flat&logo=twitter" alt="Twitter">
</a>
<a href="https://discord.gg/mKc3R95yE5">
<img height="28" src="https://img.shields.io/badge/Discord-Join%20Chat-5865F2?style=flat&logo=discord&logoColor=white" alt="Discord">
</a>
</div>
---
## ✨ Overview
DeepSearcher combines cutting-edge LLMs (OpenAI o1, o3-mini, DeepSeek, Grok 3, Claude 4 Sonnet, Llama 4, QwQ, etc.) and Vector Databases (Milvus, Zilliz Cloud etc.) to perform search, evaluation, and reasoning based on private data, providing highly accurate answers and comprehensive reports.
> **Perfect for:** Enterprise knowledge management, intelligent Q&A systems, and information retrieval scenarios.
![Architecture](./assets/pic/deep-searcher-arch.png)
## 🚀 Key Features
| Feature | Description |
|---------|-------------|
| 🔒 **Private Data Search** | Maximizes utilization of enterprise internal data while ensuring data security. When necessary, integrates online content for more accurate answers. |
| 🗄️ **Vector Database Management** | Supports Milvus and other vector databases, allowing data partitioning for efficient retrieval. |
| 🧩 **Flexible Embedding Options** | Compatible with multiple embedding models for optimal selection based on your needs. |
| 🤖 **Multiple LLM Support** | Supports DeepSeek, OpenAI, and other large models for intelligent Q&A and content generation. |
| 📄 **Document Loader** | Supports loading local files and crawling web content from URLs. |
## 🎬 Demo
![Demo](./assets/pic/demo.gif)

64
docs/installation/development.md

@ -0,0 +1,64 @@
# 🛠️ Development Mode Installation
This guide is for contributors who want to modify DeepSearcher's code or develop new features.
## 📋 Prerequisites
- Python 3.10 or higher
- git
- [uv](https://github.com/astral-sh/uv) package manager (recommended for faster installation)
## 🔄 Installation Steps
### Step 1: Install uv (Recommended)
[uv](https://github.com/astral-sh/uv) is a faster alternative to pip for Python package management.
=== "Using pip"
```bash
pip install uv
```
=== "Using curl (Unix/macOS)"
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
=== "Using PowerShell (Windows)"
```powershell
irm https://astral.sh/uv/install.ps1 | iex
```
For more options, see the [official uv installation guide](https://docs.astral.sh/uv/getting-started/installation/).
### Step 2: Clone the repository
```bash
git clone https://github.com/zilliztech/deep-searcher.git
cd deep-searcher
```
### Step 3: Set up the development environment
=== "Using uv (Recommended)"
```bash
uv sync
source .venv/bin/activate
```
=== "Using pip"
```bash
python -m venv .venv
source .venv/bin/activate # On Windows: .venv\Scripts\activate
pip install -e ".[dev,all]"
```
## 🧪 Running Tests
```bash
pytest tests/
```
## 📚 Additional Resources
For more detailed development setup instructions, including contribution guidelines, code style, and testing procedures, please refer to the [CONTRIBUTING.md](https://github.com/zilliztech/deep-searcher/blob/main/CONTRIBUTING.md) file in the repository.

29
docs/installation/index.md

@ -0,0 +1,29 @@
# 🔧 Installation
DeepSearcher offers multiple installation methods to suit different user needs.
## 📋 Installation Options
| Method | Best For | Description |
|--------|----------|-------------|
| [📦 Installation via pip](pip.md) | Most users | Quick and easy installation using pip package manager |
| [🛠️ Development mode](development.md) | Contributors | Setup for those who want to modify the code or contribute |
## 🚀 Quick Start
Once installed, you can verify your installation:
```python
from deepsearcher.configuration import Configuration
from deepsearcher.online_query import query
# Initialize with default configuration
config = Configuration()
print("DeepSearcher installed successfully!")
```
## 💻 System Requirements
- Python 3.10 or higher
- 4GB RAM minimum (8GB+ recommended)
- Internet connection for downloading models and dependencies

52
docs/installation/pip.md

@ -0,0 +1,52 @@
# 📦 Installation via pip
This method is recommended for most users who want to use DeepSearcher without modifying its source code.
## 📋 Prerequisites
- Python 3.10 or higher
- pip package manager (included with Python)
- Virtual environment tool (recommended)
## 🔄 Step-by-Step Installation
### Step 1: Create a virtual environment
```bash
python -m venv .venv
```
### Step 2: Activate the virtual environment
=== "Linux/macOS"
```bash
source .venv/bin/activate
```
=== "Windows"
```bash
.venv\Scripts\activate
```
### Step 3: Install DeepSearcher
```bash
pip install deepsearcher
```
## 🧩 Optional Dependencies
DeepSearcher supports various integrations through optional dependencies.
| Integration | Command | Description |
|-------------|---------|-------------|
| Ollama | `pip install "deepsearcher[ollama]"` | For local LLM deployment |
| All extras | `pip install "deepsearcher[all]"` | Installs all optional dependencies |
## ✅ Verify Installation
```python
# Simple verification
from deepsearcher import __version__
print(f"DeepSearcher version: {__version__}")
```

75
docs/integrations/index.md

@ -0,0 +1,75 @@
# Module Support
DeepSearcher supports various integration modules including embedding models, large language models, document loaders and vector databases.
## 📊 Overview
| Module Type | Count | Description |
|-------------|-------|-------------|
| [Embedding Models](#embedding-models) | 7+ | Text vectorization tools |
| [Large Language Models](#llm-support) | 11+ | Query processing and text generation |
| [Document Loaders](#document-loader) | 5+ | Parse and process documents in various formats |
| [Vector Databases](#vector-database-support) | 2+ | Store and retrieve vector data |
## 🔢 Embedding Models {#embedding-models}
Support for various embedding models to convert text into vector representations for semantic search.
| Provider | Required Environment Variables | Features |
|----------|--------------------------------|---------|
| **[Open-source models](https://milvus.io/docs/embeddings.md)** | None | Locally runnable open-source models |
| **[OpenAI](https://platform.openai.com/docs/guides/embeddings/use-cases)** | `OPENAI_API_KEY` | High-quality embeddings, easy to use |
| **[VoyageAI](https://docs.voyageai.com/embeddings/)** | `VOYAGE_API_KEY` | Embeddings optimized for retrieval |
| **[Amazon Bedrock](https://docs.aws.amazon.com/bedrock/)** | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` | AWS integration, enterprise-grade |
| **[FastEmbed](https://qdrant.github.io/fastembed/)** | None | Fast lightweight embeddings |
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Flexible cloud embeddings |
| **[Novita AI](https://novita.ai/docs/api-reference/model-apis-llm-create-embeddings)** | `NOVITA_API_KEY` | Rich model selection |
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmembedding)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform |
## 🧠 Large Language Models {#llm-support}
Support for various large language models (LLMs) to process queries and generate responses.
| Provider | Required Environment Variables | Features |
|----------|--------------------------------|---------|
| **[OpenAI](https://platform.openai.com/docs/models)** | `OPENAI_API_KEY` | GPT model family |
| **[DeepSeek](https://api-docs.deepseek.com/)** | `DEEPSEEK_API_KEY` | Powerful reasoning capabilities |
| **[XAI Grok](https://x.ai/blog/grok-3)** | `XAI_API_KEY` | Real-time knowledge and humor |
| **[Anthropic Claude](https://docs.anthropic.com/en/home)** | `ANTHROPIC_API_KEY` | Excellent long-context understanding |
| **[SiliconFlow](https://docs.siliconflow.cn/en/userguide/introduction)** | `SILICONFLOW_API_KEY` | Enterprise inference service |
| **[PPIO](https://ppinfra.com/model-api/product/llm-api)** | `PPIO_API_KEY` | Diverse model support |
| **[TogetherAI](https://docs.together.ai/docs/introduction)** | `TOGETHER_API_KEY` | Wide range of open-source models |
| **[Google Gemini](https://ai.google.dev/gemini-api/docs)** | `GEMINI_API_KEY` | Google's multimodal models |
| **[SambaNova](https://sambanova.ai/)** | `SAMBANOVA_API_KEY` | High-performance AI platform |
| **[Ollama](https://ollama.com/)** | None | Local LLM deployment |
| **[Novita AI](https://novita.ai/docs/guides/introduction)** | `NOVITA_API_KEY` | Diverse AI services |
| **[IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai/foundation-models#ibmfm)** | `WATSONX_APIKEY`, `WATSONX_URL`, `WATSONX_PROJECT_ID` | IBM's Enterprise AI platform |
## 📄 Document Loader {#document-loader}
Support for loading and processing documents from various sources.
### Local File Loaders
| Loader | Supported Formats | Required Environment Variables |
|--------|-------------------|--------------------------------|
| **Built-in Loader** | PDF, TXT, MD | None |
| **[Unstructured](https://unstructured.io/)** | Multiple document formats | `UNSTRUCTURED_API_KEY`, `UNSTRUCTURED_API_URL` (optional) |
### Web Crawlers
| Crawler | Description | Required Environment Variables/Setup |
|---------|-------------|--------------------------------------|
| **[FireCrawl](https://docs.firecrawl.dev/introduction)** | Crawler designed for AI applications | `FIRECRAWL_API_KEY` |
| **[Jina Reader](https://jina.ai/reader/)** | High-accuracy web content extraction | `JINA_API_TOKEN` |
| **[Crawl4AI](https://docs.crawl4ai.com/)** | Browser automation crawler | Run `crawl4ai-setup` for first-time use |
## 💾 Vector Database Support {#vector-database-support}
Support for various vector databases for efficient storage and retrieval of embeddings.
| Database | Description | Features |
|----------|-------------|----------|
| **[Milvus](https://milvus.io/)** | Open-source vector database | High-performance, scalable |
| **[Zilliz Cloud](https://www.zilliz.com/)** | Managed Milvus service | Fully managed, maintenance-free |
| **[Qdrant](https://qdrant.tech/)** | Vector similarity search engine | Simple, efficient |
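As a minimal sketch of how these modules fit together, the snippet below selects one provider from each table above; the provider names and config keys follow the examples elsewhere in these docs, and the API keys are read from the environment variables listed in the tables:

```python
import os
from deepsearcher.configuration import Configuration, init_config

# API keys are read from the environment variables listed above (values are placeholders)
os.environ.setdefault("OPENAI_API_KEY", "sk-***************")
os.environ.setdefault("FIRECRAWL_API_KEY", "fc-***************")

config = Configuration()
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"})
config.set_provider_config("web_crawler", "FireCrawlCrawler", {})
config.set_provider_config("vector_db", "Milvus", {"uri": "./milvus.db", "token": "root:Milvus"})
init_config(config=config)
```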

0
docs/overrides/.gitkeep

78
docs/stylesheets/extra.css

@ -0,0 +1,78 @@
/* Add your custom CSS here */
/* FAQ Styling */
.faq-answer {
background-color: #f8f9fa;
border-left: 4px solid #5c6bc0;
padding: 15px 20px;
margin-bottom: 20px;
border-radius: 4px;
}
.error-message {
background-color: #ffebee;
border-left: 4px solid #f44336;
padding: 10px 15px;
margin: 10px 0;
font-family: monospace;
white-space: pre-wrap;
font-size: 0.9em;
border-radius: 4px;
}
.code-steps {
margin: 15px 0;
}
.code-steps p {
margin-bottom: 5px;
}
details {
margin-bottom: 10px;
padding: 10px;
background-color: #e3f2fd;
border-radius: 4px;
}
summary {
cursor: pointer;
padding: 8px 0;
}
details[open] summary {
margin-bottom: 10px;
}
h3 {
margin-top: 30px;
margin-bottom: 15px;
}
/* Add smooth transition for collapsible sections */
details summary {
transition: margin 0.3s ease;
}
/* Styling for code blocks within FAQ */
.faq-answer pre {
border-radius: 4px;
margin: 10px 0;
}
/* Add styling for list items */
.faq-answer ul {
padding-left: 25px;
}
.faq-answer ul li {
margin: 5px 0;
}
/* Add horizontal rule styling */
hr {
border: 0;
height: 1px;
background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.1), rgba(0, 0, 0, 0));
margin: 25px 0;
}

63
docs/usage/cli.md

@ -0,0 +1,63 @@
# 💻 Command Line Interface
DeepSearcher provides a convenient command line interface for loading data and querying.
## 📥 Loading Data
Load data from files or URLs:
```shell
deepsearcher load "your_local_path_or_url"
```
Load into a specific collection:
```shell
deepsearcher load "your_local_path_or_url" --collection_name "your_collection_name" --collection_desc "your_collection_description"
```
### Examples
#### Loading from local files:
```shell
# Load a single file
deepsearcher load "/path/to/your/local/file.pdf"
# Load multiple files at once
deepsearcher load "/path/to/your/local/file1.pdf" "/path/to/your/local/file2.md"
```
#### Loading from URL:
> **Note:** Set `FIRECRAWL_API_KEY` in your environment variables. See [FireCrawl documentation](https://docs.firecrawl.dev/introduction) for more details.
```shell
deepsearcher load "https://www.wikiwand.com/en/articles/DeepSeek"
```
## 🔍 Querying Data
Query your loaded data:
```shell
deepsearcher query "Write a report about xxx."
```
## ❓ Help Commands
Get general help information:
```shell
deepsearcher --help
```
Get help for specific subcommands:
```shell
# Help for load command
deepsearcher load --help
# Help for query command
deepsearcher query --help
```

73
docs/usage/deployment.md

@ -0,0 +1,73 @@
# 🌐 Deployment
This guide explains how to deploy DeepSearcher as a web service.
## ⚙️ Configure Modules
You can configure all arguments by modifying the configuration file:
```yaml
# config.yaml - https://github.com/zilliztech/deep-searcher/blob/main/config.yaml
llm:
provider: "OpenAI"
api_key: "your_openai_api_key_here"
# Additional configuration options...
```
> **Important:** Set your OpenAI API key in the `api_key` field of the `llm` section in the YAML file, or export `OPENAI_API_KEY` as an environment variable.
## 🚀 Start Service
The main script starts a FastAPI service at the default address `localhost:8000`:
```shell
$ python main.py
```
Once started, you should see output indicating the service is running successfully.
## 🔍 Access via Browser
You can access the web service through your browser:
1. Open your browser and navigate to [http://localhost:8000/docs](http://localhost:8000/docs)
2. The Swagger UI will display all available API endpoints
3. Click the "Try it out" button on any endpoint to interact with it
4. Fill in the required parameters and execute the request
This interactive documentation makes it easy to test and use all DeepSearcher API functionality.
## 🐳 Docker Deployment
You can also deploy DeepSearcher using Docker for easier environment setup and management.
### Build Docker Image
To build the Docker image, run the following command from the project root directory:
```shell
docker build -t deepsearcher:latest .
```
This command builds a Docker image using the Dockerfile in the current directory and tags it as `deepsearcher:latest`.
### Run Docker Container
Once the image is built, you can run it as a container:
```shell
docker run -p 8000:8000 \
-e OPENAI_API_KEY=your_openai_api_key \
-v $(pwd)/data:/app/data \
-v $(pwd)/logs:/app/logs \
-v $(pwd)/deepsearcher/config.yaml:/app/deepsearcher/config.yaml \
deepsearcher:latest
```
This command:
- Maps port 8000 from the container to port 8000 on your host
- Sets the `OPENAI_API_KEY` environment variable
- Mounts the local `data`, `logs`, and configuration file to the container
- Runs the previously built `deepsearcher:latest` image
> **Note:** Replace `your_openai_api_key` with your actual OpenAI API key, or set any other environment variables required for your configuration.

13
docs/usage/index.md

@ -0,0 +1,13 @@
# 📚 Usage Guide
DeepSearcher provides multiple ways to use the system, including Python API, command line interface, and web service deployment.
## 🔍 Usage Overview
| Guide | Description |
|-------|-------------|
| [🚀 Quick Start](quick_start.md) | Quick start guide for Python API integration |
| [💻 Command Line Interface](cli.md) | Instructions for using the command line interface |
| [🌐 Deployment](deployment.md) | Guide for deploying as a web service |
Choose the method that best suits your needs and follow the instructions on the corresponding page.

42
docs/usage/quick_start.md

@ -0,0 +1,42 @@
# 🚀 Quick Start
## Prerequisites
✅ Before you begin, prepare your `OPENAI_API_KEY` in your environment variables. If you change the LLM in the configuration, make sure to prepare the corresponding API key.
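If you prefer to set the key from Python rather than your shell, a minimal sketch (the key value shown is a placeholder):

```python
import os

os.environ["OPENAI_API_KEY"] = "sk-***************"  # placeholder, replace with your real key
```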
## Basic Usage
```python
# Import configuration modules
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query
# Initialize configuration
config = Configuration()
# Customize your config here
# (See the Configuration Details section below for more options)
config.set_provider_config("llm", "OpenAI", {"model": "o1-mini"})
config.set_provider_config("embedding", "OpenAIEmbedding", {"model": "text-embedding-ada-002"})
init_config(config=config)
# Load data from local files
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(paths_or_directory=your_local_path)
# (Optional) Load data from websites
# Requires FIRECRAWL_API_KEY environment variable
from deepsearcher.offline_loading import load_from_website
load_from_website(urls=website_url)
# Query your data
result = query("Write a report about xxx.") # Replace with your question
print(result)
```
## Next Steps
After completing this quick start, you might want to explore:
- [Command Line Interface](cli.md) for non-programmatic usage
- [Deployment](deployment.md) for setting up a web service

53
evaluation/README.md

@ -0,0 +1,53 @@
# Evaluation of DeepSearcher
## Introduction
DeepSearcher excels at answering complex queries. This directory provides scripts to evaluate the performance of DeepSearcher against naive RAG.
The evaluation is based on the Recall metric:
> Recall@K: The percentage of relevant documents that are retrieved among the top K documents returned by the search engine.
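Concretely, for a single query this metric can be computed as in the following sketch (the function and sample values are illustrative, not taken from the evaluation code):

```python
def recall_at_k(gold_titles: set[str], retrieved_titles: list[str], k: int) -> float:
    """Fraction of gold documents that appear among the top-k retrieved titles."""
    top_k = retrieved_titles[:k]
    return sum(1 for title in gold_titles if title in top_k) / len(gold_titles)

# Example: 1 of the 2 gold documents appears in the top 2 results -> Recall@2 = 0.5
print(recall_at_k({"Milvus", "FAISS"}, ["Milvus", "Qdrant", "FAISS"], k=2))
```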
Currently, we support the multi-hop question answering dataset [2WikiMultiHopQA](https://paperswithcode.com/dataset/2wikimultihopqa). More datasets will be added in the future.
## Evaluation Script
The main evaluation script is `evaluate.py`.
You can provide a config file, say `eval_config.yaml`, to specify the LLM, embedding model, and other providers and parameters.
```shell
python evaluate.py \
--dataset 2wikimultihopqa \
--config_yaml ./eval_config.yaml \
--pre_num 5 \
--output_dir ./eval_output
```
`pre_num` is the number of samples to evaluate. More samples give more accurate results, but they also take more time and consume more LLM API tokens.
Once the dataset has been loaded into the vector database on the first run, you can pass the `--skip_load` flag to avoid reloading it on subsequent runs.
For details on all arguments, run
```shell
python evaluate.py --help
```
## Evaluation Results
We conducted tests using the widely used 2WikiMultiHopQA dataset. (Because testing consumes a large number of API tokens, we only evaluated the first 50 samples. This may introduce some fluctuation compared to testing the entire dataset, but it still roughly reflects the overall performance landscape.)
### Recall Comparison between Naive RAG and DeepSearcher with Different Models
With Max Iterations on the horizontal axis and Recall on the vertical axis, the following chart compares the recall rates of Deep Searcher and naive RAG.
![](plot_results/max_iter_vs_recall.png)
#### Performance Improvement with Iterations
As the number of Max Iterations increases, the recall of Deep Searcher improves significantly, and every Deep Searcher model scores markedly higher than naive RAG.
#### Diminishing Returns
However, the marginal gains shrink as the number of iterations grows, suggesting a point beyond which additional reflection iterations yield little further improvement.
#### Model Performance Comparison
Claude-3-7-sonnet (red line) demonstrates superior performance throughout, achieving nearly perfect recall at 7 iterations. Most models show significant improvement as iterations increase, with the steepest gains occurring between 2-4 iterations. Models like o1-mini (yellow) and deepseek-r1 (green) exhibit strong performance at higher iteration counts. Since our sample number for testing is limited, the results of each test may vary somewhat.
Overall, reasoning models generally perform better than non-reasoning models.
#### Limitations of Non-Reasoning Models
Additionally, in our tests, weaker and smaller non-reasoning models sometimes failed to complete the entire agent query pipeline, due to their inadequate instruction-following capabilities.
### Token Consumption
We plotted the graph below with the number of iterations on the horizontal axis and the average token consumption per sample on the vertical axis:
![](plot_results/max_iter_vs_avg_token_usage.png)
It is evident that as the number of iterations increases, the token consumption of Deep Searcher rises linearly. Based on this approximate token consumption, you can check the pricing on your model provider's website to estimate the cost of running evaluations with different iteration settings.
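As a back-of-the-envelope sketch, you can turn the per-sample token usage into a rough cost estimate; the token count and price below are placeholders, so substitute values read from the plot and from your provider's pricing page:

```python
avg_tokens_per_sample = 30_000   # placeholder: read from the plot for your chosen max_iter
price_per_million_tokens = 5.0   # placeholder: USD per 1M tokens, from your provider's pricing page
num_samples = 50

estimated_cost = avg_tokens_per_sample * num_samples * price_per_million_tokens / 1_000_000
print(f"Estimated evaluation cost: ${estimated_cost:.2f}")  # $7.50 with these placeholder numbers
```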

119
evaluation/eval_config.yaml

@ -0,0 +1,119 @@
provide_settings:
llm:
provider: "OpenAI"
config:
model: "o1-mini"
# api_key: "sk-xxxx" # Uncomment to override the `OPENAI_API_KEY` set in the environment variable
# base_url: ""
# provider: "DeepSeek"
# config:
# model: "deepseek-reasoner"
## api_key: "sk-xxxx" # Uncomment to override the `DEEPSEEK_API_KEY` set in the environment variable
## base_url: ""
# provider: "SiliconFlow"
# config:
# model: "deepseek-ai/DeepSeek-R1"
## api_key: "xxxx" # Uncomment to override the `SILICONFLOW_API_KEY` set in the environment variable
## base_url: ""
# provider: "PPIO"
# config:
# model: "deepseek/deepseek-r1-turbo"
## api_key: "xxxx" # Uncomment to override the `PPIO_API_KEY` set in the environment variable
## base_url: ""
# provider: "TogetherAI"
# config:
# model: "deepseek-ai/DeepSeek-R1"
## api_key: "xxxx" # Uncomment to override the `TOGETHER_API_KEY` set in the environment variable
# provider: "AzureOpenAI"
# config:
# model: ""
# api_version: ""
## azure_endpoint: "xxxx" # Uncomment to override the `AZURE_OPENAI_ENDPOINT` set in the environment variable
## api_key: "xxxx" # Uncomment to override the `AZURE_OPENAI_KEY` set in the environment variable
# provider: "Ollama"
# config:
# model: "qwq"
## base_url: ""
# provider: "Novita"
# config:
# model: "deepseek/deepseek-v3-0324"
## api_key: "xxxx" # Uncomment to override the `NOVITA_API_KEY` set in the environment variable
## base_url: ""
embedding:
provider: "OpenAIEmbedding"
config:
model: "text-embedding-ada-002"
# api_key: "" # Uncomment to override the `OPENAI_API_KEY` set in the environment variable
# provider: "MilvusEmbedding"
# config:
# model: "default"
# provider: "VoyageEmbedding"
# config:
# model: "voyage-3"
## api_key: "" # Uncomment to override the `VOYAGE_API_KEY` set in the environment variable
# provider: "BedrockEmbedding"
# config:
# model: "amazon.titan-embed-text-v2:0"
## aws_access_key_id: "" # Uncomment to override the `AWS_ACCESS_KEY_ID` set in the environment variable
## aws_secret_access_key: "" # Uncomment to override the `AWS_SECRET_ACCESS_KEY` set in the environment variable
# provider: "SiliconflowEmbedding"
# config:
# model: "BAAI/bge-m3"
# . api_key: "" # Uncomment to override the `SILICONFLOW_API_KEY` set in the environment variable
# provider: "NovitaEmbedding"
# config:
# model: "baai/bge-m3"
# . api_key: "" # Uncomment to override the `NOVITA_API_KEY` set in the environment variable
file_loader:
# provider: "PDFLoader"
# config: {}
provider: "JsonFileLoader"
config:
text_key: "text"
# provider: "TextLoader"
# config: {}
# provider: "UnstructuredLoader"
# config: {}
web_crawler:
provider: "FireCrawlCrawler"
config: {}
# provider: "Crawl4AICrawler"
# config: {}
# provider: "JinaCrawler"
# config: {}
vector_db:
provider: "Milvus"
config:
default_collection: "deepsearcher"
uri: "./milvus.db"
token: "root:Milvus"
db: "default"
query_settings:
max_iter: 3
load_settings:
chunk_size: 1500
chunk_overlap: 100

329
evaluation/evaluate.py

@ -0,0 +1,329 @@
# Some test dataset and evaluation method are ref from https://github.com/OSU-NLP-Group/HippoRAG/tree/main/data , many thanks
################################################################################
# Note: This evaluation script will cost a lot of LLM token usage, please make sure you have enough token budget.
################################################################################
import argparse
import ast
import json
import logging
import os
import time
import warnings
from collections import defaultdict
from typing import List, Tuple
import pandas as pd
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import naive_retrieve, retrieve
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
warnings.simplefilter(action="ignore", category=FutureWarning) # disable warning output
current_dir = os.path.dirname(os.path.abspath(__file__))
k_list = [2, 5]
def _deepsearch_retrieve_titles(
question: str,
retry_num: int = 4,
base_wait_time: int = 4,
max_iter: int = 3,
) -> Tuple[List[str], int, bool]:
"""
Retrieve document titles using DeepSearcher with retry mechanism.
Args:
question (str): The query question.
retry_num (int, optional): Number of retry attempts. Defaults to 4.
base_wait_time (int, optional): Base wait time between retries in seconds. Defaults to 4.
max_iter (int, optional): Maximum number of iterations for retrieval. Defaults to 3.
Returns:
Tuple[List[str], int, bool]: A tuple containing:
- List of retrieved document titles
- Number of tokens consumed
- Boolean indicating whether the retrieval failed
"""
retrieved_results = []
consume_tokens = 0
for i in range(retry_num):
try:
retrieved_results, _, consume_tokens = retrieve(question, max_iter=max_iter)
break
except Exception:
wait_time = base_wait_time * (2**i)
print(f"Parse LLM's output failed, retry again after {wait_time} seconds...")
time.sleep(wait_time)
if retrieved_results:
retrieved_titles = [
retrieved_result.metadata["title"] for retrieved_result in retrieved_results
]
fail = False
else:
print("Pipeline error, no retrieved results.")
retrieved_titles = []
fail = True
return retrieved_titles, consume_tokens, fail
def _naive_retrieve_titles(question: str) -> List[str]:
"""
Retrieve document titles using naive retrieval method.
Args:
question (str): The query question.
Returns:
List[str]: List of retrieved document titles.
"""
retrieved_results = naive_retrieve(question)
retrieved_titles = [
retrieved_result.metadata["title"] for retrieved_result in retrieved_results
]
return retrieved_titles
def _calcu_recall(sample, retrieved_titles, dataset) -> dict:
"""
Calculate recall metrics for retrieved titles.
Args:
sample: The sample data containing ground truth information.
retrieved_titles: List of retrieved document titles.
dataset (str): The name of the dataset being evaluated.
Returns:
dict: Dictionary containing recall values at different k values.
Raises:
NotImplementedError: If the dataset is not supported.
"""
if dataset in ["2wikimultihopqa"]:
gold_passages = [item for item in sample["supporting_facts"]]
gold_items = set([item[0] for item in gold_passages])
retrieved_items = retrieved_titles
else:
raise NotImplementedError
recall = dict()
for k in k_list:
recall[k] = round(
sum(1 for t in gold_items if t in retrieved_items[:k]) / len(gold_items), 4
)
return recall
def _print_recall_line(recall: dict, pre_str="", post_str="\n"):
"""
Print recall metrics in a formatted line.
Args:
recall (dict): Dictionary containing recall values at different k values.
pre_str (str, optional): String to print before recall values. Defaults to "".
post_str (str, optional): String to print after recall values. Defaults to "\n".
"""
print(pre_str, end="")
for k in k_list:
print(f"R@{k}: {recall[k]:.3f} ", end="")
print(post_str, end="")
def evaluate(
dataset: str,
output_root: str,
pre_num: int = 10,
max_iter: int = 3,
skip_load=False,
flag: str = "result",
):
"""
Evaluate the retrieval performance on a dataset.
Args:
dataset (str): Name of the dataset to evaluate.
output_root (str): Root directory for output files.
pre_num (int, optional): Number of samples to evaluate. Defaults to 10.
max_iter (int, optional): Maximum number of iterations for retrieval. Defaults to 3.
skip_load (bool, optional): Whether to skip loading the dataset. Defaults to False.
flag (str, optional): Flag for the evaluation run. Defaults to "result".
"""
corpus_file = os.path.join(current_dir, f"../examples/data/{dataset}_corpus.json")
if not skip_load:
# set chunk size to a large number to avoid chunking, because the dataset was chunked already.
load_from_local_files(
corpus_file, force_new_collection=True, chunk_size=999999, chunk_overlap=0
)
eval_output_subdir = os.path.join(output_root, flag)
os.makedirs(eval_output_subdir, exist_ok=True)
csv_file_path = os.path.join(eval_output_subdir, "details.csv")
statistics_file_path = os.path.join(eval_output_subdir, "statistics.json")
data_with_gt_file_path = os.path.join(current_dir, f"../examples/data/{dataset}.json")
data_with_gt = json.load(open(data_with_gt_file_path, "r"))
if not pre_num:
pre_num = len(data_with_gt)
pipeline_error_num = 0
end_ind = min(pre_num, len(data_with_gt))
start_ind = 0
existing_df = pd.DataFrame()
existing_statistics = defaultdict(dict)
existing_token_usage = 0
existing_error_num = 0
existing_sample_num = 0
if os.path.exists(csv_file_path):
existing_df = pd.read_csv(csv_file_path)
start_ind = len(existing_df)
print(f"Loading results from {csv_file_path}, start_index = {start_ind}")
if os.path.exists(statistics_file_path):
existing_statistics = json.load(open(statistics_file_path, "r"))
print(
f"Loading statistics from {statistics_file_path}, will recalculate the statistics based on both new and existing results."
)
existing_token_usage = existing_statistics["deepsearcher"]["token_usage"]
existing_error_num = existing_statistics["deepsearcher"].get("error_num", 0)
existing_sample_num = existing_statistics["deepsearcher"].get("sample_num", 0)
for sample_idx, sample in enumerate(data_with_gt[start_ind:end_ind]):
global_idx = sample_idx + start_ind
question = sample["question"]
retrieved_titles, consume_tokens, fail = _deepsearch_retrieve_titles(
question, max_iter=max_iter
)
retrieved_titles_naive = _naive_retrieve_titles(question)
if fail:
pipeline_error_num += 1
print(
f"Pipeline error, no retrieved results. Current pipeline_error_num = {pipeline_error_num}"
)
print(f"idx: {global_idx}: ")
recall = _calcu_recall(sample, retrieved_titles, dataset)
recall_naive = _calcu_recall(sample, retrieved_titles_naive, dataset)
current_result = [
{
"idx": global_idx,
"question": question,
"recall": recall,
"recall_naive": recall_naive,
"gold_titles": [item[0] for item in sample["supporting_facts"]],
"retrieved_titles": retrieved_titles,
"retrieved_titles_naive": retrieved_titles_naive,
}
]
current_df = pd.DataFrame(current_result)
existing_df = pd.concat([existing_df, current_df], ignore_index=True)
existing_df.to_csv(csv_file_path, index=False)
average_recall = dict()
average_recall_naive = dict()
for k in k_list:
average_recall[k] = sum(
[
ast.literal_eval(d).get(k) if isinstance(d, str) else d.get(k)
for d in existing_df["recall"]
]
) / len(existing_df)
average_recall_naive[k] = sum(
[
ast.literal_eval(d).get(k) if isinstance(d, str) else d.get(k)
for d in existing_df["recall_naive"]
]
) / len(existing_df)
_print_recall_line(average_recall, pre_str="Average recall of DeepSearcher: ")
_print_recall_line(average_recall_naive, pre_str="Average recall of naive RAG : ")
existing_token_usage += consume_tokens
existing_error_num += 1 if fail else 0
existing_sample_num += 1
existing_statistics["deepsearcher"]["average_recall"] = average_recall
existing_statistics["deepsearcher"]["token_usage"] = existing_token_usage
existing_statistics["deepsearcher"]["error_num"] = existing_error_num
existing_statistics["deepsearcher"]["sample_num"] = existing_sample_num
existing_statistics["deepsearcher"]["token_usage_per_sample"] = (
existing_token_usage / existing_sample_num
)
existing_statistics["naive_rag"]["average_recall"] = average_recall_naive
json.dump(existing_statistics, open(statistics_file_path, "w"), indent=4)
print("")
print("Finish results to save.")
def main_eval():
"""
Main function for running the evaluation from command line.
This function parses command line arguments and calls the evaluate function
with the appropriate parameters.
"""
parser = argparse.ArgumentParser(prog="evaluate", description="Deep Searcher evaluation.")
parser.add_argument(
"--dataset",
type=str,
default="2wikimultihopqa",
help="Dataset name, default is `2wikimultihopqa`. More datasets will be supported in the future.",
)
parser.add_argument(
"--config_yaml",
type=str,
default="./eval_config.yaml",
help="Configuration yaml file path, default is `./eval_config.yaml`",
)
parser.add_argument(
"--pre_num",
type=int,
default=30,
help="Number of samples to evaluate, default is 30",
)
parser.add_argument(
"--max_iter",
type=int,
default=3,
help="Max iterations of reflection. Default is 3. It will overwrite the one in config yaml file.",
)
parser.add_argument(
"--output_dir",
type=str,
default="./eval_output",
help="Output root directory, default is `./eval_output`",
)
parser.add_argument(
"--skip_load",
action="store_true",
help="Whether to skip loading the dataset. Default it don't skip loading. If you want to skip loading, please set this flag.",
)
parser.add_argument(
"--flag",
type=str,
default="result",
help="Flag for evaluation, default is `result`",
)
args = parser.parse_args()
config = Configuration(config_path=args.config_yaml)
init_config(config=config)
evaluate(
dataset=args.dataset,
output_root=args.output_dir,
pre_num=args.pre_num,
max_iter=args.max_iter,
skip_load=args.skip_load,
flag=args.flag,
)
if __name__ == "__main__":
main_eval()

BIN
evaluation/plot_results/max_iter_vs_avg_token_usage.png

Binary file not shown.


BIN
evaluation/plot_results/max_iter_vs_error_num.png

Binary file not shown.


BIN
evaluation/plot_results/max_iter_vs_recall.png

Binary file not shown.


35
examples/basic_example.py

@ -0,0 +1,35 @@
import logging
import os
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query
from deepsearcher.configuration import Configuration, init_config
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
config = Configuration() # Customize your config here
init_config(config=config)
# You should clone the milvus docs repo to your local machine first, execute:
# git clone https://github.com/milvus-io/milvus-docs.git
# Then replace the path below with the path to the milvus-docs repo on your local machine
# import glob
# all_md_files = glob.glob('xxx/milvus-docs/site/en/**/*.md', recursive=True)
# load_from_local_files(paths_or_directory=all_md_files, collection_name="milvus_docs", collection_description="All Milvus Documents")
# Hint: You can also load a single file; run this script from the root directory of the deep-searcher project
load_from_local_files(
    paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
    collection_name="milvus_docs",
    collection_description="All Milvus Documents",
    # force_new_collection=True,  # Set to True to drop the existing collection and create a new one on every run
)
question = "Write a report comparing Milvus with other vector databases."
_, _, consumed_token = query(question, max_iter=1)
print(f"Consumed tokens: {consumed_token}")

68
examples/basic_example_azuresearch.py

@ -0,0 +1,68 @@
import logging
import os
import time
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

logger.info("Initializing DeepSearcher configuration")
config = Configuration()

config.set_provider_config("llm", "AzureOpenAI", {
    "model": "gpt-4.1",
    "api_key": "<yourkey>",
    "base_url": "https://<youraifoundry>.openai.azure.com/openai/",
    "api_version": "2024-12-01-preview"
})
config.set_provider_config("embedding", "OpenAIEmbedding", {
    "model": "text-embedding-ada-002",
    "api_key": "<yourkey>",
    "azure_endpoint": "https://<youraifoundry>.openai.azure.com/",
    "api_version": "2023-05-15"
})
config.set_provider_config("vector_db", "AzureSearch", {
    "endpoint": "https://<yourazureaisearch>.search.windows.net",
    "index_name": "<yourindex>",
    "api_key": "<yourkey>",
    "vector_field": "content_vector"
})
logger.info("Configuration initialized successfully")

try:
    logger.info("Applying global configuration")
    init_config(config)
    logger.info("Configuration applied globally")

    # Example question
    question = "Create a detailed report about what Python is all about"
    logger.info(f"Processing query: '{question}'")

    start_time = time.time()
    result = query(question)
    query_time = time.time() - start_time

    logger.info(f"Query processed in {query_time:.2f} seconds")
    logger.info("Retrieved result successfully")
    print(result[0])  # Print the answer (first element of the returned tuple)

    # Check if there's a second element in the tuple that contains source documents
    if len(result) > 1 and hasattr(result[1], "__len__"):
        logger.info(f"Found {len(result[1])} source documents")
        for i, doc in enumerate(result[1]):
            if hasattr(doc, "metadata") and "source" in doc.metadata:
                logger.info(f"Source {i+1}: {doc.metadata['source']}")
except Exception as e:
    logger.error(f"Error executing query: {str(e)}")
    import traceback

    logger.error(traceback.format_exc())

40
examples/basic_example_oracle.py

@ -0,0 +1,40 @@
import sys, os
from pathlib import Path
script_directory = Path(__file__).resolve().parent.parent
sys.path.append(os.path.abspath(script_directory))
import logging
httpx_logger = logging.getLogger("httpx") # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)
current_dir = os.path.dirname(os.path.abspath(__file__))
# Customize your config here
from deepsearcher.configuration import Configuration, init_config
config = Configuration()
init_config(config=config)
# Load your local data
# Hint: You can load from a directory or a single file; run this script from the root directory of the deep-searcher project
from deepsearcher.offline_loading import load_from_local_files
load_from_local_files(
    paths_or_directory=os.path.join(current_dir, "data/WhatisMilvus.pdf"),
    collection_name="milvus_docs",
    collection_description="All Milvus Documents",
    # force_new_collection=True,  # Set to True to drop the existing collection and create a new one on every run
)
# Query
from deepsearcher.online_query import query
question = 'Write a report comparing Milvus with other vector databases.'
answer, retrieved_results, consumed_token = query(question)
print(answer)
# Get consumed tokens: roughly 25k-30k tokens when using the OpenAI gpt-4o model
# print(f"Consumed tokens: {consumed_token}")

Some files were not shown because too many files changed in this diff
