更换provider，修复语法错误

1 week ago · 2b20787e50
8 changed files with 39 additions and 88 deletions
--- a/deepsearcher/config.yaml
+++ b/deepsearcher/config.yaml
@ -2,16 +2,16 @@ provide_settings:
  llm:
    provider: "OpenAILLM"
    config:
-      model: "Qwen/Qwen3-8B-FP8"
+      model: "Qwen/Qwen3-30B-A3B"
-      api_key: "empty"
+      api_key: "sk-fpzwvagjkhwysjsozfybvtjzongatcwqdihdxzuijnfdrjzt"
-      base_url: "http://localhost:8000/v1"
+      base_url: "https://api.siliconflow.cn/v1"
  embedding:
    provider: "OpenAIEmbedding"
-    config:
+    config: 
      model: "Qwen/Qwen3-Embedding-0.6B"
-      api_key: "empty"
+      api_key: "sk-fpzwvagjkhwysjsozfybvtjzongatcwqdihdxzuijnfdrjzt"
-      base_url: "http://localhost:8001/v1"
+      base_url: "https://api.siliconflow.cn/v1"
      dimension: 1024
      dim_change: false
--- a/deepsearcher/embedding/base.py
+++ b/deepsearcher/embedding/base.py
@ -1,5 +1,3 @@
 from typing import List
 from tqdm import tqdm
 from deepsearcher.loader.splitter import Chunk
@ -14,7 +12,7 @@ class BaseEmbedding:
    for the dimensionality of the embeddings.
    """
-    def embed_query(self, text: str) -> List[float]:
+    def embed_query(self, text: str) -> list[float]:
        """
        Embed a single query text.
@ -26,7 +24,7 @@ class BaseEmbedding:
        """
        pass
-    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """
        Embed a list of document texts.
@ -41,7 +39,7 @@ class BaseEmbedding:
        """
        return [self.embed_query(text) for text in texts]
-    def embed_chunks(self, chunks: List[Chunk], batch_size: int = 256) -> List[Chunk]:
+    def embed_chunks(self, chunks: list[Chunk], batch_size: int = 256) -> list[Chunk]:
        """
        Embed a list of Chunk objects.
--- a/deepsearcher/embedding/openai_embedding.py
+++ b/deepsearcher/embedding/openai_embedding.py
@ -1,5 +1,4 @@
 import os
 from typing import List
 from openai import OpenAI
 from openai._types import NOT_GIVEN
@ -45,20 +44,20 @@ class OpenAIEmbedding(BaseEmbedding):
            model = kwargs.pop("model_name")
        if "dimension" in kwargs:
-            dimension = kwargs.pop("dimension") 
+            dimension = kwargs.pop("dimension")
        else:
            dimension = NOT_GIVEN
-        
+
        if "dim_change" in kwargs:
            dim_change = kwargs.pop("dim_change")
-        
+
        self.dim = dimension
        self.dim_change = dim_change
        self.model = model
        self.client = OpenAI(api_key=api_key, base_url=base_url, **kwargs)
-    def embed_query(self, text: str) -> List[float]:
+    def embed_query(self, text: str) -> list[float]:
        """
        Embed a single query text.
@ -75,7 +74,7 @@ class OpenAIEmbedding(BaseEmbedding):
        return response.data[0].embedding
-    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """
        Embed a list of document texts.
--- a/deepsearcher/llm/openai_llm.py
+++ b/deepsearcher/llm/openai_llm.py
@ -59,7 +59,9 @@ class OpenAILLM(BaseLLM):
        for chunk in completion:
            stream_response = chunk.choices[0].delta.content
            if stream_response:
                print(stream_response, end="", flush=True)
                response += stream_response
                if stream_callback:
                    stream_callback(stream_response)
        print("\n")
        return response
--- a/deepsearcher/offline_loading.py
+++ b/deepsearcher/offline_loading.py
@ -1,6 +1,5 @@
 import hashlib
 import os
 from typing import List, Union
 from tqdm import tqdm
@ -10,7 +9,7 @@ from deepsearcher.loader.splitter import split_docs_to_chunks
 def load_from_local_files(
-    paths_or_directory: Union[str, List[str]],
+    paths_or_directory: str | list[str],
    collection_name: str = None,
    collection_description: str = None,
    force_new_collection: bool = False,
@ -50,7 +49,7 @@ def load_from_local_files(
        description=collection_description,
        force_new_collection=force_new_collection,
    )
-    
+
    # 如果force_rebuild为True，则强制重建集合
    if force_rebuild:
        vector_db.init_collection(
@ -59,7 +58,7 @@ def load_from_local_files(
            description=collection_description,
            force_new_collection=True,
        )
-    
+
    if isinstance(paths_or_directory, str):
        paths_or_directory = [paths_or_directory]
    all_docs = []
@ -92,7 +91,7 @@ def load_from_local_files(
 def load_from_website(
-    urls: Union[str, List[str]],
+    urls: str | list[str],
    collection_name: str = None,
    collection_description: str = None,
    force_new_collection: bool = False,
--- a/deepsearcher/online_query.py
+++ b/deepsearcher/online_query.py
@ -1,11 +1,9 @@
 from typing import List, Tuple
 # from deepsearcher.configuration import vector_db, embedding_model, llm
 from deepsearcher import configuration
 from deepsearcher.vector_db.base import RetrievalResult
-def query(original_query: str, max_iter: int = 3) -> Tuple[str, List[RetrievalResult]]:
+def query(original_query: str, max_iter: int = 3) -> tuple[str, list[RetrievalResult]]:
    """
    Query the knowledge base with a question and get an answer.
@ -27,7 +25,7 @@ def query(original_query: str, max_iter: int = 3) -> Tuple[str, List[RetrievalRe
 def retrieve(
    original_query: str, max_iter: int = 3
-) -> Tuple[List[RetrievalResult], List[str]]:
+) -> tuple[list[RetrievalResult], list[str]]:
    """
    Retrieve relevant information from the knowledge base without generating an answer.
@ -48,47 +46,3 @@ def retrieve(
        original_query, max_iter=max_iter
    )
    return retrieved_results, []
 def naive_retrieve(query: str, collection: str = None, top_k=10) -> List[RetrievalResult]:
    """
    Perform a simple retrieval from the knowledge base using the naive RAG approach.
    This function uses the naive RAG agent to retrieve information from the knowledge base
    without any advanced techniques like iterative refinement.
    Args:
        query: The question or query to search for.
        collection: The name of the collection to search in. If None, searches in all collections.
        top_k: The maximum number of results to return.
    Returns:
        A list of retrieval results.
    """
    naive_rag = configuration.naive_rag
    all_retrieved_results, _ = naive_rag.retrieve(query)
    return all_retrieved_results
 def naive_rag_query(
    query: str, collection: str = None, top_k=10
 ) -> Tuple[str, List[RetrievalResult]]:
    """
    Query the knowledge base using the naive RAG approach and get an answer.
    This function uses the naive RAG agent to query the knowledge base and generate
    an answer based on the retrieved information, without any advanced techniques.
    Args:
        query: The question or query to search for.
        collection: The name of the collection to search in. If None, searches in all collections.
        top_k: The maximum number of results to consider.
    Returns:
        A tuple containing:
            - The generated answer as a string
            - A list of retrieval results that were used to generate the answer
    """
    naive_rag = configuration.naive_rag
    answer, retrieved_results = naive_rag.query(query)
    return answer, retrieved_results
--- a/main.py
+++ b/main.py
@ -1,5 +1,4 @@
 import argparse
 from typing import Dict, List, Union
 import uvicorn
 from fastapi import Body, FastAPI, HTTPException, Query
@ -31,7 +30,7 @@ class ProviderConfigRequest(BaseModel):
    feature: str
    provider: str
-    config: Dict
+    config: dict
@app.get("/", response_class=HTMLResponse)
@ -43,15 +42,15 @@ async def read_root():
    current_dir = os.path.dirname(os.path.abspath(__file__))
    # 构建模板文件路径 - 修复路径问题
    template_path = os.path.join(current_dir, "deepsearcher", "backend", "templates", "index.html")
-    
+
    # 读取HTML文件内容
    try:
-        with open(template_path, "r", encoding="utf-8") as file:
+        with open(template_path, encoding="utf-8") as file:
            html_content = file.read()
        return HTMLResponse(content=html_content, status_code=200)
    except FileNotFoundError:
        # 如果找不到文件，提供一个简单的默认页面
-        default_html = """
+        default_html = f"""
        <!DOCTYPE html>
        <html>
        <head>
@ -71,7 +70,7 @@ async def read_root():
                <div class="info">
                    <p>欢迎使用 DeepSearcher 智能搜索系统!</p>
                    <p>系统正在运行，但未找到前端模板文件。</p>
-                    <p>请确认文件是否存在: {}</p>
+                    <p>请确认文件是否存在: {template_path}</p>
                </div>
                <div class="info">
                    <h2>API 接口</h2>
@ -86,7 +85,7 @@ async def read_root():
            </div>
        </body>
        </html>
-        """.format(template_path)
+        """
        return HTMLResponse(content=default_html, status_code=200)
@ -118,7 +117,7 @@ def set_provider_config(request: ProviderConfigRequest):
@app.post("/load-files/")
 def load_files(
-    paths: Union[str, List[str]] = Body(
+    paths: str | list[str] = Body(
        ...,
        description="A list of file paths to be loaded.",
        examples=["/path/to/file1", "/path/to/file2", "/path/to/dir1"],
@ -160,7 +159,7 @@ def load_files(
            paths_or_directory=paths,
            collection_name=collection_name,
            collection_description=collection_description,
-            batch_size=batch_size if batch_size is not None else 256,  # 提供默认值
+            batch_size=batch_size if batch_size is not None else 8,  # 提供默认值
        )
        return {"message": "Files loaded successfully."}
    except Exception as e:
@ -169,7 +168,7 @@ def load_files(
@app.post("/load-website/")
 def load_website(
-    urls: Union[str, List[str]] = Body(
+    urls: str | list[str] = Body(
        ...,
        description="A list of URLs of websites to be loaded.",
        examples=["https://milvus.io/docs/overview.md"],
@ -249,15 +248,15 @@ def perform_query(
        # 清除之前的进度消息
        from deepsearcher.utils.log import clear_progress_messages
        clear_progress_messages()
-        
+
        result_text, _, consume_token = query(original_query, max_iter)
-        
+
        # 获取进度消息
        from deepsearcher.utils.log import get_progress_messages
        progress_messages = get_progress_messages()
-        
+
        return {
-            "result": result_text, 
+            "result": result_text,
            "consume_token": consume_token,
            "progress_messages": progress_messages
        }
@ -271,7 +270,7 @@ if __name__ == "__main__":
    parser.add_argument("--port", type=int, default=8000, help="Port to bind the server to")
    parser.add_argument("--enable-cors", action="store_true", default=False, help="Enable CORS support")
    args = parser.parse_args()
-    
+
    if args.enable_cors:
        app.add_middleware(
            CORSMiddleware,
@ -283,6 +282,6 @@ if __name__ == "__main__":
        print("CORS is enabled.")
    else:
        print("CORS is disabled.")
-        
+
    print(f"Starting server on {args.host}:{args.port}")
-    uvicorn.run(app, host=args.host, port=args.port)
+    uvicorn.run(app, host=args.host, port=args.port)
--- a/test.py
+++ b/test.py
@ -11,7 +11,7 @@ config.load_config_from_yaml("deepsearcher/config.yaml")
 init_config(config = config)
 # Load your local data
-load_from_local_files(paths_or_directory="examples/data", force_rebuild=True)
+load_from_local_files(paths_or_directory="examples/data", force_rebuild=True, batch_size=8)
 # (Optional) Load from web crawling (`FIRECRAWL_API_KEY` env variable required)
 # from deepsearcher.offline_loading import load_from_website