diff --git a/deepsearcher/agent/deep_search.py b/deepsearcher/agent/deep_search.py index f2e48e3..fddf54b 100644 --- a/deepsearcher/agent/deep_search.py +++ b/deepsearcher/agent/deep_search.py @@ -20,7 +20,7 @@ COLLECTION_ROUTE_PROMPT = """ "合集信息": {collection_info} 使用的语言与问题相同 -你需要返回的格式是 a python list of str without any addtional content: +你需要返回的格式是 a python list[str] without any addtional content: """ @@ -29,7 +29,12 @@ SUB_QUERY_PROMPT = """ 请你使用自顶向下和自底向上两种方向来思考如何拆分问题 子问题的数量不可以太多,但是也不可以太少,应当保证问题的回答全面性,请根据问题复杂程度来决定子问题的数量 如果原问题本身非常简单,没有必要进行拆分,则保留输出原问题本身 -需要保证每个子问题都具体、清晰、不可分(原子性,即不可以再包含更细分的子问题),子问题中不要包含"请你回答"、"请你总结"、"请你分析"等祈使类型词语 +需要保证每个子问题都具体、清晰、不可分(原子性,即不可以再包含更细分的子问题),但是也不可以过于片面狭窄,否则会降低回答的全面性 +例如对于机器学习,如果问题是"XGBoost是什么",那么子问题: +"XGBoost和其他Boosting算法有什么区别"(正确) +"XGBoost和Gradient Boost有什么区别"(错误) +"XGBoost和其他Boosting算法有什么区别(如AdaBoost、Gradient Boost)"(错误) +同时,子问题中不要包含"请你回答"、"请你总结"、"请你分析"等祈使类型词语 你需要最终返回一个字符串列表 原问题: {original_query} @@ -43,14 +48,14 @@ SUB_QUERY_PROMPT = """ "什么是机器学习?", "机器学习的使用目的", "机器学习的常用算法", - "机器学习的历史演进过程", - "机器学习和深度学习的区别是什么?" + "机器学习算法的演进过程", + "机器学习和现在流行的深度学习的区别是什么?" ] 使用的语言与原问题相同 -你需要返回的是 a python list of str without any addtional content: +你需要返回的是 a python list[str] without any addtional content: """ @@ -66,7 +71,7 @@ RERANK_PROMPT = """ 例如,假如给出4个chunks(实际检索到的文档片段不一定是这么多),返回4个"True"或者"False"(注意这只是一个示例,不代表实际判断): ["True", "False", "True", "True"] 使用的语言与问题相同 -你需要返回的是 a python list of str(bool) without any addtional content: +你需要返回的是 a python list[str(bool)] without any addtional content: """ @@ -85,7 +90,7 @@ REFLECT_PROMPT = """ {chunks} 使用的语言与原问题相同 -你需要返回的是 a python list of str without any addtional content: +你需要返回的是 a python list[str] without any addtional content: """ @@ -93,6 +98,7 @@ SUMMARY_PROMPT = """ 你是一个内容分析专家 请你综合已经提出的问题和检索到的信息,以原问题为中心,生成详细准确、层次分明(多级标题,从一级开始)、尽可能长的回答。 如果检索到的信息不足以回答问题,你应该使用你的知识来进行扩展补充。 +如果检索到的文档片段中有其他有用的信息,但是没有在之前的问题中被提出,你也应该添加进最终回答中。 注意,不要逐个回答问题,而是应该综合所有问题和信息,生成一个完整的回答。 同时,你应该根据提供的信息生成文内引用"[^index]"(markdown文内引用)。 来自的引用序号从[^index]从index=1开始,来源需要与前文中的"id"一致。 @@ -248,9 +254,9 @@ class DeepSearch(BaseAgent): send_info(f"本地向量搜索找到 {len(vector_results)} 个结果") # 网页搜索 - self.web_search = WebSearch() if kwargs.get('web_search', False) else None + self.web_search = True if kwargs.get('web_search', False) else None if self.web_search: - web_results = self.web_search.search_with_retry(query, size=2) + web_results = WebSearch().search_with_retry(query, size=4) if web_results: send_info(f"网页搜索找到 {len(web_results)} 个结果") else: @@ -473,7 +479,7 @@ class DeepSearch(BaseAgent): absolute_path = str(Path(reference).resolve()) encoded_path = urllib.parse.quote(absolute_path, safe='') # 使用相对路径,这样可以在不同的服务器配置下工作 - formated_refs.append(f"[^{i + 1}]: [/file/{encoded_path}](/file/{encoded_path})\n") + formated_refs.append(f"[^{i + 1}]: [{absolute_path}](/file/{encoded_path})\n") except Exception as _: formated_refs.append(f"[^{i + 1}]: {reference}\n") diff --git a/deepsearcher/config.yaml b/deepsearcher/config.yaml index 1d70dbb..f003222 100644 --- a/deepsearcher/config.yaml +++ b/deepsearcher/config.yaml @@ -2,12 +2,12 @@ provide_settings: llm: provider: "OpenAILLM" config: - # model: "Qwen/Qwen3-32B" - # api_key: "sk-fpzwvagjkhwysjsozfybvtjzongatcwqdihdxzuijnfdrjzt" - #base_url: "https://api.siliconflow.cn/v1" - model: qwen3-32b - api_key: sk-14f39f0c530d4aa0b5588454bff859d6 - base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 + model: "Qwen/Qwen3-32B" + api_key: "sk-fpzwvagjkhwysjsozfybvtjzongatcwqdihdxzuijnfdrjzt" + base_url: "https://api.siliconflow.cn/v1" + # model: qwen3-32b + # api_key: sk-14f39f0c530d4aa0b5588454bff859d6 + # base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 embedding: provider: "OpenAIEmbedding" diff --git a/deepsearcher/online_query.py b/deepsearcher/online_query.py index 3ca20af..4c3105b 100644 --- a/deepsearcher/online_query.py +++ b/deepsearcher/online_query.py @@ -23,24 +23,3 @@ def query(original_query: str, **kwargs) -> tuple[str, list[RetrievalResult]]: max_iter = kwargs.get("max_iter", 3) web_search = kwargs.get("web_search", False) return default_searcher.query(original_query, max_iter=max_iter, web_search=web_search) - - -def retrieve(original_query: str, max_iter: int | None = None) -> tuple[list[RetrievalResult], list[str]]: - """ - Retrieve relevant information from the knowledge base without generating an answer. - - This function uses the default searcher to retrieve information from the knowledge base - that is relevant to the query. - - Args: - original_query: The question or query to search for. - max_iter: Maximum number of iterations for the search process. - - Returns: - A tuple containing: - - A list of retrieval results - - A list of strings representing consumed tokens - """ - default_searcher = configuration.default_searcher - retrieved_results, metadata = default_searcher.retrieve(original_query, max_iter=max_iter) - return retrieved_results diff --git a/deepsearcher/templates/static/themes/Readme.md b/deepsearcher/templates/static/themes/Readme.md deleted file mode 100644 index c14481a..0000000 --- a/deepsearcher/templates/static/themes/Readme.md +++ /dev/null @@ -1,4 +0,0 @@ -The built-in CSS will be replaced after update / reinstall, DO NOT MODIFY THEM. - -Refer https://support.typora.io/Add-Custom-CSS/ when you want to modify those CSS. -Refer https://support.typora.io/About-Themes/ if you want to create / install new themes. \ No newline at end of file diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-105-Heavy.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-105-Heavy.ttf new file mode 100755 index 0000000..45dcb88 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-105-Heavy.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-115-Black.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-115-Black.ttf new file mode 100755 index 0000000..70281c0 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-115-Black.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-35-Thin.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-35-Thin.ttf new file mode 100755 index 0000000..7245c8e Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-35-Thin.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-45-Light.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-45-Light.ttf new file mode 100755 index 0000000..7d15c74 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-45-Light.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-55-Regular.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-55-Regular.ttf new file mode 100755 index 0000000..b69b510 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-55-Regular.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-65-Medium.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-65-Medium.ttf new file mode 100755 index 0000000..588a02d Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-65-Medium.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-75-SemiBold.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-75-SemiBold.ttf new file mode 100755 index 0000000..23e4502 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-75-SemiBold.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-85-Bold.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-85-Bold.ttf new file mode 100755 index 0000000..05c3f88 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-85-Bold.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-95-ExtraBold.ttf b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-95-ExtraBold.ttf new file mode 100755 index 0000000..5d02af4 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-95-ExtraBold.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/STSongti-SC-Black.ttf b/deepsearcher/templates/static/themes/latex/STSongti-SC-Black.ttf new file mode 100755 index 0000000..bcf01d4 Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/STSongti-SC-Black.ttf differ diff --git a/deepsearcher/templates/static/themes/latex/STSongti-SC-Regular.ttf b/deepsearcher/templates/static/themes/latex/STSongti-SC-Regular.ttf new file mode 100755 index 0000000..6e3bfbb Binary files /dev/null and b/deepsearcher/templates/static/themes/latex/STSongti-SC-Regular.ttf differ diff --git a/deepsearcher/web_search.py b/deepsearcher/web_search.py index 11ce773..27260e6 100644 --- a/deepsearcher/web_search.py +++ b/deepsearcher/web_search.py @@ -6,12 +6,12 @@ from deepsearcher.utils import log class WebSearch: - """网页搜索类,用于调用metaso.cn API进行网页搜索""" + """Web search class for calling metaso.cn API to perform web searches""" def __init__(self, api_key: str = "mk-CCEA085159C048597435780530A55403"): """ - 初始化网页搜索 + Initialize web search Args: - api_key (str): metaso.cn API密钥 + api_key (str): metaso.cn API key """ self.api_key = api_key self.base_url = "metaso.cn" @@ -19,15 +19,15 @@ class WebSearch: def search(self, query: str, size: int = 4) -> list[RetrievalResult]: """ - 执行网页搜索 + Execute web search Args: - query (str): 搜索查询 - size (int): 返回结果数量,默认为4 + query (str): Search query + size (int): Number of results to return, default is 4 Returns: - List[RetrievalResult]: 搜索结果列表 + List[RetrievalResult]: List of search results """ try: - # 构建请求数据 + # Build request data payload = json.dumps({ "q": query, "scope": "webpage", @@ -43,33 +43,33 @@ class WebSearch: 'Content-Type': 'application/json' } - # 发送请求 + # Send request conn = http.client.HTTPSConnection(self.base_url) conn.request("POST", self.endpoint, payload, headers) res = conn.getresponse() data = res.read() if res.status != 200: - log.error(f"网页搜索请求失败: {res.status} - {data.decode('utf-8')}") + log.error(f"Web search request failed: {res.status} - {data.decode('utf-8')}") return [] response_data = json.loads(data.decode("utf-8")) - # 解析搜索结果 + # Parse search results results = [] if "webpages" in response_data: for i, webpage in enumerate(response_data["webpages"]): - # 使用content字段作为主要文本内容 + # Use content field as primary text content content = webpage.get("content", "") if not content: content = webpage.get("snippet", "") - # 创建RetrievalResult对象 + # Create RetrievalResult object result = RetrievalResult( - embedding=None, # 网页搜索结果没有向量 + embedding=None, # Web search results don't have vectors text=content, reference=webpage.get("link", ""), - score=1.0 - (i * (1 / size)), # 根据位置计算分数 + score=None, # Web search results don't have scores metadata={ "title": webpage.get("title", ""), "date": webpage.get("date", ""), @@ -80,11 +80,11 @@ class WebSearch: ) results.append(result) - log.info(f"网页搜索成功,找到 {len(results)} 个结果") + log.info(f"Web search successful, found {len(results)} results") return results except Exception as e: - log.error(f"网页搜索出错: {str(e)}") + log.error(f"Web search error: {str(e)}") return [] finally: if 'conn' in locals(): @@ -92,13 +92,13 @@ class WebSearch: def search_with_retry(self, query: str, size: int = 4, max_retries: int = 3) -> list[RetrievalResult]: """ - 带重试机制的网页搜索 + Web search with retry mechanism Args: - query (str): 搜索查询 - size (int): 返回结果数量 - max_retries (int): 最大重试次数 + query (str): Search query + size (int): Number of results to return + max_retries (int): Maximum number of retries Returns: - List[RetrievalResult]: 搜索结果列表 + List[RetrievalResult]: List of search results """ for attempt in range(max_retries): try: @@ -106,8 +106,8 @@ class WebSearch: if results: return results except Exception as e: - log.warning(f"网页搜索第 {attempt + 1} 次尝试失败: {str(e)}") + log.warning(f"Web search attempt {attempt + 1} failed: {str(e)}") if attempt < max_retries - 1: - time.sleep(1) # 等待1秒后重试 - log.error(f"网页搜索在 {max_retries} 次尝试后仍然失败") + time.sleep(1) # Wait 1 second before retrying + log.error(f"Web search failed after {max_retries} attempts") return [] diff --git a/main.py b/main.py index 3b6da71..933ee6b 100644 --- a/main.py +++ b/main.py @@ -141,7 +141,7 @@ def load_files( batch_size=batch_size if batch_size is not None else 8, force_rebuild=force_rebuild, ) - return {"message": "成功加载"} + return {"message": "加载完成"} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -385,43 +385,43 @@ def clear_messages(): def serve_file(file_path: str, download: bool = Query(False, description="Whether to download the file")): """ Serve local files for file:// URIs in generated reports. - - This endpoint allows accessing local files that are referenced in the - generated reports. The file_path parameter should be the URL-encoded + + This endpoint allows accessing local files that are referenced in the + generated reports. The file_path parameter should be the URL-encoded path to the file. - + Args: file_path (str): The URL-encoded file path - + Returns: HTMLResponse or PlainTextResponse: The file content displayed in browser - + Raises: HTTPException: If the file is not found or access is denied """ import urllib.parse import mimetypes from pathlib import Path - + try: # URL解码文件路径 decoded_path = urllib.parse.unquote(file_path) - + # 转换为Path对象 file_path_obj = Path(decoded_path) - + # 安全检查:确保文件路径是绝对路径 if not file_path_obj.is_absolute(): raise HTTPException(status_code=400, detail="Only absolute file paths are allowed") - + # 安全检查:确保文件存在 if not file_path_obj.exists(): raise HTTPException(status_code=404, detail=f"File not found: {decoded_path}") - + # 安全检查:确保是文件而不是目录 if not file_path_obj.is_file(): raise HTTPException(status_code=400, detail=f"Path is not a file: {decoded_path}") - + # 如果请求下载,直接返回文件 if download: return FileResponse( @@ -429,24 +429,24 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe filename=file_path_obj.name, media_type='application/octet-stream' ) - + # 尝试读取文件内容 try: - with open(file_path_obj, 'r', encoding='utf-8') as f: + with open(file_path_obj, encoding='utf-8') as f: content = f.read() except UnicodeDecodeError: # 如果UTF-8解码失败,尝试其他编码 try: - with open(file_path_obj, 'r', encoding='latin-1') as f: + with open(file_path_obj, encoding='latin-1') as f: content = f.read() except Exception as e: raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}") except Exception as e: raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}") - + # 获取文件类型 mime_type, _ = mimetypes.guess_type(str(file_path_obj)) - + # 根据文件类型决定如何显示 if mime_type and mime_type.startswith('text/'): # 文本文件直接在浏览器中显示 @@ -454,97 +454,97 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe else: # 其他文件类型创建HTML页面显示 html_content = f""" - - - - - - 文件查看器 - {file_path_obj.name} - - - -
-
-
-

{file_path_obj.name}

-
- 路径: {decoded_path}
- 大小: {file_path_obj.stat().st_size:,} 字节 -
-
- 下载文件 -
-
-""" - + + + + + + 文件查看器 - {file_path_obj.name} + + + +
+
+
+

{file_path_obj.name}

+
+ 路径: {decoded_path}
+ 大小: {file_path_obj.stat().st_size:,} 字节 +
+
+ 下载文件 +
+
+ """ + # 检查是否为二进制文件 try: # 尝试读取前1024字节来检测是否为二进制文件 @@ -552,39 +552,39 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe sample = f.read(1024) # 检查是否包含null字节,这是二进制文件的特征 if b'\x00' in sample: - html_content += f""" -
- 注意:这是一个二进制文件,无法在浏览器中直接显示内容。 -
-""" + html_content += """ +
+ 注意:这是一个二进制文件,无法在浏览器中直接显示内容。 +
+ """ else: # 尝试以文本形式显示 try: text_content = sample.decode('utf-8') html_content += f""" -
{text_content}
-""" +
{text_content}
+ """ except UnicodeDecodeError: - html_content += f""" -
- 注意:此文件包含非文本内容,无法在浏览器中直接显示。 -
-""" + html_content += """ +
+ 注意:此文件包含非文本内容,无法在浏览器中直接显示。 +
+ """ except Exception: - html_content += f""" -
- 注意:无法读取文件内容。 -
-""" - + html_content += """ +
+ 注意:无法读取文件内容。 +
+ """ + html_content += """ -
-
- - -""" +
+
+ + + """ return HTMLResponse(content=html_content) - + except HTTPException: # 重新抛出HTTP异常 raise