
feat: add LaTeX theme fonts

refactor: streamline the parameter passing chain from the frontend to the backend
main
tanxing 7 hours ago
commit fc7ac11c5e
  1. deepsearcher/agent/deep_search.py (26)
  2. deepsearcher/config.yaml (12)
  3. deepsearcher/online_query.py (21)
  4. deepsearcher/templates/static/themes/Readme.md (4)
  5. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-105-Heavy.ttf (BIN)
  6. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-115-Black.ttf (BIN)
  7. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-35-Thin.ttf (BIN)
  8. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-45-Light.ttf (BIN)
  9. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-55-Regular.ttf (BIN)
  10. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-65-Medium.ttf (BIN)
  11. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-75-SemiBold.ttf (BIN)
  12. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-85-Bold.ttf (BIN)
  13. deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-95-ExtraBold.ttf (BIN)
  14. deepsearcher/templates/static/themes/latex/STSongti-SC-Black.ttf (BIN)
  15. deepsearcher/templates/static/themes/latex/STSongti-SC-Regular.ttf (BIN)
  16. deepsearcher/web_search.py (50)
  17. main.py (38)

26
deepsearcher/agent/deep_search.py

@@ -20,7 +20,7 @@ COLLECTION_ROUTE_PROMPT = """
"Collection info": {collection_info}
Use the same language as the question.
The format you need to return is a python list of str without any additional content:
The format you need to return is a python list[str] without any additional content:
"""
@@ -29,7 +29,12 @@ SUB_QUERY_PROMPT = """
Think about how to split the question from both a top-down and a bottom-up direction.
The number of sub-questions must be neither too large nor too small, but should ensure the answer is comprehensive; decide the number of sub-questions based on the complexity of the question.
If the original question is itself very simple and there is no need to split it, keep and output the original question as-is.
Make sure every sub-question is specific, clear, and indivisible (atomic), i.e. it cannot contain further sub-questions; sub-questions must not contain imperative phrases such as "please answer", "please summarize", or "please analyze".
Make sure every sub-question is specific, clear, and indivisible (atomic), i.e. it cannot contain further sub-questions, but it also must not be too narrow or one-sided, otherwise the comprehensiveness of the answer will suffer.
For example, in machine learning, if the question is "What is XGBoost?", then for sub-questions:
"What is the difference between XGBoost and other Boosting algorithms?" (correct)
"What is the difference between XGBoost and Gradient Boost?" (wrong)
"What is the difference between XGBoost and other Boosting algorithms (e.g. AdaBoost, Gradient Boost)?" (wrong)
Also, sub-questions must not contain imperative phrases such as "please answer", "please summarize", or "please analyze".
You must finally return a list of strings.
Original question: {original_query}
@@ -43,14 +48,14 @@ SUB_QUERY_PROMPT = """
"What is machine learning?",
"What is machine learning used for?",
"Common machine learning algorithms",
"The historical evolution of machine learning",
"What is the difference between machine learning and deep learning?"
"The evolution of machine learning algorithms",
"What is the difference between machine learning and the currently popular deep learning?"
]
</EXAMPLE>
Use the same language as the original question.
What you need to return is a python list of str without any additional content:
What you need to return is a python list[str] without any additional content:
"""
@@ -66,7 +71,7 @@ RERANK_PROMPT = """
For example, if 4 chunks are given (the number of actually retrieved document chunks may differ), return 4 "True" or "False" values. Note that this is only an example and does not represent an actual judgment: ["True", "False", "True", "True"]
Use the same language as the question.
What you need to return is a python list of str(bool) without any additional content:
What you need to return is a python list[str(bool)] without any additional content:
"""
@@ -85,7 +90,7 @@ REFLECT_PROMPT = """
{chunks}
Use the same language as the original question.
What you need to return is a python list of str without any additional content:
What you need to return is a python list[str] without any additional content:
"""
@@ -93,6 +98,7 @@ SUMMARY_PROMPT = """
You are a content analysis expert.
Combining the questions already raised and the retrieved information, generate an answer centered on the original question that is detailed, accurate, clearly structured (multi-level headings starting from level one), and as long as possible.
If the retrieved information is not sufficient to answer the question, you should extend and supplement it with your own knowledge.
If the retrieved document chunks contain other useful information that was not raised in the earlier questions, you should also add it to the final answer.
Note: do not answer the questions one by one; instead, combine all the questions and information to generate one complete answer.
At the same time, you should generate in-text citations "[^index]" (markdown footnote citations) based on the provided information.
Citation indices [^index] for content from <chunk> and <reference> start at index=1; the source must match the "id" in the preceding <reference>.
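All of these prompts ask the model to reply with a bare Python list literal. Below is a minimal sketch of how such a reply could be parsed defensively; the helper is illustrative and not part of this commit:

```python
import ast

def parse_list_reply(reply: str) -> list[str]:
    """Parse an LLM reply that is expected to be a bare Python list literal."""
    start, end = reply.find("["), reply.rfind("]")
    if start == -1 or end == -1:
        return []  # no list literal found in the reply
    try:
        value = ast.literal_eval(reply[start:end + 1])
    except (ValueError, SyntaxError):
        return []
    # Coerce elements to str so list[str(bool)] replies ("True"/"False") also work.
    return [str(item) for item in value] if isinstance(value, list) else []

# parse_list_reply('["True", "False", "True", "True"]') -> ['True', 'False', 'True', 'True']
```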
@@ -248,9 +254,9 @@ class DeepSearch(BaseAgent):
send_info(f"Local vector search found {len(vector_results)} results")
# Web search
self.web_search = WebSearch() if kwargs.get('web_search', False) else None
self.web_search = True if kwargs.get('web_search', False) else None
if self.web_search:
web_results = self.web_search.search_with_retry(query, size=2)
web_results = WebSearch().search_with_retry(query, size=4)
if web_results:
send_info(f"Web search found {len(web_results)} results")
else:
@@ -473,7 +479,7 @@ class DeepSearch(BaseAgent):
absolute_path = str(Path(reference).resolve())
encoded_path = urllib.parse.quote(absolute_path, safe='')
# Use a relative path so this works under different server configurations
formated_refs.append(f"[^{i + 1}]: [/file/{encoded_path}](/file/{encoded_path})\n")
formated_refs.append(f"[^{i + 1}]: [{absolute_path}](/file/{encoded_path})\n")
except Exception as _:
formated_refs.append(f"[^{i + 1}]: {reference}\n")
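The footnote change above keeps the URL-encoded /file/ route as the link target but now shows the absolute path as the link text. A standalone sketch of the same formatting, outside the class and for illustration only:

```python
import urllib.parse
from pathlib import Path

def format_reference(index: int, reference: str) -> str:
    """Render one footnote line the way the updated code does."""
    absolute_path = str(Path(reference).resolve())
    encoded_path = urllib.parse.quote(absolute_path, safe='')
    return f"[^{index + 1}]: [{absolute_path}](/file/{encoded_path})\n"

# format_reference(0, "docs/intro.md") might yield something like
# "[^1]: [/home/user/docs/intro.md](/file/%2Fhome%2Fuser%2Fdocs%2Fintro.md)\n"
```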

12
deepsearcher/config.yaml

@@ -2,12 +2,12 @@ provide_settings:
llm:
provider: "OpenAILLM"
config:
# model: "Qwen/Qwen3-32B"
# api_key: "sk-fpzwvagjkhwysjsozfybvtjzongatcwqdihdxzuijnfdrjzt"
#base_url: "https://api.siliconflow.cn/v1"
model: qwen3-32b
api_key: sk-14f39f0c530d4aa0b5588454bff859d6
base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
model: "Qwen/Qwen3-32B"
api_key: "sk-fpzwvagjkhwysjsozfybvtjzongatcwqdihdxzuijnfdrjzt"
base_url: "https://api.siliconflow.cn/v1"
# model: qwen3-32b
# api_key: sk-14f39f0c530d4aa0b5588454bff859d6
# base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
embedding:
provider: "OpenAIEmbedding"

21
deepsearcher/online_query.py

@@ -23,24 +23,3 @@ def query(original_query: str, **kwargs) -> tuple[str, list[RetrievalResult]]:
max_iter = kwargs.get("max_iter", 3)
web_search = kwargs.get("web_search", False)
return default_searcher.query(original_query, max_iter=max_iter, web_search=web_search)
def retrieve(original_query: str, max_iter: int | None = None) -> tuple[list[RetrievalResult], list[str]]:
"""
Retrieve relevant information from the knowledge base without generating an answer.
This function uses the default searcher to retrieve information from the knowledge base
that is relevant to the query.
Args:
original_query: The question or query to search for.
max_iter: Maximum number of iterations for the search process.
Returns:
A tuple containing:
- A list of retrieval results
- A list of strings representing consumed tokens
"""
default_searcher = configuration.default_searcher
retrieved_results, metadata = default_searcher.retrieve(original_query, max_iter=max_iter)
return retrieved_results
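With the standalone retrieve() helper removed, query() is the single entry point, and max_iter / web_search are forwarded as keyword arguments. A hedged usage sketch (the question and values are illustrative):

```python
from deepsearcher.online_query import query

# Both kwargs are forwarded to default_searcher.query as shown above.
answer, retrieved_results = query(
    "What is XGBoost?",
    max_iter=3,
    web_search=True,
)
print(answer)
print(f"{len(retrieved_results)} supporting results")
```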

4
deepsearcher/templates/static/themes/Readme.md

@@ -1,4 +0,0 @@
The built-in CSS will be replaced after update / reinstall, DO NOT MODIFY THEM.
Refer https://support.typora.io/Add-Custom-CSS/ when you want to modify those CSS.
Refer https://support.typora.io/About-Themes/ if you want to create / install new themes.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-105-Heavy.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-115-Black.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-35-Thin.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-45-Light.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-55-Regular.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-65-Medium.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-75-SemiBold.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-85-Bold.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/AlibabaPuHuiTi-2-95-ExtraBold.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/STSongti-SC-Black.ttf

Binary file not shown.

BIN
deepsearcher/templates/static/themes/latex/STSongti-SC-Regular.ttf

Binary file not shown.

50
deepsearcher/web_search.py

@@ -6,12 +6,12 @@ from deepsearcher.utils import log
class WebSearch:
"""网页搜索类,用于调用metaso.cn API进行网页搜索"""
"""Web search class for calling metaso.cn API to perform web searches"""
def __init__(self, api_key: str = "mk-CCEA085159C048597435780530A55403"):
"""
初始化网页搜索
Initialize web search
Args:
api_key (str): metaso.cn API密钥
api_key (str): metaso.cn API key
"""
self.api_key = api_key
self.base_url = "metaso.cn"
@@ -19,15 +19,15 @@
def search(self, query: str, size: int = 4) -> list[RetrievalResult]:
"""
执行网页搜索
Execute web search
Args:
query (str): 搜索查询
size (int): 返回结果数量默认为4
query (str): Search query
size (int): Number of results to return, default is 4
Returns:
List[RetrievalResult]: 搜索结果列表
List[RetrievalResult]: List of search results
"""
try:
# 构建请求数据
# Build request data
payload = json.dumps({
"q": query,
"scope": "webpage",
@@ -43,33 +43,33 @@
'Content-Type': 'application/json'
}
# 发送请求
# Send request
conn = http.client.HTTPSConnection(self.base_url)
conn.request("POST", self.endpoint, payload, headers)
res = conn.getresponse()
data = res.read()
if res.status != 200:
log.error(f"网页搜索请求失败: {res.status} - {data.decode('utf-8')}")
log.error(f"Web search request failed: {res.status} - {data.decode('utf-8')}")
return []
response_data = json.loads(data.decode("utf-8"))
# 解析搜索结果
# Parse search results
results = []
if "webpages" in response_data:
for i, webpage in enumerate(response_data["webpages"]):
# 使用content字段作为主要文本内容
# Use content field as primary text content
content = webpage.get("content", "")
if not content:
content = webpage.get("snippet", "")
# 创建RetrievalResult对象
# Create RetrievalResult object
result = RetrievalResult(
embedding=None, # 网页搜索结果没有向量
embedding=None, # Web search results don't have vectors
text=content,
reference=webpage.get("link", ""),
score=1.0 - (i * (1 / size)), # 根据位置计算分数
score=None, # Web search results don't have scores
metadata={
"title": webpage.get("title", ""),
"date": webpage.get("date", ""),
@@ -80,11 +80,11 @@ class WebSearch:
)
results.append(result)
log.info(f"网页搜索成功,找到 {len(results)} 个结果")
log.info(f"Web search successful, found {len(results)} results")
return results
except Exception as e:
log.error(f"网页搜索出错: {str(e)}")
log.error(f"Web search error: {str(e)}")
return []
finally:
if 'conn' in locals():
@@ -92,13 +92,13 @@
def search_with_retry(self, query: str, size: int = 4, max_retries: int = 3) -> list[RetrievalResult]:
"""
带重试机制的网页搜索
Web search with retry mechanism
Args:
query (str): 搜索查询
size (int): 返回结果数量
max_retries (int): 最大重试次数
query (str): Search query
size (int): Number of results to return
max_retries (int): Maximum number of retries
Returns:
List[RetrievalResult]: 搜索结果列表
List[RetrievalResult]: List of search results
"""
for attempt in range(max_retries):
try:
@@ -106,8 +106,8 @@
if results:
return results
except Exception as e:
log.warning(f"网页搜索第 {attempt + 1} 次尝试失败: {str(e)}")
log.warning(f"Web search attempt {attempt + 1} failed: {str(e)}")
if attempt < max_retries - 1:
time.sleep(1) # 等待1秒后重试
log.error(f"网页搜索在 {max_retries} 次尝试后仍然失败")
time.sleep(1) # Wait 1 second before retrying
log.error(f"Web search failed after {max_retries} attempts")
return []
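A minimal usage sketch of the class above; attribute names on the results follow the RetrievalResult constructor arguments shown in the diff, and the query string is illustrative:

```python
from deepsearcher.web_search import WebSearch

searcher = WebSearch()  # falls back to the default metaso.cn API key from __init__
results = searcher.search_with_retry("vector databases", size=4, max_retries=3)
for r in results:
    # title and date live in metadata, the page URL in reference
    print(r.metadata.get("title", ""), r.reference)
```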

38
main.py

@@ -141,7 +141,7 @@ def load_files(
batch_size=batch_size if batch_size is not None else 8,
force_rebuild=force_rebuild,
)
return {"message": "成功加载"}
return {"message": "加载完成"}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@@ -432,12 +432,12 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe
# Try to read the file content
try:
with open(file_path_obj, 'r', encoding='utf-8') as f:
with open(file_path_obj, encoding='utf-8') as f:
content = f.read()
except UnicodeDecodeError:
# If UTF-8 decoding fails, try another encoding
try:
with open(file_path_obj, 'r', encoding='latin-1') as f:
with open(file_path_obj, encoding='latin-1') as f:
content = f.read()
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}")
@@ -454,9 +454,9 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe
else:
# For other file types, create an HTML page to display them
html_content = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>File Viewer - {file_path_obj.name}</title>
@@ -529,8 +529,8 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe
color: #856404;
}}
</style>
</head>
<body>
</head>
<body>
<div class="container">
<div class="header">
<div>
@@ -543,7 +543,7 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe
<a href="/file/{file_path}?download=true" class="download-btn">Download File</a>
</div>
<div class="content">
"""
"""
# Check whether the file is binary
try:
@@ -552,37 +552,37 @@ def serve_file(file_path: str, download: bool = Query(False, description="Whethe
sample = f.read(1024)
# Check for null bytes, a telltale sign of a binary file
if b'\x00' in sample:
html_content += f"""
html_content += """
<div class="binary-notice">
<strong>Note:</strong> This is a binary file and its content cannot be displayed directly in the browser.
</div>
"""
"""
else:
# Try to display it as text
try:
text_content = sample.decode('utf-8')
html_content += f"""
<pre>{text_content}</pre>
"""
"""
except UnicodeDecodeError:
html_content += f"""
html_content += """
<div class="binary-notice">
<strong>Note:</strong> This file contains non-text content and cannot be displayed directly in the browser.
</div>
"""
"""
except Exception:
html_content += f"""
html_content += """
<div class="binary-notice">
<strong>Note:</strong> The file content could not be read.
</div>
"""
"""
html_content += """
</div>
</div>
</body>
</html>
"""
</body>
</html>
"""
return HTMLResponse(content=html_content)
except HTTPException:
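The binary check above relies on a single heuristic: a NUL byte in the first 1 KiB of the file. The same idea isolated as a small helper, offered as a sketch rather than code from this commit:

```python
from pathlib import Path

def looks_binary(path: Path, sample_size: int = 1024) -> bool:
    """Return True if the sampled bytes contain a NUL byte, the same
    heuristic serve_file uses to decide against rendering text."""
    with open(path, "rb") as f:
        return b"\x00" in f.read(sample_size)
```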
