@@ -2,7 +2,7 @@ from deepsearcher.agent.base import BaseAgent, describe_class
 from deepsearcher.embedding.base import BaseEmbedding
 from deepsearcher.llm.base import BaseLLM
 from deepsearcher.utils import log
-from deepsearcher.utils.message_stream import send_search, send_think, send_answer
+from deepsearcher.utils.message_stream import send_info, send_answer
 from deepsearcher.vector_db import RetrievalResult
 from deepsearcher.vector_db.base import BaseVectorDB, deduplicate
 from collections import defaultdict
|
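Note: this PR consolidates the `send_search`/`send_think` channels into a single `send_info` channel. The real `deepsearcher.utils.message_stream` module is not part of this diff; the sketch below is a minimal stand-in to illustrate what the import swap implies, and every name in it other than `send_info`/`send_answer` is hypothetical.

```python
# Hypothetical stand-in for deepsearcher.utils.message_stream -- NOT the
# real module; it only illustrates the helpers' assumed contract.
from dataclasses import dataclass


@dataclass
class StreamMessage:
    kind: str  # "info" for progress updates, "answer" for the final answer
    body: str


def _emit(msg: StreamMessage) -> None:
    # Stand-in transport; the real module presumably streams to the client.
    print(f"[{msg.kind}] {msg.body}")


def send_info(body: str) -> None:
    # Single user-facing progress channel replacing send_search/send_think.
    _emit(StreamMessage("info", body))


def send_answer(body: str) -> None:
    _emit(StreamMessage("answer", body))
```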
|
@ -22,7 +22,8 @@ COLLECTION_ROUTE_PROMPT = """ |
|
|
|
|
|
|
|
|
|
|
|
SUB_QUERY_PROMPT = """ |
|
|
|
为了能够全面的回答这个问题,请你尝试把原本的问题拆分或扩展为多个子问题 |
|
|
|
为了能够全面的回答这个问题,请你尝试把原本的问题拆分或扩展为几个子问题 |
|
|
|
不可以太多,但是也不可以太少,请根据问题复杂程度来决定子问题的数量 |
|
|
|
如果原问题本身非常简单,没有必要进行拆分,则保留输出原问题本身 |
|
|
|
需要保证每个子问题都具体、清晰、不可分(原子性),最终返回一个字符串列表 |
|
|
|
|
|
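In English, the revised SUB_QUERY_PROMPT asks the model to split or expand the original question into a few sub-questions, scaling their number with the question's complexity, keeping each sub-question specific, clear, and atomic, and returning the original question unchanged if it is already simple. The expected output is a plain list of strings; an invented illustration:

```python
# Illustrative only -- these sub-questions are invented, not repo output.
sub_queries = [
    "Milvus 支持哪些索引类型?",
    "Milvus 的分布式架构是如何设计的?",
    "Milvus 与传统数据库在存储模型上有何区别?",
]
```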
|
@ -84,9 +85,8 @@ REFLECT_PROMPT = """ |
|
|
|
SUMMARY_PROMPT = """ |
|
|
|
你是一个内容分析专家,请你根据提供的问题和检索到的信息生成详尽的长文回答。 |
|
|
|
如果检索到的信息不足以回答问题或者必须添加额外信息才能能回答,你应该使用你的知识来进行补充, |
|
|
|
这种情况下,你自己提供的信息需要使用"<unreferenced>your knowledge here</unreferenced>"括起来 |
|
|
|
(添加多个<unreferenced></unreferenced>块时不需要序号,并且<unreferenced></unreferenced>块不应该被单独分到一个段落中,而是放到回答的各处) |
|
|
|
同时,你应该根据提供的信息生成文内引用和文末参考资料列表 |
|
|
|
这种情况下,你自己提供的信息需要使用例如"your knowledge here[^0]"引用,注意,这里的"[^0]"的序号0是固定的,表示你的知识,下文当中有文末引用的例子 |
|
|
|
同时,你应该根据提供的信息生成文内引用和文末参考资料列表,来自文档切片的reference引用从[^1]开始 |
|
|
|
如果多个片段是相同的来源或者一个片段可以回答多个问题,文内引用可以引用多次,但文末只引用一次来源,即文末的引用列表中不能有重复的来源。 |
|
|
|
|
|
|
|
例子: |
|
|
@ -98,6 +98,7 @@ SUMMARY_PROMPT = """ |
|
|
|
|
|
|
|
|
|
|
|
文末引用示例 (需要与前文reference的href一致,不需要对每个chunk分配一个引用,而是每一个referecen共用一个引用): |
|
|
|
[^0]: AI Generated |
|
|
|
[^2]: files/docs/chap_001_003_models.md |
|
|
|
|
|
|
|
</EXAMPLE> |
|
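In English, the revised SUMMARY_PROMPT now reads: act as a content analyst and produce a detailed long-form answer; where the retrieved chunks fall short, supplement from your own knowledge and mark such passages with the fixed citation `[^0]` (replacing the old `<unreferenced>` tags); cite retrieved chunks starting from `[^1]`; reuse in-text citations as needed, but list each source only once at the end. An invented fragment in the new convention:

```
Milvus 支持 HNSW 与 IVF 等多种索引类型[^1]。据模型自身知识,它还支持 GPU 加速的索引构建[^0]。

[^0]: AI Generated
[^1]: files/docs/chap_001_003_models.md
```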
|
@@ -180,7 +181,7 @@ class DeepSearch(BaseAgent):
         if len(collection_infos) == 1:
             the_only_collection = collection_infos[0].collection_name
             log.color_print(
-                f"<think> Perform search [{query}] on the vector DB collection: {the_only_collection} </think>\n"
+                f"Perform search [{query}] on the vector DB collection: {the_only_collection}\n"
             )
             return [the_only_collection]
         vector_db_search_prompt = COLLECTION_ROUTE_PROMPT.format(
|
|
@@ -207,7 +208,7 @@ class DeepSearch(BaseAgent):
                 selected_collections.append(collection_info.collection_name)
         selected_collections = list(set(selected_collections))
         log.color_print(
-            f"<think> Perform search [{query}] on the vector DB collections: {selected_collections} </think>\n"
+            f"Perform search [{query}] on the vector DB collections: {selected_collections}\n"
         )
         return selected_collections
 
|
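Both routing branches now print the decision without the `<think>` wrapper. One reviewer note on the unchanged `list(set(...))` dedup above: `set` does not preserve insertion order, so the collection order can vary between runs. A stable-order alternative, if that ever matters:

```python
# Order-preserving dedup (first occurrence wins); behaviorally equivalent
# except that it keeps the original collection order.
selected_collections = list(dict.fromkeys(selected_collections))
```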
|
@@ -231,12 +232,12 @@ class DeepSearch(BaseAgent):
         all_retrieved_results = []
         query_vector = self.embedding_model.embed_query(query)
         for collection in selected_collections:
-            send_search(f"Search [{query}] in [{collection}]...")
+            send_info(f"正在 [{collection}] 中搜索 [{query}] ...")
             retrieved_results = self.vector_db.search_data(
                 collection=collection, vector=query_vector, query_text=query
             )
             if not retrieved_results or len(retrieved_results) == 0:
-                send_search(f"No relevant document chunks found in '{collection}'!")
+                send_info(f"'{collection}' 中没有找到相关文档!")
                 continue
 
             # Format all chunks for batch processing
|
|
@@ -287,9 +288,9 @@ class DeepSearch(BaseAgent):
                     references.add(retrieved_result.reference)
 
             if accepted_chunk_num > 0:
-                send_search(f"Accept {accepted_chunk_num} document chunk(s) from references: {list(references)}")
+                send_info(f"采纳 {accepted_chunk_num} 个文档片段,来源:{list(references)}")
             else:
-                send_search(f"No document chunk accepted from '{collection}'!")
+                send_info(f"没有采纳任何 '{collection}' 中找到的文档片段!")
         return all_retrieved_results
 
     def _generate_more_sub_queries(
|
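For readers outside the repo: `RetrievalResult` (imported at the top of the file) must expose at least a `reference` attribute, since the hunk above calls `retrieved_result.reference`. A rough sketch of its assumed shape; only `reference` is confirmed by this diff, the other fields are guesses:

```python
from dataclasses import dataclass


@dataclass
class RetrievalResult:
    text: str           # chunk content (assumed field name)
    reference: str      # source document path; feeds the [^n] citation list
    score: float = 0.0  # similarity score (assumed)
```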
|
@@ -327,7 +328,6 @@ class DeepSearch(BaseAgent):
         max_iter = kwargs.get('max_iter', self.max_iter)
 
         ### SUB QUERIES ###
-        send_think(f"<query> {original_query} </query>")
         all_search_results = []
         all_sub_queries = []
 
|
|
@@ -336,11 +336,11 @@ class DeepSearch(BaseAgent):
             log.color_print("No sub queries were generated by the LLM. Exiting.")
             return [], {}
         else:
-            send_think(f"Break down the original query into new sub queries: {sub_queries}")
+            send_info(f"原问题被拆分为这些子问题: {sub_queries}")
             all_sub_queries.extend(sub_queries)
 
         for it in range(max_iter):
-            send_think(f">> Iteration: {it + 1}")
+            send_info(f"第 {it + 1} 轮搜索:")
 
             # Execute all search tasks sequentially
             for query in sub_queries:
|
|
@@ -350,25 +350,25 @@ class DeepSearch(BaseAgent):
             all_search_results = deduplicate(all_search_results)
             deduped_len = len(all_search_results)
             if undeduped_len - deduped_len != 0:
-                send_search(f"Remove {undeduped_len - deduped_len} duplicates")
+                send_info(f"移除 {undeduped_len - deduped_len} 个重复文档片段")
             # search_res_from_internet = deduplicate_results(search_res_from_internet)
             # all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
 
             ### REFLECTION & GET MORE SUB QUERIES ###
             # Only generate more queries if we haven't reached the maximum iterations
             if it + 1 < max_iter:
-                send_think("Reflecting on the search results...")
+                send_info("正在根据文档片段思考 ...")
                 sub_queries = self._generate_more_sub_queries(
                     original_query, all_sub_queries, all_search_results
                 )
                 if not sub_queries or len(sub_queries) == 0:
-                    send_think("No new search queries were generated. Exiting.")
+                    send_info("没能生成更多的子问题,正在退出 ....")
                     break
                 else:
-                    send_think(f"New search queries for next iteration: {sub_queries}")
+                    send_info(f"下一轮搜索的子问题: {sub_queries}")
                     all_sub_queries.extend(sub_queries)
             else:
-                send_think("Reached maximum iterations. Exiting.")
+                send_info("已达到最大搜索轮数,正在退出 ...")
                 break
 
         all_search_results = deduplicate(all_search_results)
|
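`deduplicate` comes from `deepsearcher.vector_db.base` and is not shown in this diff. A plausible minimal version, assuming two chunks count as duplicates when their text matches (the real identity key may differ):

```python
# Hypothetical sketch -- the real deduplicate() may key on IDs or hashes.
def deduplicate(results):
    seen, unique = set(), []
    for r in results:
        key = r.text  # assumed identity key
        if key not in seen:
            seen.add(key)
            unique.append(r)
    return unique
```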
|
@@ -392,10 +392,10 @@ class DeepSearch(BaseAgent):
         """
         all_retrieved_results, all_sub_queries = self.retrieve(original_query, **kwargs)
         if not all_retrieved_results or len(all_retrieved_results) == 0:
-            send_think(f"No relevant information found for query '{original_query}'.")
+            send_info(f"'{original_query}'没能找到更多信息!")
            return "", []
         chunks = self._format_chunks(all_retrieved_results)
-        send_think(f"Summarize answer from all {len(all_retrieved_results)} retrieved chunks...")
+        send_info(f"正在总结 {len(all_retrieved_results)} 个查找到的文档片段")
         summary_prompt = SUMMARY_PROMPT.format(
             original_query=original_query,
             all_sub_queries=all_sub_queries,
|
|
@@ -403,7 +403,6 @@ class DeepSearch(BaseAgent):
         )
         response = self.llm.chat([{"role": "user", "content": summary_prompt}])
+        final_answer = self.llm.remove_think(response)
+        # 直接发送最终答案,不发送占位符
+        send_answer(final_answer)
         return self.llm.remove_think(response), all_retrieved_results
|
|
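The added comment says: send the final answer directly instead of a placeholder. Note that the unchanged `return` still recomputes `self.llm.remove_think(response)`; returning `final_answer` would avoid the duplicate call. End to end, the agent decomposes the query, searches each routed collection per iteration, reflects to generate follow-up sub-queries, then summarizes with footnote citations and streams the result via `send_answer`. A hypothetical invocation; the constructor keywords below are assumptions inferred from the attributes used in the diff (`self.llm`, `self.embedding_model`, `self.vector_db`, `self.max_iter`), and `query()` is assumed to be the method shown in the last two hunks:

```python
# Hypothetical usage; my_llm, my_embedder, and my_vector_db stand for
# configured BaseLLM / BaseEmbedding / BaseVectorDB instances.
agent = DeepSearch(
    llm=my_llm,
    embedding_model=my_embedder,
    vector_db=my_vector_db,
    max_iter=3,  # maximum reflection/search rounds
)
final_answer, retrieved = agent.query("Milvus 与其它向量数据库相比有什么优势?")
print(final_answer)
```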
|
|
|
|
|