diff --git a/deepsearcher/agent/deep_search.py b/deepsearcher/agent/deep_search.py
index b11c81c..7ef8056 100644
--- a/deepsearcher/agent/deep_search.py
+++ b/deepsearcher/agent/deep_search.py
@@ -20,7 +20,8 @@ Your selected collection name list is:
SUB_QUERY_PROMPT = """
-To answer this question more comprehensively, please break down the original question into few numbers of sub-questions (more if necessary).
+To answer this question more comprehensively, please break down the original question into a few sub-questions
+(the fewer the better, but more if necessary to ensure full coverage when answering the original question).
If this is a very simple question and no decomposition is necessary, then keep the only one original question.
Make sure each sub-question is clear, concise and atomic.
Return as list of str in python style and json convertable.
@@ -45,10 +46,10 @@ Provide your response in a python code list of str format:
RERANK_PROMPT = """
-Based on the query questions and the retrieved chunks, determine whether each chunk is helpful in answering any of the query questions.
-For each chunk, you must return "YES" or "NO" without any other information.
+Based on the query and the retrieved chunks, make a quick judgment of whether each chunk is helpful in answering the query.
+For each chunk, you must return "YES" or "NO" in a Python-style list, without any other information.
-Query Questions: {query}
+Query: {query}
Retrieved Chunks:
{retrieved_chunks}
@@ -224,7 +225,10 @@ class DeepSearch(BaseAgent):
# Format all chunks for batch processing
formatted_chunks = ""
for i, retrieved_result in enumerate(retrieved_results):
- formatted_chunks += f"\n{retrieved_result.text}\n\n"
+ formatted_chunks += f'''
+ \n{retrieved_result.text}\n\n
+ \n{retrieved_result.reference}\n
+ '''
# Batch process all chunks with a single LLM call
content = self.llm.chat(
@@ -321,24 +325,29 @@ class DeepSearch(BaseAgent):
log.color_print("No sub queries were generated by the LLM. Exiting.")
return [], {}
else:
- log.color_print(
- f" Break down the original query into new sub queries: {sub_queries} ")
+ log.color_print(f" Break down the original query into new sub queries: {sub_queries} ")
all_sub_queries.extend(sub_queries)
sub_gap_queries = sub_queries
for iter in range(max_iter):
log.color_print(f">> Iteration: {iter + 1}\n")
search_res_from_vectordb = []
- search_res_from_internet = [] # TODO
+ # search_res_from_internet = [] # TODO
# Execute all search tasks sequentially
for query in sub_gap_queries:
result = self._search_chunks_from_vectordb(query)
search_res_from_vectordb.extend(result)
-
- search_res_from_vectordb = deduplicate(search_res_from_vectordb)
+ undedup_len = len(search_res_from_vectordb)
+ search_res_from_vectordb = deduplicate(search_res_from_vectordb)
+ deduped_len = len(search_res_from_vectordb)
+ if undedup_len - deduped_len != 0:
+ log.color_print(
+ f" Removed {undedup_len - deduped_len} duplicates "
+ )
# search_res_from_internet = deduplicate_results(search_res_from_internet)
- all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
+ # all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
+ all_search_res.extend(search_res_from_vectordb)
if iter == max_iter - 1:
log.color_print(" Exceeded maximum iterations. Exiting. ")
break
diff --git a/deepsearcher/loader/splitter.py b/deepsearcher/loader/splitter.py
index ebdf70a..cdde9e2 100644
--- a/deepsearcher/loader/splitter.py
+++ b/deepsearcher/loader/splitter.py
@@ -1,7 +1,6 @@
## Sentence Window splitting strategy, ref:
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb
-from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -26,7 +25,7 @@ class Chunk:
text: str,
reference: str,
metadata: dict = None,
- embedding: List[float] = None,
+ embedding: list[float] = None,
):
"""
Initialize a Chunk object.
@@ -44,8 +43,8 @@ class Chunk:
def _sentence_window_split(
- split_docs: List[Document], original_document: Document, offset: int = 200
-) -> List[Chunk]:
+ split_docs: list[Document], original_document: Document, offset: int = 200
+) -> list[Chunk]:
"""
Create chunks with context windows from split documents.
@@ -78,8 +77,8 @@ def _sentence_window_split(
def split_docs_to_chunks(
- documents: List[Document], chunk_size: int = 1500, chunk_overlap=100
-) -> List[Chunk]:
+ documents: list[Document], chunk_size: int = 1500, chunk_overlap=100
+) -> list[Chunk]:
"""
Split documents into chunks with context windows.