From ca9b68ef911ee5e1bb03bf2db7fea9915d0eb9a4 Mon Sep 17 00:00:00 2001
From: tanxing
Date: Mon, 11 Aug 2025 18:07:39 +0800
Subject: [PATCH] Add chunk deduplication in iterative retrieval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deepsearcher/agent/deep_search.py | 31 ++++++++++++++++++++-----------
 deepsearcher/loader/splitter.py   | 11 +++++------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/deepsearcher/agent/deep_search.py b/deepsearcher/agent/deep_search.py
index b11c81c..7ef8056 100644
--- a/deepsearcher/agent/deep_search.py
+++ b/deepsearcher/agent/deep_search.py
@@ -20,7 +20,8 @@ Your selected collection name list is:
 
 
 SUB_QUERY_PROMPT = """
-To answer this question more comprehensively, please break down the original question into few numbers of sub-questions (more if necessary).
+To answer this question more comprehensively, please break down the original question into a few sub-questions
+(the fewer the better, but more if necessary to ensure coverage of the original question).
 If this is a very simple question and no decomposition is necessary, then keep the only one original question.
 Make sure each sub-question is clear, concise and atomic.
 Return as list of str in python style and json convertable.
@@ -45,10 +46,10 @@ Provide your response in a python code list of str format:
 
 
 RERANK_PROMPT = """
-Based on the query questions and the retrieved chunks, determine whether each chunk is helpful in answering any of the query questions.
-For each chunk, you must return "YES" or "NO" without any other information.
+Based on the query and the retrieved chunks, give a quick judgment of whether each chunk is helpful in answering the query.
+For each chunk, you must return "YES" or "NO" in a Python-style list, without any other information.
 
-Query Questions: {query}
+Query: {query}
 
 Retrieved Chunks: {retrieved_chunks}
 
@@ -224,7 +225,10 @@ class DeepSearch(BaseAgent):
         # Format all chunks for batch processing
         formatted_chunks = ""
         for i, retrieved_result in enumerate(retrieved_results):
-            formatted_chunks += f"\n{retrieved_result.text}\n\n"
+            formatted_chunks += f'''
+                \n{retrieved_result.text}\n\n
+                \n{retrieved_result.reference}\n
+            '''
 
         # Batch process all chunks with a single LLM call
         content = self.llm.chat(
@@ -321,24 +325,29 @@ class DeepSearch(BaseAgent):
             log.color_print("No sub queries were generated by the LLM. Exiting.")
Exiting.") return [], {} else: - log.color_print( - f" Break down the original query into new sub queries: {sub_queries} ") + log.color_print(f" Break down the original query into new sub queries: {sub_queries} ") all_sub_queries.extend(sub_queries) sub_gap_queries = sub_queries for iter in range(max_iter): log.color_print(f">> Iteration: {iter + 1}\n") search_res_from_vectordb = [] - search_res_from_internet = [] # TODO + # search_res_from_internet = [] # TODO # Execute all search tasks sequentially for query in sub_gap_queries: result = self._search_chunks_from_vectordb(query) search_res_from_vectordb.extend(result) - - search_res_from_vectordb = deduplicate(search_res_from_vectordb) + undedup_len = len(search_res_from_vectordb) + search_res_from_vectordb = deduplicate(search_res_from_vectordb) + deduped_len = len(search_res_from_vectordb) + if undedup_len - deduped_len != 0: + log.color_print( + f" Removed {undedup_len - deduped_len} duplicates " + ) # search_res_from_internet = deduplicate_results(search_res_from_internet) - all_search_res.extend(search_res_from_vectordb + search_res_from_internet) + # all_search_res.extend(search_res_from_vectordb + search_res_from_internet) + all_search_res.extend(search_res_from_vectordb) if iter == max_iter - 1: log.color_print(" Exceeded maximum iterations. Exiting. ") break diff --git a/deepsearcher/loader/splitter.py b/deepsearcher/loader/splitter.py index ebdf70a..cdde9e2 100644 --- a/deepsearcher/loader/splitter.py +++ b/deepsearcher/loader/splitter.py @@ -1,7 +1,6 @@ ## Sentence Window splitting strategy, ref: # https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb -from typing import List from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter @@ -26,7 +25,7 @@ class Chunk: text: str, reference: str, metadata: dict = None, - embedding: List[float] = None, + embedding: list[float] = None, ): """ Initialize a Chunk object. @@ -44,8 +43,8 @@ class Chunk: def _sentence_window_split( - split_docs: List[Document], original_document: Document, offset: int = 200 -) -> List[Chunk]: + split_docs: list[Document], original_document: Document, offset: int = 200 +) -> list[Chunk]: """ Create chunks with context windows from split documents. @@ -78,8 +77,8 @@ def _sentence_window_split( def split_docs_to_chunks( - documents: List[Document], chunk_size: int = 1500, chunk_overlap=100 -) -> List[Chunk]: + documents: list[Document], chunk_size: int = 1500, chunk_overlap=100 +) -> list[Chunk]: """ Split documents into chunks with context windows.