Browse Source

新增迭代检索中的chunk去重

main
tanxing 6 days ago
parent
commit
ca9b68ef91
  1. 31
      deepsearcher/agent/deep_search.py
  2. 11
      deepsearcher/loader/splitter.py

31
deepsearcher/agent/deep_search.py

@ -20,7 +20,8 @@ Your selected collection name list is:
SUB_QUERY_PROMPT = """
To answer this question more comprehensively, please break down the original question into few numbers of sub-questions (more if necessary).
To answer this question more comprehensively, please break down the original question into a few sub-questions
(the fewer the better, but more if necessary to ensure coverage of answering the original question).
If this is a very simple question and no decomposition is necessary, then keep the only one original question.
Make sure each sub-question is clear, concise and atomic.
Return as list of str in python style and json convertable.
@ -45,10 +46,10 @@ Provide your response in a python code list of str format:
RERANK_PROMPT = """
Based on the query questions and the retrieved chunks, determine whether each chunk is helpful in answering any of the query questions.
For each chunk, you must return "YES" or "NO" without any other information.
Based on the query and the retrieved chunks, quickly judge whether each chunk is helpful in answering the query.
For each chunk, you must return "YES" or "NO" in a Python-style list, without any other information.
Query Questions: {query}
Query: {query}
Retrieved Chunks:
{retrieved_chunks}
@ -224,7 +225,10 @@ class DeepSearch(BaseAgent):
# Format all chunks for batch processing
formatted_chunks = ""
for i, retrieved_result in enumerate(retrieved_results):
formatted_chunks += f"<chunk_{i + 1}>\n{retrieved_result.text}\n</chunk_{i + 1}>\n"
formatted_chunks += f'''
<chunk_{i + 1}>\n{retrieved_result.text}\n</chunk_{i + 1}>\n
<reference_{i + 1}>\n{retrieved_result.reference}\n</reference_{i + 1}>
'''
# Batch process all chunks with a single LLM call
content = self.llm.chat(
@ -321,24 +325,29 @@ class DeepSearch(BaseAgent):
log.color_print("No sub queries were generated by the LLM. Exiting.")
return [], {}
else:
log.color_print(
f"</think> Break down the original query into new sub queries: {sub_queries} ")
log.color_print(f"</think> Break down the original query into new sub queries: {sub_queries} ")
all_sub_queries.extend(sub_queries)
sub_gap_queries = sub_queries
for iter in range(max_iter):
log.color_print(f">> Iteration: {iter + 1}\n")
search_res_from_vectordb = []
search_res_from_internet = [] # TODO
# search_res_from_internet = [] # TODO
# Execute all search tasks sequentially
for query in sub_gap_queries:
result = self._search_chunks_from_vectordb(query)
search_res_from_vectordb.extend(result)
search_res_from_vectordb = deduplicate(search_res_from_vectordb)
undedup_len = len(search_res_from_vectordb)
search_res_from_vectordb = deduplicate(search_res_from_vectordb)
deduped_len = len(search_res_from_vectordb)
if undedup_len - deduped_len != 0:
log.color_print(
f"<search> Removed {undedup_len - deduped_len} duplicates </search>"
)
# search_res_from_internet = deduplicate_results(search_res_from_internet)
all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
# all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
all_search_res.extend(search_res_from_vectordb)
if iter == max_iter - 1:
log.color_print("</think> Exceeded maximum iterations. Exiting. ")
break

11
deepsearcher/loader/splitter.py

@ -1,7 +1,6 @@
## Sentence Window splitting strategy, ref:
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
@ -26,7 +25,7 @@ class Chunk:
text: str,
reference: str,
metadata: dict = None,
embedding: List[float] = None,
embedding: list[float] = None,
):
"""
Initialize a Chunk object.
@ -44,8 +43,8 @@ class Chunk:
def _sentence_window_split(
split_docs: List[Document], original_document: Document, offset: int = 200
) -> List[Chunk]:
split_docs: list[Document], original_document: Document, offset: int = 200
) -> list[Chunk]:
"""
Create chunks with context windows from split documents.
@ -78,8 +77,8 @@ def _sentence_window_split(
def split_docs_to_chunks(
documents: List[Document], chunk_size: int = 1500, chunk_overlap=100
) -> List[Chunk]:
documents: list[Document], chunk_size: int = 1500, chunk_overlap=100
) -> list[Chunk]:
"""
Split documents into chunks with context windows.

Loading…
Cancel
Save