From ca9b68ef911ee5e1bb03bf2db7fea9915d0eb9a4 Mon Sep 17 00:00:00 2001
From: tanxing
Date: Mon, 11 Aug 2025 18:07:39 +0800
Subject: [PATCH] Add chunk deduplication in iterative retrieval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deepsearcher/agent/deep_search.py | 31 ++++++++++++++++++++-----------
 deepsearcher/loader/splitter.py   | 11 +++++------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/deepsearcher/agent/deep_search.py b/deepsearcher/agent/deep_search.py
index b11c81c..7ef8056 100644
--- a/deepsearcher/agent/deep_search.py
+++ b/deepsearcher/agent/deep_search.py
@@ -20,7 +20,8 @@ Your selected collection name list is:
 
 
 SUB_QUERY_PROMPT = """
-To answer this question more comprehensively, please break down the original question into few numbers of sub-questions (more if necessary).
+To answer this question more comprehensively, please break down the original question into a few sub-questions
+(the fewer the better, but more if necessary to ensure coverage of the original question).
 If this is a very simple question and no decomposition is necessary, then keep the only one original question.
 Make sure each sub-question is clear, concise and atomic.
 Return as list of str in python style and json convertable.
@@ -45,10 +46,10 @@ Provide your response in a python code list of str format:
 
 
 RERANK_PROMPT = """
-Based on the query questions and the retrieved chunks, determine whether each chunk is helpful in answering any of the query questions.
-For each chunk, you must return "YES" or "NO" without any other information.
+Based on the query and the retrieved chunks, give a quick judgment of whether each chunk is helpful in answering the query.
+For each chunk, you must return "YES" or "NO" in a Python-style list, without any other information.
 
-Query Questions: {query}
+Query: {query}
 
 Retrieved Chunks: {retrieved_chunks}
 
@@ -224,7 +225,10 @@ class DeepSearch(BaseAgent):
         # Format all chunks for batch processing
         formatted_chunks = ""
         for i, retrieved_result in enumerate(retrieved_results):
-            formatted_chunks += f"\n{retrieved_result.text}\n\n"
+            formatted_chunks += f'''
+                \n{retrieved_result.text}\n\n
+                \n{retrieved_result.reference}\n
+            '''
 
         # Batch process all chunks with a single LLM call
         content = self.llm.chat(
@@ -321,24 +325,29 @@ class DeepSearch(BaseAgent):
             log.color_print("No sub queries were generated by the LLM. Exiting.")
Exiting.") return [], {} else: - log.color_print( - f" Break down the original query into new sub queries: {sub_queries} ") + log.color_print(f" Break down the original query into new sub queries: {sub_queries} ") all_sub_queries.extend(sub_queries) sub_gap_queries = sub_queries for iter in range(max_iter): log.color_print(f">> Iteration: {iter + 1}\n") search_res_from_vectordb = [] - search_res_from_internet = [] # TODO + # search_res_from_internet = [] # TODO # Execute all search tasks sequentially for query in sub_gap_queries: result = self._search_chunks_from_vectordb(query) search_res_from_vectordb.extend(result) - - search_res_from_vectordb = deduplicate(search_res_from_vectordb) + undedup_len = len(search_res_from_vectordb) + search_res_from_vectordb = deduplicate(search_res_from_vectordb) + deduped_len = len(search_res_from_vectordb) + if undedup_len - deduped_len != 0: + log.color_print( + f" Removed {undedup_len - deduped_len} duplicates " + ) # search_res_from_internet = deduplicate_results(search_res_from_internet) - all_search_res.extend(search_res_from_vectordb + search_res_from_internet) + # all_search_res.extend(search_res_from_vectordb + search_res_from_internet) + all_search_res.extend(search_res_from_vectordb) if iter == max_iter - 1: log.color_print(" Exceeded maximum iterations. Exiting. ") break diff --git a/deepsearcher/loader/splitter.py b/deepsearcher/loader/splitter.py index ebdf70a..cdde9e2 100644 --- a/deepsearcher/loader/splitter.py +++ b/deepsearcher/loader/splitter.py @@ -1,7 +1,6 @@ ## Sentence Window splitting strategy, ref: # https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb -from typing import List from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter @@ -26,7 +25,7 @@ class Chunk: text: str, reference: str, metadata: dict = None, - embedding: List[float] = None, + embedding: list[float] = None, ): """ Initialize a Chunk object. @@ -44,8 +43,8 @@ class Chunk: def _sentence_window_split( - split_docs: List[Document], original_document: Document, offset: int = 200 -) -> List[Chunk]: + split_docs: list[Document], original_document: Document, offset: int = 200 +) -> list[Chunk]: """ Create chunks with context windows from split documents. @@ -78,8 +77,8 @@ def _sentence_window_split( def split_docs_to_chunks( - documents: List[Document], chunk_size: int = 1500, chunk_overlap=100 -) -> List[Chunk]: + documents: list[Document], chunk_size: int = 1500, chunk_overlap=100 +) -> list[Chunk]: """ Split documents into chunks with context windows.