Browse Source

新增迭代检索中的chunk去重

main
tanxing 6 days ago
parent
commit
ca9b68ef91
  1. 31
      deepsearcher/agent/deep_search.py
  2. 11
      deepsearcher/loader/splitter.py

31
deepsearcher/agent/deep_search.py

@ -20,7 +20,8 @@ Your selected collection name list is:
SUB_QUERY_PROMPT = """
To answer this question more comprehensively, please break down the original question into few numbers of sub-questions (more if necessary).
To answer this question more comprehensively, please break down the original question into a few sub-questions
(the fewer the better, but more if necessary to ensure coverage of answering the original question).
If this is a very simple question and no decomposition is necessary, then keep the only one original question.
Make sure each sub-question is clear, concise and atomic.
Return as list of str in python style and json convertable.
@ -45,10 +46,10 @@ Provide your response in a python code list of str format:
RERANK_PROMPT = """
Based on the query questions and the retrieved chunks, determine whether each chunk is helpful in answering any of the query questions.
For each chunk, you must return "YES" or "NO" without any other information.
Based on the query and the retrieved chunks, quickly judge whether each chunk is helpful in answering the query.
For each chunk, you must return "YES" or "NO" in a Python-style list, without any other information.
Query Questions: {query}
Query: {query}
Retrieved Chunks:
{retrieved_chunks}
@ -224,7 +225,10 @@ class DeepSearch(BaseAgent):
# Format all chunks for batch processing
formatted_chunks = ""
for i, retrieved_result in enumerate(retrieved_results):
formatted_chunks += f"<chunk_{i + 1}>\n{retrieved_result.text}\n</chunk_{i + 1}>\n"
formatted_chunks += f'''
<chunk_{i + 1}>\n{retrieved_result.text}\n</chunk_{i + 1}>\n
<reference_{i + 1}>\n{retrieved_result.reference}\n</reference_{i + 1}>
'''
# Batch process all chunks with a single LLM call
content = self.llm.chat(
@ -321,24 +325,29 @@ class DeepSearch(BaseAgent):
log.color_print("No sub queries were generated by the LLM. Exiting.")
return [], {}
else:
log.color_print(
f"</think> Break down the original query into new sub queries: {sub_queries} ")
log.color_print(f"</think> Break down the original query into new sub queries: {sub_queries} ")
all_sub_queries.extend(sub_queries)
sub_gap_queries = sub_queries
for iter in range(max_iter):
log.color_print(f">> Iteration: {iter + 1}\n")
search_res_from_vectordb = []
search_res_from_internet = [] # TODO
# search_res_from_internet = [] # TODO
# Execute all search tasks sequentially
for query in sub_gap_queries:
result = self._search_chunks_from_vectordb(query)
search_res_from_vectordb.extend(result)
search_res_from_vectordb = deduplicate(search_res_from_vectordb)
undedup_len = len(search_res_from_vectordb)
search_res_from_vectordb = deduplicate(search_res_from_vectordb)
deduped_len = len(search_res_from_vectordb)
if undedup_len - deduped_len != 0:
log.color_print(
f"<search> Removed {undedup_len - deduped_len} duplicates </search>"
)
# search_res_from_internet = deduplicate_results(search_res_from_internet)
all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
# all_search_res.extend(search_res_from_vectordb + search_res_from_internet)
all_search_res.extend(search_res_from_vectordb)
if iter == max_iter - 1:
log.color_print("</think> Exceeded maximum iterations. Exiting. ")
break

11
deepsearcher/loader/splitter.py

@ -1,7 +1,6 @@
## Sentence Window splitting strategy, ref:
# https://github.com/milvus-io/bootcamp/blob/master/bootcamp/RAG/advanced_rag/sentence_window_with_langchain.ipynb
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
@ -26,7 +25,7 @@ class Chunk:
text: str,
reference: str,
metadata: dict = None,
embedding: List[float] = None,
embedding: list[float] = None,
):
"""
Initialize a Chunk object.
@ -44,8 +43,8 @@ class Chunk:
def _sentence_window_split(
split_docs: List[Document], original_document: Document, offset: int = 200
) -> List[Chunk]:
split_docs: list[Document], original_document: Document, offset: int = 200
) -> list[Chunk]:
"""
Create chunks with context windows from split documents.
@ -78,8 +77,8 @@ def _sentence_window_split(
def split_docs_to_chunks(
documents: List[Document], chunk_size: int = 1500, chunk_overlap=100
) -> List[Chunk]:
documents: list[Document], chunk_size: int = 1500, chunk_overlap=100
) -> list[Chunk]:
"""
Split documents into chunks with context windows.

Loading…
Cancel
Save