14 changed files with 3 additions and 825 deletions
@@ -1,118 +0,0 @@
import argparse
import logging
import sys
import warnings

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.utils import log

httpx_logger = logging.getLogger("httpx")  # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)


warnings.simplefilter(action="ignore", category=FutureWarning)  # disable warning output


def main():
    """
    Main entry point for the DeepSearcher CLI.

    This function parses command line arguments and executes the appropriate action
    based on the subcommand provided (query or load). It handles the deprecated
    command line format and provides helpful error messages.

    Returns:
        None
    """
    if "--query" in sys.argv or "--load" in sys.argv:
        print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.")
        print("Please use:")
        print("  deepsearcher query <your_query> --max_iter 3")
        print(
            "  deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>"
        )
        sys.exit(1)

    config = Configuration()  # Customize your config here
    init_config(config=config)

    parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.")
    subparsers = parser.add_subparsers(dest="subcommand", title="subcommands")

    ## Arguments of query
    query_parser = subparsers.add_parser("query", help="Query a question or search topic.")
    query_parser.add_argument("query", type=str, default="", help="query question or search topic.")
    query_parser.add_argument(
        "--max_iter",
        type=int,
        default=3,
        help="Max iterations of reflection. Default is 3.",
    )

    ## Arguments of loading
    load_parser = subparsers.add_parser(
        "load", help="Load knowledge from local files or from URLs."
    )
    load_parser.add_argument(
        "load_path",
        type=str,
        nargs="+",  # 1 or more files or urls
        help="Load knowledge from local files or from URLs.",
    )
    load_parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="Batch size for loading knowledge.",
    )
    load_parser.add_argument(
        "--collection_name",
        type=str,
        default=None,
        help="Destination collection name of loaded knowledge.",
    )
    load_parser.add_argument(
        "--collection_desc",
        type=str,
        default=None,
        help="Description of the collection.",
    )
    load_parser.add_argument(
        "--force_new_collection",
        type=bool,
        default=False,
        help="If you want to drop origin collection and create a new collection on every load, set to True",
    )

    args = parser.parse_args()
    if args.subcommand == "query":
        final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter)
        log.color_print("\n==== FINAL ANSWER====\n")
        log.color_print(final_answer)
        log.color_print("\n### References\n")
        for i, ref in enumerate(refs):
            log.color_print(f"{i + 1}. {ref.text[:60]}… {ref.reference}")
    elif args.subcommand == "load":
        urls = [url for url in args.load_path if url.startswith("http")]
        local_files = [file for file in args.load_path if not file.startswith("http")]
        kwargs = {}
        if args.collection_name:
            kwargs["collection_name"] = args.collection_name
        if args.collection_desc:
            kwargs["collection_description"] = args.collection_desc
        if args.force_new_collection:
            kwargs["force_new_collection"] = args.force_new_collection
        if args.batch_size:
            kwargs["batch_size"] = args.batch_size
        if len(urls) > 0:
            load_from_website(urls, **kwargs)
        if len(local_files) > 0:
            load_from_local_files(local_files, **kwargs)
    else:
        print("Please provide a query or a load argument.")


if __name__ == "__main__":
    main()
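For context on what this removed CLI wrapped, here is a minimal programmatic sketch of the same load/query flow using the imports from the deleted file (the file path, collection name, and question are placeholders for illustration; this sketch is not part of the diff):

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query

init_config(config=Configuration())  # same default configuration the CLI used

# Hypothetical local file and collection, mirroring the "load" subcommand.
load_from_local_files(["./docs/report.pdf"], collection_name="my_docs", batch_size=256)

# Mirrors the "query" subcommand: returns the answer, references, and token usage.
final_answer, refs, consumed_tokens = query("What does the report conclude?", max_iter=3)
print(final_answer)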
@@ -1,11 +0,0 @@
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler

__all__ = [
    "FireCrawlCrawler",
    "JinaCrawler",
    "Crawl4AICrawler",
    "DoclingCrawler",
]
@@ -1,55 +0,0 @@
from abc import ABC
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting them
    into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler implementations.
        """
        pass

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from the URL.

        Note:
            Implementations should include the URL reference in the metadata.
            e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
        """
        pass

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from all URLs.
        """
        documents = []
        for url in urls:
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
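To illustrate the contract this base class defined, a hypothetical minimal subclass could look like the sketch below (PlainHTTPCrawler is invented for illustration; only crawl_url needs overriding, while crawl_urls is inherited):

from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class PlainHTTPCrawler(BaseCrawler):
    """Hypothetical crawler that fetches raw HTML and wraps it in a single Document."""

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Per the base-class note, the source URL goes into metadata["reference"].
        return [Document(page_content=response.text, metadata={"reference": url})]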
@@ -1,140 +0,0 @@
import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
    into markdown format for further processing. It supports both single-page crawling
    and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser configuration
        only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata,
            or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()
        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []
            for result in results:
                markdown_content = result.markdown or ""
                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }
                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")
                documents.append(Document(page_content=markdown_content, metadata=metadata))
            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs,
            or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []
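A rough usage sketch of this crawler, assuming crawl4ai and a working browser environment are installed (the URLs and browser options are placeholders; not part of the diff):

from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler

# browser_config is forwarded to crawl4ai's BrowserConfig.from_kwargs().
crawler = Crawl4AICrawler(browser_config={"headless": True})
docs = crawler.crawl_urls(["https://example.com", "https://example.org"])
for doc in docs:
    print(doc.metadata["reference"], doc.metadata["status_code"], len(doc.page_content))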
@@ -1,98 +0,0 @@
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    This crawler leverages Docling's capabilities to convert web pages into structured
    documents and chunk them appropriately for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects, each representing a chunk from the crawled URL.

        Raises:
            IOError: If there is an error processing the URL.
        """
        try:
            # Use Docling to convert the URL to a document
            conversion_result = self.converter.convert(url)
            docling_document = conversion_result.document

            # Chunk the document using hierarchical chunking
            chunks = list(self.chunker.chunk(docling_document))

            documents = []
            for chunk in chunks:
                metadata = {"reference": url, "text": chunk.text}
                documents.append(Document(page_content=chunk.text, metadata=metadata))

            return documents

        except Exception as e:
            log.color_print(f"Error processing URL {url}: {str(e)}")
            raise IOError(f"Failed to process URL {url}: {str(e)}")

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of file extensions supported by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]
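A rough usage sketch, assuming docling and docling-core are installed (the URL is a placeholder; not part of the diff). Each returned Document is one hierarchical chunk of the converted page:

from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler

crawler = DoclingCrawler()
chunks = crawler.crawl_url("https://example.com/article.html")
print(f"{len(chunks)} chunks")
for doc in chunks[:3]:
    print(doc.metadata["reference"], doc.page_content[:80])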
@@ -1,88 +0,0 @@
import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page scraping
    and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        self.app = None

    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:

        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazy init
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # if user just inputs a single url as param
        # scrape single page
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the new Firecrawl API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # else, crawl multiple pages based on users' input params
        # set default values if not provided
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )
        items = crawl_response.model_dump().get("data", [])

        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))

        return documents
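A rough usage sketch, assuming the firecrawl SDK is installed and FIRECRAWL_API_KEY is set (the URL and key are placeholders; not part of the diff). Passing any of the optional parameters switches from single-page scraping to recursive crawling:

import os

from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler

os.environ.setdefault("FIRECRAWL_API_KEY", "fc-...")  # placeholder key

crawler = FireCrawlCrawler()
single_page = crawler.crawl_url("https://example.com")  # scrape_url path
site_pages = crawler.crawl_url("https://example.com", max_depth=2, limit=10)  # crawl_url path
print(len(single_page), len(site_pages))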
@@ -1,62 +0,0 @@
import os
from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class JinaCrawler(BaseCrawler):
    """
    Web crawler using Jina AI's rendering service.

    This crawler uses Jina AI's rendering service to crawl web pages and convert them
    into markdown format for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the JinaCrawler.

        Args:
            **kwargs: Optional keyword arguments.

        Raises:
            ValueError: If the JINA_API_TOKEN environment variable is not set.
        """
        super().__init__(**kwargs)
        self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
        if not self.jina_api_token:
            raise ValueError("Missing JINA_API_TOKEN environment variable")

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL using Jina AI's rendering service.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata.

        Raises:
            HTTPError: If the request to Jina AI's service fails.
        """
        jina_url = f"https://r.jina.ai/{url}"
        headers = {
            "Authorization": f"Bearer {self.jina_api_token}",
            "X-Return-Format": "markdown",
        }

        response = requests.get(jina_url, headers=headers)
        response.raise_for_status()

        markdown_content = response.text
        metadata = {
            "reference": url,
            "status_code": response.status_code,
            "headers": dict(response.headers),
        }

        return [Document(page_content=markdown_content, metadata=metadata)]
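A rough usage sketch, assuming a valid Jina token is available in JINA_API_TOKEN or JINAAI_API_KEY (the token and URL are placeholders; not part of the diff):

import os

from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler

os.environ.setdefault("JINA_API_TOKEN", "jina_...")  # placeholder token

crawler = JinaCrawler()
docs = crawler.crawl_url("https://example.com")
print(docs[0].metadata["status_code"], docs[0].page_content[:80])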
@@ -1,40 +0,0 @@
#!/usr/bin/env python3
"""
Test only the web search functionality.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch

def test_web_search():
    """Test the web search functionality."""
    print("=== Testing web search ===")

    # Initialize web search
    web_search = WebSearch()

    # Test query
    test_query = "What is Milvus"
    print(f"Test query: {test_query}")

    # Run the search
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"✅ Found {len(results)} search results:")
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.metadata.get('title', 'N/A')}")
            print(f"Link: {result.reference}")
            print(f"Score: {result.score}")
            print(f"Content length: {len(result.text)} characters")
            print(f"Content preview: {result.text[:200]}...")
            print(f"Source: {result.metadata.get('source', 'N/A')}")
    else:
        print("❌ No search results found")

if __name__ == "__main__":
    test_web_search()
@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
Test the web search functionality.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch
from deepsearcher import configuration

def test_web_search():
    """Test the web search functionality."""
    print("=== Testing web search ===")

    # Initialize web search
    web_search = WebSearch()

    # Test query
    test_query = "What is Milvus"
    print(f"Test query: {test_query}")

    # Run the search
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"Found {len(results)} search results:")
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.metadata.get('title', 'N/A')}")
            print(f"Link: {result.reference}")
            print(f"Score: {result.score}")
            print(f"Content length: {len(result.text)} characters")
            print(f"Content preview: {result.text[:200]}...")
    else:
        print("No search results found")

def test_integration():
    """Test integration with DeepSearch."""
    print("\n=== Testing integration with DeepSearch ===")

    # Initialize configuration
    configuration.init_config(configuration.config)

    # Create a DeepSearch instance (with web search enabled)
    from deepsearcher.agent.deep_search import DeepSearch

    searcher = DeepSearch(
        llm=configuration.llm,
        embedding_model=configuration.embedding_model,
        vector_db=configuration.vector_db,
        max_iter=2,
        enable_web_search=True
    )

    # Test query
    test_query = "What is Milvus"
    print(f"Test query: {test_query}")

    # Run the search
    results, sub_queries = searcher.retrieve(test_query, max_iter=2)

    print(f"Generated sub-queries: {sub_queries}")
    print(f"Found {len(results)} search results")
    # Show result statistics
    web_results = [r for r in results if r.metadata and r.metadata.get("source") == "webpage"]
    vector_results = [r for r in results if not r.metadata or r.metadata.get("source") != "webpage"]

    print(f"Web search results: {len(web_results)}")
    print(f"Vector database results: {len(vector_results)}")

if __name__ == "__main__":
    test_web_search()
    test_integration()