14 changed files with 3 additions and 825 deletions
@@ -1,118 +0,0 @@
import argparse
import logging
import sys
import warnings

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.utils import log

httpx_logger = logging.getLogger("httpx")  # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)

warnings.simplefilter(action="ignore", category=FutureWarning)  # disable warning output


def main():
    """
    Main entry point for the DeepSearcher CLI.

    This function parses command line arguments and executes the appropriate action
    based on the subcommand provided (query or load). It handles the deprecated
    command line format and provides helpful error messages.

    Returns:
        None
    """
    if "--query" in sys.argv or "--load" in sys.argv:
        print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.")
        print("Please use:")
        print(" deepsearcher query <your_query> --max_iter 3")
        print(
            " deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>"
        )
        sys.exit(1)

    config = Configuration()  # Customize your config here
    init_config(config=config)

    parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.")
    subparsers = parser.add_subparsers(dest="subcommand", title="subcommands")

    ## Arguments of query
    query_parser = subparsers.add_parser("query", help="Query a question or search topic.")
    query_parser.add_argument("query", type=str, default="", help="query question or search topic.")
    query_parser.add_argument(
        "--max_iter",
        type=int,
        default=3,
        help="Max iterations of reflection. Default is 3.",
    )

    ## Arguments of loading
    load_parser = subparsers.add_parser(
        "load", help="Load knowledge from local files or from URLs."
    )
    load_parser.add_argument(
        "load_path",
        type=str,
        nargs="+",  # 1 or more files or urls
        help="Load knowledge from local files or from URLs.",
    )
    load_parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="Batch size for loading knowledge.",
    )
    load_parser.add_argument(
        "--collection_name",
        type=str,
        default=None,
        help="Destination collection name of loaded knowledge.",
    )
    load_parser.add_argument(
        "--collection_desc",
        type=str,
        default=None,
        help="Description of the collection.",
    )
    load_parser.add_argument(
        "--force_new_collection",
        type=bool,
        default=False,
        help="If you want to drop the original collection and create a new collection on every load, set to True",
    )

    args = parser.parse_args()
    if args.subcommand == "query":
        final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter)
        log.color_print("\n==== FINAL ANSWER ====\n")
        log.color_print(final_answer)
        log.color_print("\n### References\n")
        for i, ref in enumerate(refs):
            log.color_print(f"{i + 1}. {ref.text[:60]}… {ref.reference}")
    elif args.subcommand == "load":
        urls = [url for url in args.load_path if url.startswith("http")]
        local_files = [file for file in args.load_path if not file.startswith("http")]
        kwargs = {}
        if args.collection_name:
            kwargs["collection_name"] = args.collection_name
        if args.collection_desc:
            kwargs["collection_description"] = args.collection_desc
        if args.force_new_collection:
            kwargs["force_new_collection"] = args.force_new_collection
        if args.batch_size:
            kwargs["batch_size"] = args.batch_size
        if len(urls) > 0:
            load_from_website(urls, **kwargs)
        if len(local_files) > 0:
            load_from_local_files(local_files, **kwargs)
    else:
        print("Please provide a query or a load argument.")


if __name__ == "__main__":
    main()
@@ -1,11 +0,0 @@
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler

__all__ = [
    "FireCrawlCrawler",
    "JinaCrawler",
    "Crawl4AICrawler",
    "DoclingCrawler",
]
@@ -1,55 +0,0 @@
from abc import ABC
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting them
    into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler implementations.
        """
        pass

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from the URL.

        Note:
            Implementations should include the URL reference in the metadata.
            e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
        """
        pass

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from all URLs.
        """
        documents = []
        for url in urls:
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
@@ -1,140 +0,0 @@
import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
    into markdown format for further processing. It supports both single-page crawling
    and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser configuration
        only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata,
            or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()
        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []
            for result in results:
                markdown_content = result.markdown or ""
                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }
                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")
                documents.append(Document(page_content=markdown_content, metadata=metadata))
            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs,
            or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []
@@ -1,98 +0,0 @@
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    This crawler leverages Docling's capabilities to convert web pages into structured
    documents and chunk them appropriately for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects, each representing a chunk from the crawled URL.

        Raises:
            IOError: If there is an error processing the URL.
        """
        try:
            # Use Docling to convert the URL to a document
            conversion_result = self.converter.convert(url)
            docling_document = conversion_result.document

            # Chunk the document using hierarchical chunking
            chunks = list(self.chunker.chunk(docling_document))

            documents = []
            for chunk in chunks:
                metadata = {"reference": url, "text": chunk.text}
                documents.append(Document(page_content=chunk.text, metadata=metadata))

            return documents

        except Exception as e:
            log.color_print(f"Error processing URL {url}: {str(e)}")
            raise IOError(f"Failed to process URL {url}: {str(e)}")

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of file extensions supported by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]
@@ -1,88 +0,0 @@
import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page scraping
    and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        self.app = None

    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:

        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazy init
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # if user just inputs a single url as param
        # scrape single page
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the new Firecrawl API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # else, crawl multiple pages based on users' input params
        # set default values if not provided
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )
        items = crawl_response.model_dump().get("data", [])

        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))

        return documents
@@ -1,62 +0,0 @@
import os
from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class JinaCrawler(BaseCrawler):
    """
    Web crawler using Jina AI's rendering service.

    This crawler uses Jina AI's rendering service to crawl web pages and convert them
    into markdown format for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the JinaCrawler.

        Args:
            **kwargs: Optional keyword arguments.

        Raises:
            ValueError: If the JINA_API_TOKEN environment variable is not set.
        """
        super().__init__(**kwargs)
        self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
        if not self.jina_api_token:
            raise ValueError("Missing JINA_API_TOKEN environment variable")

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL using Jina AI's rendering service.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata.

        Raises:
            HTTPError: If the request to Jina AI's service fails.
        """
        jina_url = f"https://r.jina.ai/{url}"
        headers = {
            "Authorization": f"Bearer {self.jina_api_token}",
            "X-Return-Format": "markdown",
        }

        response = requests.get(jina_url, headers=headers)
        response.raise_for_status()

        markdown_content = response.text
        metadata = {
            "reference": url,
            "status_code": response.status_code,
            "headers": dict(response.headers),
        }

        return [Document(page_content=markdown_content, metadata=metadata)]
@@ -1,40 +0,0 @@
#!/usr/bin/env python3
"""
Test only the web search functionality.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch


def test_web_search():
    """Test the web search functionality."""
    print("=== Testing web search ===")

    # Initialize web search
    web_search = WebSearch()

    # Test query
    test_query = "Milvus是什么"
    print(f"Test query: {test_query}")

    # Run the search
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"✅ Found {len(results)} search results:")
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.metadata.get('title', 'N/A')}")
            print(f"Link: {result.reference}")
            print(f"Score: {result.score}")
            print(f"Content length: {len(result.text)} characters")
            print(f"Content preview: {result.text[:200]}...")
            print(f"Source: {result.metadata.get('source', 'N/A')}")
    else:
        print("❌ No search results found")


if __name__ == "__main__":
    test_web_search()
@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
Test the web search functionality.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch
from deepsearcher import configuration


def test_web_search():
    """Test the web search functionality."""
    print("=== Testing web search ===")

    # Initialize web search
    web_search = WebSearch()

    # Test query
    test_query = "Milvus是什么"
    print(f"Test query: {test_query}")

    # Run the search
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"Found {len(results)} search results:")
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.metadata.get('title', 'N/A')}")
            print(f"Link: {result.reference}")
            print(f"Score: {result.score}")
            print(f"Content length: {len(result.text)} characters")
            print(f"Content preview: {result.text[:200]}...")
    else:
        print("No search results found")


def test_integration():
    """Test integration with DeepSearch."""
    print("\n=== Testing integration with DeepSearch ===")

    # Initialize configuration
    configuration.init_config(configuration.config)

    # Create a DeepSearch instance (with web search enabled)
    from deepsearcher.agent.deep_search import DeepSearch

    searcher = DeepSearch(
        llm=configuration.llm,
        embedding_model=configuration.embedding_model,
        vector_db=configuration.vector_db,
        max_iter=2,
        enable_web_search=True
    )

    # Test query
    test_query = "Milvus是什么"
    print(f"Test query: {test_query}")

    # Run the search
    results, sub_queries = searcher.retrieve(test_query, max_iter=2)

    print(f"Generated sub-queries: {sub_queries}")
    print(f"Found {len(results)} search results")
    # Show result statistics
    web_results = [r for r in results if r.metadata and r.metadata.get("source") == "webpage"]
    vector_results = [r for r in results if not r.metadata or r.metadata.get("source") != "webpage"]

    print(f"Web search results: {len(web_results)}")
    print(f"Vector database results: {len(vector_results)}")


if __name__ == "__main__":
    test_web_search()
    test_integration()