
chore: remove the frontend and backend logic for manual web page content loading

main · tanxing · 5 days ago · commit eaf6a4a9b5
14 changed files:

  1. deepsearcher/cli.py (118 lines changed)
  2. deepsearcher/config.yaml (40 lines changed)
  3. deepsearcher/configuration.py (14 lines changed)
  4. deepsearcher/loader/splitter.py (3 lines changed)
  5. deepsearcher/loader/web_crawler/__init__.py (11 lines changed)
  6. deepsearcher/loader/web_crawler/base.py (55 lines changed)
  7. deepsearcher/loader/web_crawler/crawl4ai_crawler.py (140 lines changed)
  8. deepsearcher/loader/web_crawler/docling_crawler.py (98 lines changed)
  9. deepsearcher/loader/web_crawler/firecrawl_crawler.py (88 lines changed)
  10. deepsearcher/loader/web_crawler/jina_crawler.py (62 lines changed)
  11. deepsearcher/templates/html/index.html (33 lines changed)
  12. deepsearcher/templates/static/js/app.js (51 lines changed)
  13. test_web_only.py (40 lines changed)
  14. test_web_search.py (75 lines changed)

118
deepsearcher/cli.py

@@ -1,118 +0,0 @@
import argparse
import logging
import sys
import warnings

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.utils import log

httpx_logger = logging.getLogger("httpx")  # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)

warnings.simplefilter(action="ignore", category=FutureWarning)  # disable warning output


def main():
    """
    Main entry point for the DeepSearcher CLI.

    This function parses command line arguments and executes the appropriate action
    based on the subcommand provided (query or load). It handles the deprecated
    command line format and provides helpful error messages.

    Returns:
        None
    """
    if "--query" in sys.argv or "--load" in sys.argv:
        print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.")
        print("Please use:")
        print("  deepsearcher query <your_query> --max_iter 3")
        print(
            "  deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>"
        )
        sys.exit(1)

    config = Configuration()  # Customize your config here
    init_config(config=config)

    parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.")
    subparsers = parser.add_subparsers(dest="subcommand", title="subcommands")

    ## Arguments of query
    query_parser = subparsers.add_parser("query", help="Query a question or search topic.")
    query_parser.add_argument("query", type=str, default="", help="query question or search topic.")
    query_parser.add_argument(
        "--max_iter",
        type=int,
        default=3,
        help="Max iterations of reflection. Default is 3.",
    )

    ## Arguments of loading
    load_parser = subparsers.add_parser(
        "load", help="Load knowledge from local files or from URLs."
    )
    load_parser.add_argument(
        "load_path",
        type=str,
        nargs="+",  # 1 or more files or urls
        help="Load knowledge from local files or from URLs.",
    )
    load_parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="Batch size for loading knowledge.",
    )
    load_parser.add_argument(
        "--collection_name",
        type=str,
        default=None,
        help="Destination collection name of loaded knowledge.",
    )
    load_parser.add_argument(
        "--collection_desc",
        type=str,
        default=None,
        help="Description of the collection.",
    )
    load_parser.add_argument(
        "--force_new_collection",
        type=bool,
        default=False,
        help="If you want to drop origin collection and create a new collection on every load, set to True",
    )

    args = parser.parse_args()

    if args.subcommand == "query":
        final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter)
        log.color_print("\n==== FINAL ANSWER====\n")
        log.color_print(final_answer)
        log.color_print("\n### References\n")
        for i, ref in enumerate(refs):
            log.color_print(f"{i + 1}. {ref.text[:60]}{ref.reference}")
    elif args.subcommand == "load":
        urls = [url for url in args.load_path if url.startswith("http")]
        local_files = [file for file in args.load_path if not file.startswith("http")]
        kwargs = {}
        if args.collection_name:
            kwargs["collection_name"] = args.collection_name
        if args.collection_desc:
            kwargs["collection_description"] = args.collection_desc
        if args.force_new_collection:
            kwargs["force_new_collection"] = args.force_new_collection
        if args.batch_size:
            kwargs["batch_size"] = args.batch_size

        if len(urls) > 0:
            load_from_website(urls, **kwargs)
        if len(local_files) > 0:
            load_from_local_files(local_files, **kwargs)
    else:
        print("Please provide a query or a load argument.")


if __name__ == "__main__":
    main()
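For reference, the query path this CLI wrapped can still be driven programmatically; a minimal sketch, assuming the deepsearcher package is installed and deepsearcher/config.yaml is valid (the question string is illustrative):

# Programmatic equivalent of `deepsearcher query` (sketch; not part of this diff).
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query

config = Configuration()      # loads the default config.yaml
init_config(config=config)    # wires up LLM, embedding model, file loader, vector DB

final_answer, refs, consumed_tokens = query("What is Milvus?", max_iter=3)
print(final_answer)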

40
deepsearcher/config.yaml

@@ -36,26 +36,6 @@ provide_settings:
    # config: {}

-  web_crawler:
-    provider: "FireCrawlCrawler"
-    config: {}
-    # provider: "Crawl4AICrawler"
-    # config: # Uncomment to custom browser configuration for Crawl4AI
-    #   browser_config:
-    #     headless: false
-    #     proxy: "http://127.0.0.1:7890"
-    #     chrome_channel: "chrome"
-    #     verbose: true
-    #     viewport_width: 800
-    #     viewport_height: 600
-    # provider: "JinaCrawler"
-    # config: {}
-    # provider: "DoclingCrawler"
-    # config: {}

  vector_db:
    provider: "Milvus"
    config:
@@ -64,27 +44,9 @@ provide_settings:
      token: "root:Milvus"
      db: "default"

-  # vector_db:
-  #   provider: "OracleDB"
-  #   config:
-  #     default_collection: "deepsearcher"
-  #     user: ""
-  #     password: ""
-  #     dsn: ""
-  #     config_dir: ""
-  #     wallet_location: ""
-  #     wallet_password: ""

-  # vector_db:
-  #   provider: "Qdrant"
-  #   config:
-  #     default_collection: "deepsearcher"
-  #     host: "localhost"
-  #     port: 6333

query_settings:
-  max_iter: 3
+  max_iter: 2
+  enable_web_search: true

load_settings:
  chunk_size: 2048

14
deepsearcher/configuration.py

@@ -7,7 +7,6 @@ from deepsearcher.agent import BaseAgent, DeepSearch
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.loader.file_loader.base import BaseLoader
-from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.vector_db.base import BaseVectorDB

current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -21,7 +20,7 @@ class Configuration:
    Configuration class for DeepSearcher.

    This class manages the configuration settings for various components of the DeepSearcher system,
-    including LLM providers, embedding models, file loaders, web crawlers, and vector databases.
+    including LLM providers, embedding models, file loaders and vector databases.
    It loads configurations from a YAML file and provides methods to get and set provider configurations.
    """
@@ -151,14 +150,6 @@ class ModuleFactory:
        """
        return self._create_module_instance("file_loader", "deepsearcher.loader.file_loader")

-    def create_web_crawler(self) -> BaseCrawler:
-        """
-        Create an instance of a web crawler.
-
-        Returns:
-            An instance of a BaseCrawler implementation.
-        """
-        return self._create_module_instance("web_crawler", "deepsearcher.loader.web_crawler")

    def create_vector_db(self) -> BaseVectorDB:
        """
@@ -177,7 +168,6 @@ llm: BaseLLM = None
embedding_model: BaseEmbedding = None
file_loader: BaseLoader = None
vector_db: BaseVectorDB = None
-web_crawler: BaseCrawler = None
default_searcher: BaseAgent = None

def init_config(config: Configuration):
@@ -196,13 +186,11 @@ def init_config(config: Configuration):
        embedding_model, \
        file_loader, \
        vector_db, \
-        web_crawler, \
        default_searcher

    module_factory = ModuleFactory(config)
    llm = module_factory.create_llm()
    embedding_model = module_factory.create_embedding()
    file_loader = module_factory.create_file_loader()
-    web_crawler = module_factory.create_web_crawler()
    vector_db = module_factory.create_vector_db()
    default_searcher = DeepSearch(
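The create_* methods above all delegate to _create_module_instance; a rough sketch of that provider-factory pattern, with a hypothetical helper name and illustrative arguments (the project's real implementation may differ):

import importlib

def create_provider_instance(module_package: str, class_name: str, config: dict):
    # e.g. module_package="deepsearcher.loader.file_loader", class_name from config.yaml's "provider"
    module = importlib.import_module(module_package)
    provider_cls = getattr(module, class_name)
    return provider_cls(**config)   # provider-specific config becomes constructor kwargs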

3
deepsearcher/loader/splitter.py

@@ -70,8 +70,7 @@ def _sentence_window_split(
            max(0, start_index - offset) : min(len(original_text), end_index + offset)
        ]
        reference = doc.metadata.pop("reference", "")
-        doc.metadata["wider_text"] = wider_text
-        chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata)
+        chunk = Chunk(text=wider_text, reference=reference, metadata=doc.metadata)
        chunks.append(chunk)
    return chunks
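The change above makes each chunk's text carry the sentence window itself rather than stashing it in metadata; a toy illustration of the windowing slice (text, positions, and offset are made up):

original_text = "Milvus is a vector database. It stores embeddings. It scales well."
start_index, end_index, offset = 29, 50, 10   # pretend the matched sentence spans [29:50]

wider_text = original_text[max(0, start_index - offset): min(len(original_text), end_index + offset)]
print(wider_text)  # the sentence plus roughly 10 characters of context on each side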

11
deepsearcher/loader/web_crawler/__init__.py

@@ -1,11 +0,0 @@
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler

__all__ = [
    "FireCrawlCrawler",
    "JinaCrawler",
    "Crawl4AICrawler",
    "DoclingCrawler",
]

55
deepsearcher/loader/web_crawler/base.py

@@ -1,55 +0,0 @@
from abc import ABC
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting them
    into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler implementations.
        """
        pass

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from the URL.

        Note:
            Implementations should include the URL reference in the metadata.
            e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
        """
        pass

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from all URLs.
        """
        documents = []
        for url in urls:
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
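A minimal sketch of how a subclass satisfied the removed BaseCrawler contract: only crawl_url needs overriding, and the inherited crawl_urls fans out over it (DummyCrawler and the URLs are illustrative):

from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler  # module existed before this commit

class DummyCrawler(BaseCrawler):
    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        # Real crawlers fetch and convert the page; this stub just echoes the URL.
        return [Document(page_content=f"stub content for {url}", metadata={"reference": url})]

docs = DummyCrawler().crawl_urls(["https://example.com/a", "https://example.com/b"])
print(len(docs))  # 2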

140
deepsearcher/loader/web_crawler/crawl4ai_crawler.py

@@ -1,140 +0,0 @@
import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
    into markdown format for further processing. It supports both single-page crawling
    and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser configuration
        only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata,
            or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []

            for result in results:
                markdown_content = result.markdown or ""

                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }

                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")

                documents.append(Document(page_content=markdown_content, metadata=metadata))

            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs,
            or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []
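Use of the removed Crawl4AICrawler was synchronous on the outside; a sketch, assuming crawl4ai is installed and the module above still exists (browser options and URL are illustrative):

from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler  # pre-removal import

crawler = Crawl4AICrawler(browser_config={"headless": True})
docs = crawler.crawl_url("https://example.com")   # wraps the async crawl with asyncio.run
for doc in docs:
    print(doc.metadata["reference"], len(doc.page_content))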

98
deepsearcher/loader/web_crawler/docling_crawler.py

@@ -1,98 +0,0 @@
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    This crawler leverages Docling's capabilities to convert web pages into structured
    documents and chunk them appropriately for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects, each representing a chunk from the crawled URL.

        Raises:
            IOError: If there is an error processing the URL.
        """
        try:
            # Use Docling to convert the URL to a document
            conversion_result = self.converter.convert(url)
            docling_document = conversion_result.document

            # Chunk the document using hierarchical chunking
            chunks = list(self.chunker.chunk(docling_document))

            documents = []
            for chunk in chunks:
                metadata = {"reference": url, "text": chunk.text}
                documents.append(Document(page_content=chunk.text, metadata=metadata))
            return documents
        except Exception as e:
            log.color_print(f"Error processing URL {url}: {str(e)}")
            raise IOError(f"Failed to process URL {url}: {str(e)}")

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of file extensions supported by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]
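The Docling flow the removed DoclingCrawler wrapped is simply convert-then-chunk; a sketch, assuming docling and docling-core are installed (the URL is illustrative):

from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker

result = DocumentConverter().convert("https://example.com")   # fetch and parse the page
for chunk in HierarchicalChunker().chunk(result.document):    # hierarchical chunking
    print(chunk.text[:80])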

88
deepsearcher/loader/web_crawler/firecrawl_crawler.py

@@ -1,88 +0,0 @@
import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page scraping
    and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        self.app = None

    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:
        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazy init
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # if user just inputs a single url as param
        # scrape single page
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the new Firecrawl API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # else, crawl multiple pages based on users' input params
        # set default values if not provided
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )
        items = crawl_response.model_dump().get("data", [])

        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))
        return documents
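The removed FireCrawlCrawler chose between a single-page scrape and a recursive crawl based on whether any crawl parameters were passed; a usage sketch, assuming FIRECRAWL_API_KEY is set and the module above still exists (URL and limits are illustrative):

from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler  # pre-removal import

crawler = FireCrawlCrawler()
single_page = crawler.crawl_url("https://example.com")                          # scrape_url path
whole_site = crawler.crawl_url("https://example.com", max_depth=2, limit=20)    # crawl_url path
print(len(single_page), len(whole_site))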

62
deepsearcher/loader/web_crawler/jina_crawler.py

@@ -1,62 +0,0 @@
import os
from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class JinaCrawler(BaseCrawler):
    """
    Web crawler using Jina AI's rendering service.

    This crawler uses Jina AI's rendering service to crawl web pages and convert them
    into markdown format for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the JinaCrawler.

        Args:
            **kwargs: Optional keyword arguments.

        Raises:
            ValueError: If the JINA_API_TOKEN environment variable is not set.
        """
        super().__init__(**kwargs)
        self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
        if not self.jina_api_token:
            raise ValueError("Missing JINA_API_TOKEN environment variable")

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL using Jina AI's rendering service.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata.

        Raises:
            HTTPError: If the request to Jina AI's service fails.
        """
        jina_url = f"https://r.jina.ai/{url}"
        headers = {
            "Authorization": f"Bearer {self.jina_api_token}",
            "X-Return-Format": "markdown",
        }

        response = requests.get(jina_url, headers=headers)
        response.raise_for_status()

        markdown_content = response.text

        metadata = {
            "reference": url,
            "status_code": response.status_code,
            "headers": dict(response.headers),
        }

        return [Document(page_content=markdown_content, metadata=metadata)]
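The removed JinaCrawler is a thin wrapper over the Jina Reader endpoint shown above; a sketch of the same request without the class, assuming JINA_API_TOKEN is set (target URL is illustrative):

import os

import requests

response = requests.get(
    "https://r.jina.ai/https://example.com",   # Reader endpoint prefixes the target URL
    headers={
        "Authorization": f"Bearer {os.environ['JINA_API_TOKEN']}",
        "X-Return-Format": "markdown",
    },
)
response.raise_for_status()
print(response.text[:200])                     # markdown rendering of the page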

33
deepsearcher/templates/html/index.html

@@ -63,39 +63,6 @@
        ></div>
      </div>

-      <div class="card">
-        <h2 class="card-title">网站内容加载</h2>
-        <div class="form-group">
-          <label for="websiteUrls">网站URL(多个URL用逗号分隔)</label>
-          <input
-            type="text"
-            id="websiteUrls"
-            placeholder="例如: https://example.com/page1,https://example.com/page2"
-          />
-        </div>
-        <div class="form-group">
-          <label for="webCollectionName">集合名称(可选)</label>
-          <input
-            type="text"
-            id="webCollectionName"
-            placeholder="例如: web_collection"
-          />
-        </div>
-        <div class="form-group">
-          <label for="webCollectionDesc">集合描述(可选)</label>
-          <textarea
-            id="webCollectionDesc"
-            rows="2"
-            placeholder="例如: 来自网站的内容"
-          ></textarea>
-        </div>
-        <button id="loadWebsiteBtn">加载网站内容</button>
-        <div
-          id="webLoadStatus"
-          class="status"
-        ></div>
-      </div>

      <div class="card">
        <h2 class="card-title">智能查询</h2>
        <div class="form-group">

51
deepsearcher/templates/static/js/app.js

@@ -343,57 +343,6 @@ document
    }
  });

-// 加载网站内容功能
-document
-  .getElementById('loadWebsiteBtn')
-  .addEventListener('click', async function () {
-    const button = this;
-    const urlsInput = document.getElementById('websiteUrls').value;
-    const collectionName = document.getElementById('webCollectionName').value;
-    const collectionDesc = document.getElementById('webCollectionDesc').value;
-
-    if (!urlsInput) {
-      showStatus('webLoadStatus', '请提供至少一个网站URL', 'error');
-      return;
-    }
-
-    const urls = urlsInput
-      .split(',')
-      .map((url) => url.trim())
-      .filter((url) => url);
-
-    setButtonLoading(button, true);
-    showStatus('webLoadStatus', ' 正在加载网站...', 'loading');
-    hideResult();
-    hideProcessResult();
-
-    try {
-      const response = await fetch('/load-website/', {
-        method: 'POST',
-        headers: {
-          'Content-Type': 'application/json'
-        },
-        body: JSON.stringify({
-          urls: urls,
-          collection_name: collectionName || undefined,
-          collection_description: collectionDesc || undefined
-        })
-      });
-
-      const data = await response.json();
-
-      if (response.ok) {
-        showStatus('webLoadStatus', data.message, 'success');
-      } else {
-        showStatus('webLoadStatus', `加载失败: ${data.detail}`, 'error');
-      }
-    } catch (error) {
-      showStatus('webLoadStatus', `请求失败: ${error.message}`, 'error');
-    } finally {
-      setButtonLoading(button, false);
-    }
-  });

// 查询功能 - 使用实时流
document
  .getElementById('queryBtn')

40
test_web_only.py

@@ -1,40 +0,0 @@
#!/usr/bin/env python3
"""
只测试网页搜索功能
"""
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch


def test_web_search():
    """测试网页搜索功能"""
    print("=== 测试网页搜索功能 ===")

    # 初始化网页搜索
    web_search = WebSearch()

    # 测试查询
    test_query = "Milvus是什么"
    print(f"测试查询: {test_query}")

    # 执行搜索
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"✅ 成功找到 {len(results)} 个搜索结果:")
        for i, result in enumerate(results, 1):
            print(f"\n--- 结果 {i} ---")
            print(f"标题: {result.metadata.get('title', 'N/A')}")
            print(f"链接: {result.reference}")
            print(f"分数: {result.score}")
            print(f"内容长度: {len(result.text)} 字符")
            print(f"内容预览: {result.text[:200]}...")
            print(f"来源: {result.metadata.get('source', 'N/A')}")
    else:
        print("❌ 未找到搜索结果")


if __name__ == "__main__":
    test_web_search()

75
test_web_search.py

@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
测试网页搜索功能
"""
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch
from deepsearcher import configuration


def test_web_search():
    """测试网页搜索功能"""
    print("=== 测试网页搜索功能 ===")

    # 初始化网页搜索
    web_search = WebSearch()

    # 测试查询
    test_query = "Milvus是什么"
    print(f"测试查询: {test_query}")

    # 执行搜索
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"找到 {len(results)} 个搜索结果:")
        for i, result in enumerate(results, 1):
            print(f"\n--- 结果 {i} ---")
            print(f"标题: {result.metadata.get('title', 'N/A')}")
            print(f"链接: {result.reference}")
            print(f"分数: {result.score}")
            print(f"内容长度: {len(result.text)} 字符")
            print(f"内容预览: {result.text[:200]}...")
    else:
        print("未找到搜索结果")


def test_integration():
    """测试与DeepSearch的集成"""
    print("\n=== 测试与DeepSearch的集成 ===")

    # 初始化配置
    configuration.init_config(configuration.config)

    # 创建DeepSearch实例(启用网页搜索)
    from deepsearcher.agent.deep_search import DeepSearch

    searcher = DeepSearch(
        llm=configuration.llm,
        embedding_model=configuration.embedding_model,
        vector_db=configuration.vector_db,
        max_iter=2,
        enable_web_search=True
    )

    # 测试查询
    test_query = "Milvus是什么"
    print(f"测试查询: {test_query}")

    # 执行搜索
    results, sub_queries = searcher.retrieve(test_query, max_iter=2)

    print(f"生成的子问题: {sub_queries}")
    print(f"找到 {len(results)} 个搜索结果")

    # 显示结果统计
    web_results = [r for r in results if r.metadata and r.metadata.get("source") == "webpage"]
    vector_results = [r for r in results if not r.metadata or r.metadata.get("source") != "webpage"]
    print(f"网页搜索结果: {len(web_results)}")
    print(f"向量数据库结果: {len(vector_results)}")


if __name__ == "__main__":
    test_web_search()
    test_integration()