14 changed files with 3 additions and 825 deletions
@@ -1,118 +0,0 @@
import argparse
import logging
import sys
import warnings

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.utils import log

httpx_logger = logging.getLogger("httpx")  # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)


warnings.simplefilter(action="ignore", category=FutureWarning)  # disable warning output


def main():
    """
    Main entry point for the DeepSearcher CLI.

    This function parses command line arguments and executes the appropriate action
    based on the subcommand provided (query or load). It handles the deprecated
    command line format and provides helpful error messages.

    Returns:
        None
    """
    if "--query" in sys.argv or "--load" in sys.argv:
        print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.")
        print("Please use:")
        print("  deepsearcher query <your_query> --max_iter 3")
        print(
            "  deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>"
        )
        sys.exit(1)

    config = Configuration()  # Customize your config here
    init_config(config=config)

    parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.")
    subparsers = parser.add_subparsers(dest="subcommand", title="subcommands")

    ## Arguments of query
    query_parser = subparsers.add_parser("query", help="Query a question or search topic.")
    query_parser.add_argument("query", type=str, default="", help="query question or search topic.")
    query_parser.add_argument(
        "--max_iter",
        type=int,
        default=3,
        help="Max iterations of reflection. Default is 3.",
    )

    ## Arguments of loading
    load_parser = subparsers.add_parser(
        "load", help="Load knowledge from local files or from URLs."
    )
    load_parser.add_argument(
        "load_path",
        type=str,
        nargs="+",  # 1 or more files or urls
        help="Load knowledge from local files or from URLs.",
    )
    load_parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="Batch size for loading knowledge.",
    )
    load_parser.add_argument(
        "--collection_name",
        type=str,
        default=None,
        help="Destination collection name of loaded knowledge.",
    )
    load_parser.add_argument(
        "--collection_desc",
        type=str,
        default=None,
        help="Description of the collection.",
    )
    load_parser.add_argument(
        "--force_new_collection",
        type=bool,
        default=False,
        help="If you want to drop origin collection and create a new collection on every load, set to True",
    )

    args = parser.parse_args()
    if args.subcommand == "query":
        final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter)
        log.color_print("\n==== FINAL ANSWER====\n")
        log.color_print(final_answer)
        log.color_print("\n### References\n")
        for i, ref in enumerate(refs):
            log.color_print(f"{i + 1}. {ref.text[:60]}… {ref.reference}")
    elif args.subcommand == "load":
        urls = [url for url in args.load_path if url.startswith("http")]
        local_files = [file for file in args.load_path if not file.startswith("http")]
        kwargs = {}
        if args.collection_name:
            kwargs["collection_name"] = args.collection_name
        if args.collection_desc:
            kwargs["collection_description"] = args.collection_desc
        if args.force_new_collection:
            kwargs["force_new_collection"] = args.force_new_collection
        if args.batch_size:
            kwargs["batch_size"] = args.batch_size
        if len(urls) > 0:
            load_from_website(urls, **kwargs)
        if len(local_files) > 0:
            load_from_local_files(local_files, **kwargs)
    else:
        print("Please provide a query or a load argument.")


if __name__ == "__main__":
    main()
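For context on what this removed CLI wrapped, here is a minimal programmatic sketch of the same load/query flow using the imports from the deleted file (the file path, collection name, and question are placeholders for illustration; this sketch is not part of the diff):

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files
from deepsearcher.online_query import query

init_config(config=Configuration())  # same default configuration the CLI used

# Hypothetical local file and collection, mirroring the "load" subcommand.
load_from_local_files(["./docs/report.pdf"], collection_name="my_docs", batch_size=256)

# Mirrors the "query" subcommand: returns the answer, references, and token usage.
final_answer, refs, consumed_tokens = query("What does the report conclude?", max_iter=3)
print(final_answer)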
@@ -1,11 +0,0 @@
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler

__all__ = [
    "FireCrawlCrawler",
    "JinaCrawler",
    "Crawl4AICrawler",
    "DoclingCrawler",
]
@@ -1,55 +0,0 @@
from abc import ABC
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting them
    into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler implementations.
        """
        pass

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from the URL.

        Note:
            Implementations should include the URL reference in the metadata.
            e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
        """
        pass

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from all URLs.
        """
        documents = []
        for url in urls:
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
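To illustrate the contract this base class defined, a hypothetical minimal subclass could look like the sketch below (PlainHTTPCrawler is invented for illustration; only crawl_url needs overriding, while crawl_urls is inherited):

from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class PlainHTTPCrawler(BaseCrawler):
    """Hypothetical crawler that fetches raw HTML and wraps it in a single Document."""

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Per the base-class note, the source URL goes into metadata["reference"].
        return [Document(page_content=response.text, metadata={"reference": url})]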
@@ -1,140 +0,0 @@
import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
    into markdown format for further processing. It supports both single-page crawling
    and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser configuration
        only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata,
            or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()
        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []
            for result in results:
                markdown_content = result.markdown or ""
                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }
                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")
                documents.append(Document(page_content=markdown_content, metadata=metadata))
            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs,
            or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []
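A rough usage sketch of this crawler, assuming crawl4ai and a working browser environment are installed (the URLs and browser options are placeholders; not part of the diff):

from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler

# browser_config is forwarded to crawl4ai's BrowserConfig.from_kwargs().
crawler = Crawl4AICrawler(browser_config={"headless": True})
docs = crawler.crawl_urls(["https://example.com", "https://example.org"])
for doc in docs:
    print(doc.metadata["reference"], doc.metadata["status_code"], len(doc.page_content))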
@@ -1,98 +0,0 @@
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    This crawler leverages Docling's capabilities to convert web pages into structured
    documents and chunk them appropriately for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects, each representing a chunk from the crawled URL.

        Raises:
            IOError: If there is an error processing the URL.
        """
        try:
            # Use Docling to convert the URL to a document
            conversion_result = self.converter.convert(url)
            docling_document = conversion_result.document

            # Chunk the document using hierarchical chunking
            chunks = list(self.chunker.chunk(docling_document))

            documents = []
            for chunk in chunks:
                metadata = {"reference": url, "text": chunk.text}
                documents.append(Document(page_content=chunk.text, metadata=metadata))

            return documents

        except Exception as e:
            log.color_print(f"Error processing URL {url}: {str(e)}")
            raise IOError(f"Failed to process URL {url}: {str(e)}")

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of file extensions supported by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]
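A rough usage sketch, assuming docling and docling-core are installed (the URL is a placeholder; not part of the diff). Each returned Document is one hierarchical chunk of the converted page:

from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler

crawler = DoclingCrawler()
chunks = crawler.crawl_url("https://example.com/article.html")
print(f"{len(chunks)} chunks")
for doc in chunks[:3]:
    print(doc.metadata["reference"], doc.page_content[:80])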
@@ -1,88 +0,0 @@
import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page scraping
    and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        self.app = None

    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:

        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazy init
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # if user just inputs a single url as param
        # scrape single page
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the new Firecrawl API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # else, crawl multiple pages based on users' input params
        # set default values if not provided
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )
        items = crawl_response.model_dump().get("data", [])

        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))

        return documents
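A rough usage sketch, assuming the firecrawl SDK is installed and FIRECRAWL_API_KEY is set (the URL and key are placeholders; not part of the diff). Passing any of the optional parameters switches from single-page scraping to recursive crawling:

import os

from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler

os.environ.setdefault("FIRECRAWL_API_KEY", "fc-...")  # placeholder key

crawler = FireCrawlCrawler()
single_page = crawler.crawl_url("https://example.com")  # scrape_url path
site_pages = crawler.crawl_url("https://example.com", max_depth=2, limit=10)  # crawl_url path
print(len(single_page), len(site_pages))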
@@ -1,62 +0,0 @@
import os
from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class JinaCrawler(BaseCrawler):
    """
    Web crawler using Jina AI's rendering service.

    This crawler uses Jina AI's rendering service to crawl web pages and convert them
    into markdown format for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the JinaCrawler.

        Args:
            **kwargs: Optional keyword arguments.

        Raises:
            ValueError: If the JINA_API_TOKEN environment variable is not set.
        """
        super().__init__(**kwargs)
        self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
        if not self.jina_api_token:
            raise ValueError("Missing JINA_API_TOKEN environment variable")

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL using Jina AI's rendering service.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata.

        Raises:
            HTTPError: If the request to Jina AI's service fails.
        """
        jina_url = f"https://r.jina.ai/{url}"
        headers = {
            "Authorization": f"Bearer {self.jina_api_token}",
            "X-Return-Format": "markdown",
        }

        response = requests.get(jina_url, headers=headers)
        response.raise_for_status()

        markdown_content = response.text
        metadata = {
            "reference": url,
            "status_code": response.status_code,
            "headers": dict(response.headers),
        }

        return [Document(page_content=markdown_content, metadata=metadata)]
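A rough usage sketch, assuming a valid Jina token is available in JINA_API_TOKEN or JINAAI_API_KEY (the token and URL are placeholders; not part of the diff):

import os

from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler

os.environ.setdefault("JINA_API_TOKEN", "jina_...")  # placeholder token

crawler = JinaCrawler()
docs = crawler.crawl_url("https://example.com")
print(docs[0].metadata["status_code"], docs[0].page_content[:80])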
@@ -1,40 +0,0 @@
#!/usr/bin/env python3
"""
Test only the web search functionality.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch

def test_web_search():
    """Test the web search functionality."""
    print("=== Testing web search ===")

    # Initialize web search
    web_search = WebSearch()

    # Test query
    test_query = "What is Milvus"
    print(f"Test query: {test_query}")

    # Run the search
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"✅ Found {len(results)} search results:")
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.metadata.get('title', 'N/A')}")
            print(f"Link: {result.reference}")
            print(f"Score: {result.score}")
            print(f"Content length: {len(result.text)} characters")
            print(f"Content preview: {result.text[:200]}...")
            print(f"Source: {result.metadata.get('source', 'N/A')}")
    else:
        print("❌ No search results found")

if __name__ == "__main__":
    test_web_search()
@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
Test the web search functionality.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch
from deepsearcher import configuration

def test_web_search():
    """Test the web search functionality."""
    print("=== Testing web search ===")

    # Initialize web search
    web_search = WebSearch()

    # Test query
    test_query = "What is Milvus"
    print(f"Test query: {test_query}")

    # Run the search
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"Found {len(results)} search results:")
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.metadata.get('title', 'N/A')}")
            print(f"Link: {result.reference}")
            print(f"Score: {result.score}")
            print(f"Content length: {len(result.text)} characters")
            print(f"Content preview: {result.text[:200]}...")
    else:
        print("No search results found")

def test_integration():
    """Test integration with DeepSearch."""
    print("\n=== Testing integration with DeepSearch ===")

    # Initialize configuration
    configuration.init_config(configuration.config)

    # Create a DeepSearch instance (with web search enabled)
    from deepsearcher.agent.deep_search import DeepSearch

    searcher = DeepSearch(
        llm=configuration.llm,
        embedding_model=configuration.embedding_model,
        vector_db=configuration.vector_db,
        max_iter=2,
        enable_web_search=True
    )

    # Test query
    test_query = "What is Milvus"
    print(f"Test query: {test_query}")

    # Run the search
    results, sub_queries = searcher.retrieve(test_query, max_iter=2)

    print(f"Generated sub-queries: {sub_queries}")
    print(f"Found {len(results)} search results")
    # Show result statistics
    web_results = [r for r in results if r.metadata and r.metadata.get("source") == "webpage"]
    vector_results = [r for r in results if not r.metadata or r.metadata.get("source") != "webpage"]

    print(f"Web search results: {len(web_results)}")
    print(f"Vector database results: {len(vector_results)}")

if __name__ == "__main__":
    test_web_search()
    test_integration()