From eaf6a4a9b5dd4254abc366284de00e138dbc3fac Mon Sep 17 00:00:00 2001 From: tanxing Date: Mon, 18 Aug 2025 13:37:10 +0800 Subject: [PATCH] =?UTF-8?q?chore:=20=E7=A7=BB=E9=99=A4=E6=89=8B=E5=8A=A8?= =?UTF-8?q?=E7=BD=91=E9=A1=B5=E5=86=85=E5=AE=B9=E5=8A=A0=E8=BD=BD=E7=9A=84?= =?UTF-8?q?=E5=89=8D=E5=90=8E=E7=AB=AF=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepsearcher/cli.py | 118 --------------- deepsearcher/config.yaml | 40 +---- deepsearcher/configuration.py | 14 +- deepsearcher/loader/splitter.py | 3 +- deepsearcher/loader/web_crawler/__init__.py | 11 -- deepsearcher/loader/web_crawler/base.py | 55 ------- .../loader/web_crawler/crawl4ai_crawler.py | 140 ------------------ .../loader/web_crawler/docling_crawler.py | 98 ------------ .../loader/web_crawler/firecrawl_crawler.py | 88 ----------- .../loader/web_crawler/jina_crawler.py | 62 -------- deepsearcher/templates/html/index.html | 33 ----- deepsearcher/templates/static/js/app.js | 51 ------- test_web_only.py | 40 ----- test_web_search.py | 75 ---------- 14 files changed, 3 insertions(+), 825 deletions(-) delete mode 100644 deepsearcher/cli.py delete mode 100644 deepsearcher/loader/web_crawler/__init__.py delete mode 100644 deepsearcher/loader/web_crawler/base.py delete mode 100644 deepsearcher/loader/web_crawler/crawl4ai_crawler.py delete mode 100644 deepsearcher/loader/web_crawler/docling_crawler.py delete mode 100644 deepsearcher/loader/web_crawler/firecrawl_crawler.py delete mode 100644 deepsearcher/loader/web_crawler/jina_crawler.py delete mode 100644 test_web_only.py delete mode 100644 test_web_search.py diff --git a/deepsearcher/cli.py b/deepsearcher/cli.py deleted file mode 100644 index b9bdcf1..0000000 --- a/deepsearcher/cli.py +++ /dev/null @@ -1,118 +0,0 @@ -import argparse -import logging -import sys -import warnings - -from deepsearcher.configuration import Configuration, init_config -from deepsearcher.offline_loading import load_from_local_files, load_from_website -from deepsearcher.online_query import query -from deepsearcher.utils import log - -httpx_logger = logging.getLogger("httpx") # disable openai's logger output -httpx_logger.setLevel(logging.WARNING) - - -warnings.simplefilter(action="ignore", category=FutureWarning) # disable warning output - - -def main(): - """ - Main entry point for the DeepSearcher CLI. - - This function parses command line arguments and executes the appropriate action - based on the subcommand provided (query or load). It handles the deprecated - command line format and provides helpful error messages. - - Returns: - None - """ - if "--query" in sys.argv or "--load" in sys.argv: - print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.") - print("Please use:") - print(" deepsearcher query --max_iter 3") - print( - " deepsearcher load --collection_name --collection_desc " - ) - sys.exit(1) - - config = Configuration() # Customize your config here - init_config(config=config) - - parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.") - subparsers = parser.add_subparsers(dest="subcommand", title="subcommands") - - ## Arguments of query - query_parser = subparsers.add_parser("query", help="Query a question or search topic.") - query_parser.add_argument("query", type=str, default="", help="query question or search topic.") - query_parser.add_argument( - "--max_iter", - type=int, - default=3, - help="Max iterations of reflection. 
Default is 3.", - ) - - ## Arguments of loading - load_parser = subparsers.add_parser( - "load", help="Load knowledge from local files or from URLs." - ) - load_parser.add_argument( - "load_path", - type=str, - nargs="+", # 1 or more files or urls - help="Load knowledge from local files or from URLs.", - ) - load_parser.add_argument( - "--batch_size", - type=int, - default=256, - help="Batch size for loading knowledge.", - ) - load_parser.add_argument( - "--collection_name", - type=str, - default=None, - help="Destination collection name of loaded knowledge.", - ) - load_parser.add_argument( - "--collection_desc", - type=str, - default=None, - help="Description of the collection.", - ) - load_parser.add_argument( - "--force_new_collection", - type=bool, - default=False, - help="If you want to drop origin collection and create a new collection on every load, set to True", - ) - - args = parser.parse_args() - if args.subcommand == "query": - final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter) - log.color_print("\n==== FINAL ANSWER====\n") - log.color_print(final_answer) - log.color_print("\n### References\n") - for i, ref in enumerate(refs): - log.color_print(f"{i + 1}. {ref.text[:60]}… {ref.reference}") - elif args.subcommand == "load": - urls = [url for url in args.load_path if url.startswith("http")] - local_files = [file for file in args.load_path if not file.startswith("http")] - kwargs = {} - if args.collection_name: - kwargs["collection_name"] = args.collection_name - if args.collection_desc: - kwargs["collection_description"] = args.collection_desc - if args.force_new_collection: - kwargs["force_new_collection"] = args.force_new_collection - if args.batch_size: - kwargs["batch_size"] = args.batch_size - if len(urls) > 0: - load_from_website(urls, **kwargs) - if len(local_files) > 0: - load_from_local_files(local_files, **kwargs) - else: - print("Please provide a query or a load argument.") - - -if __name__ == "__main__": - main() diff --git a/deepsearcher/config.yaml b/deepsearcher/config.yaml index 88af5bf..1d70dbb 100644 --- a/deepsearcher/config.yaml +++ b/deepsearcher/config.yaml @@ -36,26 +36,6 @@ provide_settings: # config: {} - web_crawler: - provider: "FireCrawlCrawler" - config: {} - - # provider: "Crawl4AICrawler" - # config: # Uncomment to custom browser configuration for Crawl4AI - # browser_config: - # headless: false - # proxy: "http://127.0.0.1:7890" - # chrome_channel: "chrome" - # verbose: true - # viewport_width: 800 - # viewport_height: 600 - - # provider: "JinaCrawler" - # config: {} - - # provider: "DoclingCrawler" - # config: {} - vector_db: provider: "Milvus" config: @@ -64,27 +44,9 @@ provide_settings: token: "root:Milvus" db: "default" - # vector_db: - # provider: "OracleDB" - # config: - # default_collection: "deepsearcher" - # user: "" - # password: "" - # dsn: "" - # config_dir: "" - # wallet_location: "" - # wallet_password: "" - - # vector_db: - # provider: "Qdrant" - # config: - # default_collection: "deepsearcher" - # host: "localhost" - # port: 6333 query_settings: - max_iter: 3 - enable_web_search: true + max_iter: 2 load_settings: chunk_size: 2048 diff --git a/deepsearcher/configuration.py b/deepsearcher/configuration.py index 56763a7..eafff92 100644 --- a/deepsearcher/configuration.py +++ b/deepsearcher/configuration.py @@ -7,7 +7,6 @@ from deepsearcher.agent import BaseAgent, DeepSearch from deepsearcher.embedding.base import BaseEmbedding from deepsearcher.llm.base import BaseLLM from 
deepsearcher.loader.file_loader.base import BaseLoader -from deepsearcher.loader.web_crawler.base import BaseCrawler from deepsearcher.vector_db.base import BaseVectorDB current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -21,7 +20,7 @@ class Configuration: Configuration class for DeepSearcher. This class manages the configuration settings for various components of the DeepSearcher system, - including LLM providers, embedding models, file loaders, web crawlers, and vector databases. + including LLM providers, embedding models, file loaders and vector databases. It loads configurations from a YAML file and provides methods to get and set provider configurations. """ @@ -151,14 +150,6 @@ class ModuleFactory: """ return self._create_module_instance("file_loader", "deepsearcher.loader.file_loader") - def create_web_crawler(self) -> BaseCrawler: - """ - Create an instance of a web crawler. - - Returns: - An instance of a BaseCrawler implementation. - """ - return self._create_module_instance("web_crawler", "deepsearcher.loader.web_crawler") def create_vector_db(self) -> BaseVectorDB: """ @@ -177,7 +168,6 @@ llm: BaseLLM = None embedding_model: BaseEmbedding = None file_loader: BaseLoader = None vector_db: BaseVectorDB = None -web_crawler: BaseCrawler = None default_searcher: BaseAgent = None def init_config(config: Configuration): @@ -196,13 +186,11 @@ def init_config(config: Configuration): embedding_model, \ file_loader, \ vector_db, \ - web_crawler, \ default_searcher module_factory = ModuleFactory(config) llm = module_factory.create_llm() embedding_model = module_factory.create_embedding() file_loader = module_factory.create_file_loader() - web_crawler = module_factory.create_web_crawler() vector_db = module_factory.create_vector_db() default_searcher = DeepSearch( diff --git a/deepsearcher/loader/splitter.py b/deepsearcher/loader/splitter.py index cdde9e2..b25074c 100644 --- a/deepsearcher/loader/splitter.py +++ b/deepsearcher/loader/splitter.py @@ -70,8 +70,7 @@ def _sentence_window_split( max(0, start_index - offset) : min(len(original_text), end_index + offset) ] reference = doc.metadata.pop("reference", "") - doc.metadata["wider_text"] = wider_text - chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata) + chunk = Chunk(text=wider_text, reference=reference, metadata=doc.metadata) chunks.append(chunk) return chunks diff --git a/deepsearcher/loader/web_crawler/__init__.py b/deepsearcher/loader/web_crawler/__init__.py deleted file mode 100644 index 2905fea..0000000 --- a/deepsearcher/loader/web_crawler/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler -from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler -from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler -from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler - -__all__ = [ - "FireCrawlCrawler", - "JinaCrawler", - "Crawl4AICrawler", - "DoclingCrawler", -] diff --git a/deepsearcher/loader/web_crawler/base.py b/deepsearcher/loader/web_crawler/base.py deleted file mode 100644 index df63efc..0000000 --- a/deepsearcher/loader/web_crawler/base.py +++ /dev/null @@ -1,55 +0,0 @@ -from abc import ABC -from typing import List - -from langchain_core.documents import Document - - -class BaseCrawler(ABC): - """ - Abstract base class for web crawlers. - - This class defines the interface for crawling web pages and converting them - into Document objects for further processing. 
- """ - - def __init__(self, **kwargs): - """ - Initialize the crawler with optional keyword arguments. - - Args: - **kwargs: Optional keyword arguments for specific crawler implementations. - """ - pass - - def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]: - """ - Crawl a single URL and convert it to Document objects. - - Args: - url: The URL to crawl. - **crawl_kwargs: Optional keyword arguments for the crawling process. - - Returns: - A list of Document objects containing the content and metadata from the URL. - - Note: - Implementations should include the URL reference in the metadata. - e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})] - """ - pass - - def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]: - """ - Crawl multiple URLs and return a list of Document objects. - - Args: - urls: A list of URLs to crawl. - **crawl_kwargs: Optional keyword arguments for the crawling process. - - Returns: - A list of Document objects containing the content and metadata from all URLs. - """ - documents = [] - for url in urls: - documents.extend(self.crawl_url(url, **crawl_kwargs)) - return documents diff --git a/deepsearcher/loader/web_crawler/crawl4ai_crawler.py b/deepsearcher/loader/web_crawler/crawl4ai_crawler.py deleted file mode 100644 index 2aeb0bb..0000000 --- a/deepsearcher/loader/web_crawler/crawl4ai_crawler.py +++ /dev/null @@ -1,140 +0,0 @@ -import asyncio -from typing import List - -from langchain_core.documents import Document - -from deepsearcher.loader.web_crawler.base import BaseCrawler -from deepsearcher.utils import log - - -class Crawl4AICrawler(BaseCrawler): - """ - Web crawler using the Crawl4AI library. - - This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them - into markdown format for further processing. It supports both single-page crawling - and batch crawling of multiple pages. - """ - - def __init__(self, **kwargs): - """ - Initialize the Crawl4AICrawler. - - Args: - **kwargs: Optional keyword arguments. - browser_config: Configuration for the browser used by Crawl4AI. - """ - super().__init__(**kwargs) - self.crawler = None # Lazy init - self.browser_config = kwargs.get("browser_config", None) - - def _lazy_init(self): - """ - Initialize the crawler lazily when needed. - - This method creates the AsyncWebCrawler instance with the provided browser configuration - only when it's first needed, to avoid unnecessary initialization. - """ - from crawl4ai import AsyncWebCrawler, BrowserConfig - - if self.crawler is None: - config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None - self.crawler = AsyncWebCrawler(config=config) - - async def _async_crawl(self, url: str) -> Document: - """ - Asynchronously crawl a single URL. - - Args: - url: The URL to crawl. - - Returns: - A Document object with the markdown content and metadata from the URL. 
- """ - if self.crawler is None: - self._lazy_init() - - async with self.crawler as crawler: - result = await crawler.arun(url) - - markdown_content = result.markdown or "" - - metadata = { - "reference": url, - "success": result.success, - "status_code": result.status_code, - "media": result.media, - "links": result.links, - } - - if hasattr(result, "metadata") and result.metadata: - metadata["title"] = result.metadata.get("title", "") - metadata["author"] = result.metadata.get("author", "") - - return Document(page_content=markdown_content, metadata=metadata) - - def crawl_url(self, url: str) -> List[Document]: - """ - Crawl a single URL. - - Args: - url: The URL to crawl. - - Returns: - A list containing a single Document object with the markdown content and metadata, - or an empty list if an error occurs. - """ - try: - document = asyncio.run(self._async_crawl(url)) - return [document] - except Exception as e: - log.error(f"Error during crawling {url}: {e}") - return [] - - async def _async_crawl_many(self, urls: List[str]) -> List[Document]: - """ - Asynchronously crawl multiple URLs. - - Args: - urls: A list of URLs to crawl. - - Returns: - A list of Document objects with the markdown content and metadata from all URLs. - """ - if self.crawler is None: - self._lazy_init() - async with self.crawler as crawler: - results = await crawler.arun_many(urls) - documents = [] - for result in results: - markdown_content = result.markdown or "" - metadata = { - "reference": result.url, - "success": result.success, - "status_code": result.status_code, - "media": result.media, - "links": result.links, - } - if hasattr(result, "metadata") and result.metadata: - metadata["title"] = result.metadata.get("title", "") - metadata["author"] = result.metadata.get("author", "") - documents.append(Document(page_content=markdown_content, metadata=metadata)) - return documents - - def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]: - """ - Crawl multiple URLs. - - Args: - urls: A list of URLs to crawl. - **crawl_kwargs: Optional keyword arguments for the crawling process. - - Returns: - A list of Document objects with the markdown content and metadata from all URLs, - or an empty list if an error occurs. - """ - try: - return asyncio.run(self._async_crawl_many(urls)) - except Exception as e: - log.error(f"Error during crawling {urls}: {e}") - return [] diff --git a/deepsearcher/loader/web_crawler/docling_crawler.py b/deepsearcher/loader/web_crawler/docling_crawler.py deleted file mode 100644 index 2421d0d..0000000 --- a/deepsearcher/loader/web_crawler/docling_crawler.py +++ /dev/null @@ -1,98 +0,0 @@ -from typing import List - -from langchain_core.documents import Document - -from deepsearcher.loader.web_crawler.base import BaseCrawler -from deepsearcher.utils import log - - -class DoclingCrawler(BaseCrawler): - """ - Web crawler using Docling's DocumentConverter and HierarchicalChunker. - - This crawler leverages Docling's capabilities to convert web pages into structured - documents and chunk them appropriately for further processing. - """ - - def __init__(self, **kwargs): - """ - Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances. - - Args: - **kwargs: Optional keyword arguments. 
- """ - super().__init__(**kwargs) - from docling.document_converter import DocumentConverter - from docling_core.transforms.chunker import HierarchicalChunker - - self.converter = DocumentConverter() - self.chunker = HierarchicalChunker() - - def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]: - """ - Crawl a single URL using Docling's conversion and perform hierarchical chunking. - - Args: - url: The URL to crawl. - **crawl_kwargs: Optional keyword arguments for the crawling process. - - Returns: - A list of Document objects, each representing a chunk from the crawled URL. - - Raises: - IOError: If there is an error processing the URL. - """ - try: - # Use Docling to convert the URL to a document - conversion_result = self.converter.convert(url) - docling_document = conversion_result.document - - # Chunk the document using hierarchical chunking - chunks = list(self.chunker.chunk(docling_document)) - - documents = [] - for chunk in chunks: - metadata = {"reference": url, "text": chunk.text} - documents.append(Document(page_content=chunk.text, metadata=metadata)) - - return documents - - except Exception as e: - log.color_print(f"Error processing URL {url}: {str(e)}") - raise IOError(f"Failed to process URL {url}: {str(e)}") - - @property - def supported_file_types(self) -> List[str]: - """ - Return the list of file types and formats supported by Docling. - - Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/): - - PDF - - Office formats: DOCX, XLSX, PPTX - - Markdown - - AsciiDoc - - HTML, XHTML - - CSV - - Images: PNG, JPEG, TIFF, BMP - - Returns: - A list of file extensions supported by this crawler. - """ - return [ - "pdf", - "docx", - "xlsx", - "pptx", - "md", - "adoc", - "asciidoc", - "html", - "xhtml", - "csv", - "png", - "jpg", - "jpeg", - "tif", - "tiff", - "bmp", - ] diff --git a/deepsearcher/loader/web_crawler/firecrawl_crawler.py b/deepsearcher/loader/web_crawler/firecrawl_crawler.py deleted file mode 100644 index e0f8e88..0000000 --- a/deepsearcher/loader/web_crawler/firecrawl_crawler.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -from typing import List, Optional - -from firecrawl import FirecrawlApp, ScrapeOptions -from langchain_core.documents import Document - -from deepsearcher.loader.web_crawler.base import BaseCrawler - - -class FireCrawlCrawler(BaseCrawler): - """ - Web crawler using the FireCrawl service. - - This crawler uses the FireCrawl service to crawl web pages and convert them - into markdown format for further processing. It supports both single-page scraping - and recursive crawling of multiple pages. - """ - - def __init__(self, **kwargs): - """ - Initialize the FireCrawlCrawler. - - Args: - **kwargs: Optional keyword arguments. - """ - super().__init__(**kwargs) - self.app = None - - def crawl_url( - self, - url: str, - max_depth: Optional[int] = None, - limit: Optional[int] = None, - allow_backward_links: Optional[bool] = None, - ) -> List[Document]: - """ - Dynamically crawls a URL using either scrape_url or crawl_url: - - - Uses scrape_url for single-page extraction if no params are provided. - - Uses crawl_url to recursively gather pages when any param is provided. - - Args: - url (str): The starting URL to crawl. - max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2). - limit (Optional[int]): Maximum number of pages to crawl (default: 20). 
- allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False). - - Returns: - List[Document]: List of Document objects with page content and metadata. - """ - # Lazy init - self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) - - # if user just inputs a single url as param - # scrape single page - if max_depth is None and limit is None and allow_backward_links is None: - # Call the new Firecrawl API, passing formats directly - scrape_response = self.app.scrape_url(url=url, formats=["markdown"]) - data = scrape_response.model_dump() - return [ - Document( - page_content=data.get("markdown", ""), - metadata={"reference": url, **data.get("metadata", {})}, - ) - ] - - # else, crawl multiple pages based on users' input params - # set default values if not provided - crawl_response = self.app.crawl_url( - url=url, - limit=limit or 20, - max_depth=max_depth or 2, - allow_backward_links=allow_backward_links or False, - scrape_options=ScrapeOptions(formats=["markdown"]), - poll_interval=5, - ) - items = crawl_response.model_dump().get("data", []) - - documents: List[Document] = [] - for item in items: - # Support items that are either dicts or Pydantic sub-models - item_dict = item.model_dump() if hasattr(item, "model_dump") else item - md = item_dict.get("markdown", "") - meta = item_dict.get("metadata", {}) - meta["reference"] = meta.get("url", url) - documents.append(Document(page_content=md, metadata=meta)) - - return documents diff --git a/deepsearcher/loader/web_crawler/jina_crawler.py b/deepsearcher/loader/web_crawler/jina_crawler.py deleted file mode 100644 index 873f207..0000000 --- a/deepsearcher/loader/web_crawler/jina_crawler.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -from typing import List - -import requests -from langchain_core.documents import Document - -from deepsearcher.loader.web_crawler.base import BaseCrawler - - -class JinaCrawler(BaseCrawler): - """ - Web crawler using Jina AI's rendering service. - - This crawler uses Jina AI's rendering service to crawl web pages and convert them - into markdown format for further processing. - """ - - def __init__(self, **kwargs): - """ - Initialize the JinaCrawler. - - Args: - **kwargs: Optional keyword arguments. - - Raises: - ValueError: If the JINA_API_TOKEN environment variable is not set. - """ - super().__init__(**kwargs) - self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY") - if not self.jina_api_token: - raise ValueError("Missing JINA_API_TOKEN environment variable") - - def crawl_url(self, url: str) -> List[Document]: - """ - Crawl a single URL using Jina AI's rendering service. - - Args: - url: The URL to crawl. - - Returns: - A list containing a single Document object with the markdown content and metadata. - - Raises: - HTTPError: If the request to Jina AI's service fails. 
- """ - jina_url = f"https://r.jina.ai/{url}" - headers = { - "Authorization": f"Bearer {self.jina_api_token}", - "X-Return-Format": "markdown", - } - - response = requests.get(jina_url, headers=headers) - response.raise_for_status() - - markdown_content = response.text - metadata = { - "reference": url, - "status_code": response.status_code, - "headers": dict(response.headers), - } - - return [Document(page_content=markdown_content, metadata=metadata)] diff --git a/deepsearcher/templates/html/index.html b/deepsearcher/templates/html/index.html index 53d77e9..52c754e 100644 --- a/deepsearcher/templates/html/index.html +++ b/deepsearcher/templates/html/index.html @@ -63,39 +63,6 @@ > -
-                    <!-- removed: "网站内容加载" (load website content) card — website URL input, collection name/description fields, load button, and load-status area; the surrounding HTML markup was lost during extraction -->
                     <!-- unchanged context: "智能查询" (smart query) card heading -->
diff --git a/deepsearcher/templates/static/js/app.js b/deepsearcher/templates/static/js/app.js index 7d1a3ef..e8ecadb 100644 --- a/deepsearcher/templates/static/js/app.js +++ b/deepsearcher/templates/static/js/app.js @@ -343,57 +343,6 @@ document } }); -// 加载网站内容功能 -document - .getElementById('loadWebsiteBtn') - .addEventListener('click', async function () { - const button = this; - const urlsInput = document.getElementById('websiteUrls').value; - const collectionName = document.getElementById('webCollectionName').value; - const collectionDesc = document.getElementById('webCollectionDesc').value; - - if (!urlsInput) { - showStatus('webLoadStatus', '请提供至少一个网站URL', 'error'); - return; - } - - const urls = urlsInput - .split(',') - .map((url) => url.trim()) - .filter((url) => url); - - setButtonLoading(button, true); - showStatus('webLoadStatus', ' 正在加载网站...', 'loading'); - hideResult(); - hideProcessResult(); - - try { - const response = await fetch('/load-website/', { - method: 'POST', - headers: { - 'Content-Type': 'application/json' - }, - body: JSON.stringify({ - urls: urls, - collection_name: collectionName || undefined, - collection_description: collectionDesc || undefined - }) - }); - - const data = await response.json(); - - if (response.ok) { - showStatus('webLoadStatus', data.message, 'success'); - } else { - showStatus('webLoadStatus', `加载失败: ${data.detail}`, 'error'); - } - } catch (error) { - showStatus('webLoadStatus', `请求失败: ${error.message}`, 'error'); - } finally { - setButtonLoading(button, false); - } - }); - // 查询功能 - 使用实时流 document .getElementById('queryBtn') diff --git a/test_web_only.py b/test_web_only.py deleted file mode 100644 index 5c6fc86..0000000 --- a/test_web_only.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -""" -只测试网页搜索功能 -""" - -import sys -import os -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -from deepsearcher.web_search import WebSearch - -def test_web_search(): - """测试网页搜索功能""" - print("=== 测试网页搜索功能 ===") - - # 初始化网页搜索 - web_search = WebSearch() - - # 测试查询 - test_query = "Milvus是什么" - print(f"测试查询: {test_query}") - - # 执行搜索 - results = web_search.search_with_retry(test_query, size=4) - - if results: - print(f"✅ 成功找到 {len(results)} 个搜索结果:") - for i, result in enumerate(results, 1): - print(f"\n--- 结果 {i} ---") - print(f"标题: {result.metadata.get('title', 'N/A')}") - print(f"链接: {result.reference}") - print(f"分数: {result.score}") - print(f"内容长度: {len(result.text)} 字符") - print(f"内容预览: {result.text[:200]}...") - print(f"来源: {result.metadata.get('source', 'N/A')}") - else: - print("❌ 未找到搜索结果") - -if __name__ == "__main__": - test_web_search() diff --git a/test_web_search.py b/test_web_search.py deleted file mode 100644 index 4387926..0000000 --- a/test_web_search.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python3 -""" -测试网页搜索功能 -""" - -import sys -import os -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -from deepsearcher.web_search import WebSearch -from deepsearcher import configuration - -def test_web_search(): - """测试网页搜索功能""" - print("=== 测试网页搜索功能 ===") - - # 初始化网页搜索 - web_search = WebSearch() - - # 测试查询 - test_query = "Milvus是什么" - print(f"测试查询: {test_query}") - - # 执行搜索 - results = web_search.search_with_retry(test_query, size=4) - - if results: - print(f"找到 {len(results)} 个搜索结果:") - for i, result in enumerate(results, 1): - print(f"\n--- 结果 {i} ---") - print(f"标题: {result.metadata.get('title', 'N/A')}") - print(f"链接: {result.reference}") - print(f"分数: {result.score}") - print(f"内容长度: 
{len(result.text)} 字符") - print(f"内容预览: {result.text[:200]}...") - else: - print("未找到搜索结果") - -def test_integration(): - """测试与DeepSearch的集成""" - print("\n=== 测试与DeepSearch的集成 ===") - - # 初始化配置 - configuration.init_config(configuration.config) - - # 创建DeepSearch实例(启用网页搜索) - from deepsearcher.agent.deep_search import DeepSearch - - searcher = DeepSearch( - llm=configuration.llm, - embedding_model=configuration.embedding_model, - vector_db=configuration.vector_db, - max_iter=2, - enable_web_search=True - ) - - # 测试查询 - test_query = "Milvus是什么" - print(f"测试查询: {test_query}") - - # 执行搜索 - results, sub_queries = searcher.retrieve(test_query, max_iter=2) - - print(f"生成的子问题: {sub_queries}") - print(f"找到 {len(results)} 个搜索结果") - # 显示结果统计 - web_results = [r for r in results if r.metadata and r.metadata.get("source") == "webpage"] - vector_results = [r for r in results if not r.metadata or r.metadata.get("source") != "webpage"] - - print(f"网页搜索结果: {len(web_results)} 个") - print(f"向量数据库结果: {len(vector_results)} 个") - -if __name__ == "__main__": - test_web_search() - test_integration()