
chore: remove the frontend and backend logic for manual web page content loading

main · tanxing · 5 days ago · commit eaf6a4a9b5
14 changed files:

  1. deepsearcher/cli.py (118 lines changed)
  2. deepsearcher/config.yaml (40 lines changed)
  3. deepsearcher/configuration.py (14 lines changed)
  4. deepsearcher/loader/splitter.py (3 lines changed)
  5. deepsearcher/loader/web_crawler/__init__.py (11 lines changed)
  6. deepsearcher/loader/web_crawler/base.py (55 lines changed)
  7. deepsearcher/loader/web_crawler/crawl4ai_crawler.py (140 lines changed)
  8. deepsearcher/loader/web_crawler/docling_crawler.py (98 lines changed)
  9. deepsearcher/loader/web_crawler/firecrawl_crawler.py (88 lines changed)
  10. deepsearcher/loader/web_crawler/jina_crawler.py (62 lines changed)
  11. deepsearcher/templates/html/index.html (33 lines changed)
  12. deepsearcher/templates/static/js/app.js (51 lines changed)
  13. test_web_only.py (40 lines changed)
  14. test_web_search.py (75 lines changed)

118
deepsearcher/cli.py

@@ -1,118 +0,0 @@
import argparse
import logging
import sys
import warnings

from deepsearcher.configuration import Configuration, init_config
from deepsearcher.offline_loading import load_from_local_files, load_from_website
from deepsearcher.online_query import query
from deepsearcher.utils import log

httpx_logger = logging.getLogger("httpx")  # disable openai's logger output
httpx_logger.setLevel(logging.WARNING)

warnings.simplefilter(action="ignore", category=FutureWarning)  # disable warning output


def main():
    """
    Main entry point for the DeepSearcher CLI.

    This function parses command line arguments and executes the appropriate action
    based on the subcommand provided (query or load). It handles the deprecated
    command line format and provides helpful error messages.

    Returns:
        None
    """
    if "--query" in sys.argv or "--load" in sys.argv:
        print("\033[91m[Deprecated]\033[0m The use of '--query' and '--load' is deprecated.")
        print("Please use:")
        print("  deepsearcher query <your_query> --max_iter 3")
        print(
            "  deepsearcher load <your_local_path_or_url> --collection_name <your_collection_name> --collection_desc <your_collection_description>"
        )
        sys.exit(1)

    config = Configuration()  # Customize your config here
    init_config(config=config)

    parser = argparse.ArgumentParser(prog="deepsearcher", description="Deep Searcher.")
    subparsers = parser.add_subparsers(dest="subcommand", title="subcommands")

    ## Arguments of query
    query_parser = subparsers.add_parser("query", help="Query a question or search topic.")
    query_parser.add_argument("query", type=str, default="", help="query question or search topic.")
    query_parser.add_argument(
        "--max_iter",
        type=int,
        default=3,
        help="Max iterations of reflection. Default is 3.",
    )

    ## Arguments of loading
    load_parser = subparsers.add_parser(
        "load", help="Load knowledge from local files or from URLs."
    )
    load_parser.add_argument(
        "load_path",
        type=str,
        nargs="+",  # 1 or more files or urls
        help="Load knowledge from local files or from URLs.",
    )
    load_parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="Batch size for loading knowledge.",
    )
    load_parser.add_argument(
        "--collection_name",
        type=str,
        default=None,
        help="Destination collection name of loaded knowledge.",
    )
    load_parser.add_argument(
        "--collection_desc",
        type=str,
        default=None,
        help="Description of the collection.",
    )
    load_parser.add_argument(
        "--force_new_collection",
        type=bool,
        default=False,
        help="If you want to drop origin collection and create a new collection on every load, set to True",
    )

    args = parser.parse_args()

    if args.subcommand == "query":
        final_answer, refs, consumed_tokens = query(args.query, max_iter=args.max_iter)
        log.color_print("\n==== FINAL ANSWER====\n")
        log.color_print(final_answer)
        log.color_print("\n### References\n")
        for i, ref in enumerate(refs):
            log.color_print(f"{i + 1}. {ref.text[:60]}{ref.reference}")
    elif args.subcommand == "load":
        urls = [url for url in args.load_path if url.startswith("http")]
        local_files = [file for file in args.load_path if not file.startswith("http")]
        kwargs = {}
        if args.collection_name:
            kwargs["collection_name"] = args.collection_name
        if args.collection_desc:
            kwargs["collection_description"] = args.collection_desc
        if args.force_new_collection:
            kwargs["force_new_collection"] = args.force_new_collection
        if args.batch_size:
            kwargs["batch_size"] = args.batch_size

        if len(urls) > 0:
            load_from_website(urls, **kwargs)
        if len(local_files) > 0:
            load_from_local_files(local_files, **kwargs)
    else:
        print("Please provide a query or a load argument.")


if __name__ == "__main__":
    main()
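For reference, the query path this CLI wrapped can still be driven programmatically; a minimal sketch, assuming the deepsearcher package is installed and deepsearcher/config.yaml is valid (the question string is illustrative):

# Programmatic equivalent of `deepsearcher query` (sketch; not part of this diff).
from deepsearcher.configuration import Configuration, init_config
from deepsearcher.online_query import query

config = Configuration()      # loads the default config.yaml
init_config(config=config)    # wires up LLM, embedding model, file loader, vector DB

final_answer, refs, consumed_tokens = query("What is Milvus?", max_iter=3)
print(final_answer)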

40
deepsearcher/config.yaml

@@ -36,26 +36,6 @@ provide_settings:
    # config: {}

-  web_crawler:
-    provider: "FireCrawlCrawler"
-    config: {}
-    # provider: "Crawl4AICrawler"
-    # config: # Uncomment to custom browser configuration for Crawl4AI
-    #   browser_config:
-    #     headless: false
-    #     proxy: "http://127.0.0.1:7890"
-    #     chrome_channel: "chrome"
-    #     verbose: true
-    #     viewport_width: 800
-    #     viewport_height: 600
-    # provider: "JinaCrawler"
-    # config: {}
-    # provider: "DoclingCrawler"
-    # config: {}

  vector_db:
    provider: "Milvus"
    config:
@@ -64,27 +44,9 @@ provide_settings:
      token: "root:Milvus"
      db: "default"

-  # vector_db:
-  #   provider: "OracleDB"
-  #   config:
-  #     default_collection: "deepsearcher"
-  #     user: ""
-  #     password: ""
-  #     dsn: ""
-  #     config_dir: ""
-  #     wallet_location: ""
-  #     wallet_password: ""

-  # vector_db:
-  #   provider: "Qdrant"
-  #   config:
-  #     default_collection: "deepsearcher"
-  #     host: "localhost"
-  #     port: 6333

query_settings:
-  max_iter: 3
+  max_iter: 2
+  enable_web_search: true

load_settings:
  chunk_size: 2048

14
deepsearcher/configuration.py

@@ -7,7 +7,6 @@ from deepsearcher.agent import BaseAgent, DeepSearch
from deepsearcher.embedding.base import BaseEmbedding
from deepsearcher.llm.base import BaseLLM
from deepsearcher.loader.file_loader.base import BaseLoader
-from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.vector_db.base import BaseVectorDB

current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -21,7 +20,7 @@ class Configuration:
    Configuration class for DeepSearcher.

    This class manages the configuration settings for various components of the DeepSearcher system,
-    including LLM providers, embedding models, file loaders, web crawlers, and vector databases.
+    including LLM providers, embedding models, file loaders and vector databases.
    It loads configurations from a YAML file and provides methods to get and set provider configurations.
    """
@@ -151,14 +150,6 @@ class ModuleFactory:
        """
        return self._create_module_instance("file_loader", "deepsearcher.loader.file_loader")

-    def create_web_crawler(self) -> BaseCrawler:
-        """
-        Create an instance of a web crawler.
-
-        Returns:
-            An instance of a BaseCrawler implementation.
-        """
-        return self._create_module_instance("web_crawler", "deepsearcher.loader.web_crawler")

    def create_vector_db(self) -> BaseVectorDB:
        """
@@ -177,7 +168,6 @@ llm: BaseLLM = None
embedding_model: BaseEmbedding = None
file_loader: BaseLoader = None
vector_db: BaseVectorDB = None
-web_crawler: BaseCrawler = None
default_searcher: BaseAgent = None

def init_config(config: Configuration):
@@ -196,13 +186,11 @@ def init_config(config: Configuration):
        embedding_model, \
        file_loader, \
        vector_db, \
-        web_crawler, \
        default_searcher

    module_factory = ModuleFactory(config)
    llm = module_factory.create_llm()
    embedding_model = module_factory.create_embedding()
    file_loader = module_factory.create_file_loader()
-    web_crawler = module_factory.create_web_crawler()
    vector_db = module_factory.create_vector_db()
    default_searcher = DeepSearch(
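The create_* methods above all delegate to _create_module_instance; a rough sketch of that provider-factory pattern, with a hypothetical helper name and illustrative arguments (the project's real implementation may differ):

import importlib

def create_provider_instance(module_package: str, class_name: str, config: dict):
    # e.g. module_package="deepsearcher.loader.file_loader", class_name from config.yaml's "provider"
    module = importlib.import_module(module_package)
    provider_cls = getattr(module, class_name)
    return provider_cls(**config)   # provider-specific config becomes constructor kwargs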

3
deepsearcher/loader/splitter.py

@@ -70,8 +70,7 @@ def _sentence_window_split(
            max(0, start_index - offset) : min(len(original_text), end_index + offset)
        ]
        reference = doc.metadata.pop("reference", "")
-        doc.metadata["wider_text"] = wider_text
-        chunk = Chunk(text=doc_text, reference=reference, metadata=doc.metadata)
+        chunk = Chunk(text=wider_text, reference=reference, metadata=doc.metadata)
        chunks.append(chunk)
    return chunks
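The change above makes each chunk's text carry the sentence window itself rather than stashing it in metadata; a toy illustration of the windowing slice (text, positions, and offset are made up):

original_text = "Milvus is a vector database. It stores embeddings. It scales well."
start_index, end_index, offset = 29, 50, 10   # pretend the matched sentence spans [29:50]

wider_text = original_text[max(0, start_index - offset): min(len(original_text), end_index + offset)]
print(wider_text)  # the sentence plus roughly 10 characters of context on each side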

11
deepsearcher/loader/web_crawler/__init__.py

@@ -1,11 +0,0 @@
from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler
from deepsearcher.loader.web_crawler.docling_crawler import DoclingCrawler
from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler
from deepsearcher.loader.web_crawler.jina_crawler import JinaCrawler

__all__ = [
    "FireCrawlCrawler",
    "JinaCrawler",
    "Crawl4AICrawler",
    "DoclingCrawler",
]

55
deepsearcher/loader/web_crawler/base.py

@@ -1,55 +0,0 @@
from abc import ABC
from typing import List

from langchain_core.documents import Document


class BaseCrawler(ABC):
    """
    Abstract base class for web crawlers.

    This class defines the interface for crawling web pages and converting them
    into Document objects for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with optional keyword arguments.

        Args:
            **kwargs: Optional keyword arguments for specific crawler implementations.
        """
        pass

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL and convert it to Document objects.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from the URL.

        Note:
            Implementations should include the URL reference in the metadata.
            e.g. return [Document(page_content=..., metadata={"reference": "www.abc.com/page1.html"})]
        """
        pass

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs and return a list of Document objects.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects containing the content and metadata from all URLs.
        """
        documents = []
        for url in urls:
            documents.extend(self.crawl_url(url, **crawl_kwargs))
        return documents
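A minimal sketch of how a subclass satisfied the removed BaseCrawler contract: only crawl_url needs overriding, and the inherited crawl_urls fans out over it (DummyCrawler and the URLs are illustrative):

from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler  # module existed before this commit

class DummyCrawler(BaseCrawler):
    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        # Real crawlers fetch and convert the page; this stub just echoes the URL.
        return [Document(page_content=f"stub content for {url}", metadata={"reference": url})]

docs = DummyCrawler().crawl_urls(["https://example.com/a", "https://example.com/b"])
print(len(docs))  # 2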

140
deepsearcher/loader/web_crawler/crawl4ai_crawler.py

@@ -1,140 +0,0 @@
import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
    into markdown format for further processing. It supports both single-page crawling
    and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser configuration
        only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata,
            or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []

            for result in results:
                markdown_content = result.markdown or ""

                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }

                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")

                documents.append(Document(page_content=markdown_content, metadata=metadata))

            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs,
            or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []
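Use of the removed Crawl4AICrawler was synchronous on the outside; a sketch, assuming crawl4ai is installed and the module above still exists (browser options and URL are illustrative):

from deepsearcher.loader.web_crawler.crawl4ai_crawler import Crawl4AICrawler  # pre-removal import

crawler = Crawl4AICrawler(browser_config={"headless": True})
docs = crawler.crawl_url("https://example.com")   # wraps the async crawl with asyncio.run
for doc in docs:
    print(doc.metadata["reference"], len(doc.page_content))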

98
deepsearcher/loader/web_crawler/docling_crawler.py

@@ -1,98 +0,0 @@
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    This crawler leverages Docling's capabilities to convert web pages into structured
    documents and chunk them appropriately for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects, each representing a chunk from the crawled URL.

        Raises:
            IOError: If there is an error processing the URL.
        """
        try:
            # Use Docling to convert the URL to a document
            conversion_result = self.converter.convert(url)
            docling_document = conversion_result.document

            # Chunk the document using hierarchical chunking
            chunks = list(self.chunker.chunk(docling_document))

            documents = []
            for chunk in chunks:
                metadata = {"reference": url, "text": chunk.text}
                documents.append(Document(page_content=chunk.text, metadata=metadata))
            return documents
        except Exception as e:
            log.color_print(f"Error processing URL {url}: {str(e)}")
            raise IOError(f"Failed to process URL {url}: {str(e)}")

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of file extensions supported by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]
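The Docling flow the removed DoclingCrawler wrapped is simply convert-then-chunk; a sketch, assuming docling and docling-core are installed (the URL is illustrative):

from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker import HierarchicalChunker

result = DocumentConverter().convert("https://example.com")   # fetch and parse the page
for chunk in HierarchicalChunker().chunk(result.document):    # hierarchical chunking
    print(chunk.text[:80])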

88
deepsearcher/loader/web_crawler/firecrawl_crawler.py

@@ -1,88 +0,0 @@
import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page scraping
    and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        self.app = None

    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:
        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazy init
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # if user just inputs a single url as param
        # scrape single page
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the new Firecrawl API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # else, crawl multiple pages based on users' input params
        # set default values if not provided
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )
        items = crawl_response.model_dump().get("data", [])

        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))
        return documents
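The removed FireCrawlCrawler chose between a single-page scrape and a recursive crawl based on whether any crawl parameters were passed; a usage sketch, assuming FIRECRAWL_API_KEY is set and the module above still exists (URL and limits are illustrative):

from deepsearcher.loader.web_crawler.firecrawl_crawler import FireCrawlCrawler  # pre-removal import

crawler = FireCrawlCrawler()
single_page = crawler.crawl_url("https://example.com")                          # scrape_url path
whole_site = crawler.crawl_url("https://example.com", max_depth=2, limit=20)    # crawl_url path
print(len(single_page), len(whole_site))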

62
deepsearcher/loader/web_crawler/jina_crawler.py

@@ -1,62 +0,0 @@
import os
from typing import List

import requests
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class JinaCrawler(BaseCrawler):
    """
    Web crawler using Jina AI's rendering service.

    This crawler uses Jina AI's rendering service to crawl web pages and convert them
    into markdown format for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the JinaCrawler.

        Args:
            **kwargs: Optional keyword arguments.

        Raises:
            ValueError: If the JINA_API_TOKEN environment variable is not set.
        """
        super().__init__(**kwargs)
        self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
        if not self.jina_api_token:
            raise ValueError("Missing JINA_API_TOKEN environment variable")

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL using Jina AI's rendering service.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata.

        Raises:
            HTTPError: If the request to Jina AI's service fails.
        """
        jina_url = f"https://r.jina.ai/{url}"
        headers = {
            "Authorization": f"Bearer {self.jina_api_token}",
            "X-Return-Format": "markdown",
        }

        response = requests.get(jina_url, headers=headers)
        response.raise_for_status()

        markdown_content = response.text

        metadata = {
            "reference": url,
            "status_code": response.status_code,
            "headers": dict(response.headers),
        }

        return [Document(page_content=markdown_content, metadata=metadata)]
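The removed JinaCrawler is a thin wrapper over the Jina Reader endpoint shown above; a sketch of the same request without the class, assuming JINA_API_TOKEN is set (target URL is illustrative):

import os

import requests

response = requests.get(
    "https://r.jina.ai/https://example.com",   # Reader endpoint prefixes the target URL
    headers={
        "Authorization": f"Bearer {os.environ['JINA_API_TOKEN']}",
        "X-Return-Format": "markdown",
    },
)
response.raise_for_status()
print(response.text[:200])                     # markdown rendering of the page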

33
deepsearcher/templates/html/index.html

@@ -63,39 +63,6 @@
        ></div>
      </div>

-      <div class="card">
-        <h2 class="card-title">网站内容加载</h2>
-        <div class="form-group">
-          <label for="websiteUrls">网站URL(多个URL用逗号分隔)</label>
-          <input
-            type="text"
-            id="websiteUrls"
-            placeholder="例如: https://example.com/page1,https://example.com/page2"
-          />
-        </div>
-        <div class="form-group">
-          <label for="webCollectionName">集合名称(可选)</label>
-          <input
-            type="text"
-            id="webCollectionName"
-            placeholder="例如: web_collection"
-          />
-        </div>
-        <div class="form-group">
-          <label for="webCollectionDesc">集合描述(可选)</label>
-          <textarea
-            id="webCollectionDesc"
-            rows="2"
-            placeholder="例如: 来自网站的内容"
-          ></textarea>
-        </div>
-        <button id="loadWebsiteBtn">加载网站内容</button>
-        <div
-          id="webLoadStatus"
-          class="status"
-        ></div>
-      </div>

      <div class="card">
        <h2 class="card-title">智能查询</h2>
        <div class="form-group">

51
deepsearcher/templates/static/js/app.js

@@ -343,57 +343,6 @@ document
    }
  });

-// 加载网站内容功能
-document
-  .getElementById('loadWebsiteBtn')
-  .addEventListener('click', async function () {
-    const button = this;
-    const urlsInput = document.getElementById('websiteUrls').value;
-    const collectionName = document.getElementById('webCollectionName').value;
-    const collectionDesc = document.getElementById('webCollectionDesc').value;
-
-    if (!urlsInput) {
-      showStatus('webLoadStatus', '请提供至少一个网站URL', 'error');
-      return;
-    }
-
-    const urls = urlsInput
-      .split(',')
-      .map((url) => url.trim())
-      .filter((url) => url);
-
-    setButtonLoading(button, true);
-    showStatus('webLoadStatus', ' 正在加载网站...', 'loading');
-    hideResult();
-    hideProcessResult();
-
-    try {
-      const response = await fetch('/load-website/', {
-        method: 'POST',
-        headers: {
-          'Content-Type': 'application/json'
-        },
-        body: JSON.stringify({
-          urls: urls,
-          collection_name: collectionName || undefined,
-          collection_description: collectionDesc || undefined
-        })
-      });
-
-      const data = await response.json();
-
-      if (response.ok) {
-        showStatus('webLoadStatus', data.message, 'success');
-      } else {
-        showStatus('webLoadStatus', `加载失败: ${data.detail}`, 'error');
-      }
-    } catch (error) {
-      showStatus('webLoadStatus', `请求失败: ${error.message}`, 'error');
-    } finally {
-      setButtonLoading(button, false);
-    }
-  });

// 查询功能 - 使用实时流
document
  .getElementById('queryBtn')

40
test_web_only.py

@@ -1,40 +0,0 @@
#!/usr/bin/env python3
"""
只测试网页搜索功能
"""
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch


def test_web_search():
    """测试网页搜索功能"""
    print("=== 测试网页搜索功能 ===")

    # 初始化网页搜索
    web_search = WebSearch()

    # 测试查询
    test_query = "Milvus是什么"
    print(f"测试查询: {test_query}")

    # 执行搜索
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"✅ 成功找到 {len(results)} 个搜索结果:")
        for i, result in enumerate(results, 1):
            print(f"\n--- 结果 {i} ---")
            print(f"标题: {result.metadata.get('title', 'N/A')}")
            print(f"链接: {result.reference}")
            print(f"分数: {result.score}")
            print(f"内容长度: {len(result.text)} 字符")
            print(f"内容预览: {result.text[:200]}...")
            print(f"来源: {result.metadata.get('source', 'N/A')}")
    else:
        print("❌ 未找到搜索结果")


if __name__ == "__main__":
    test_web_search()

75
test_web_search.py

@@ -1,75 +0,0 @@
#!/usr/bin/env python3
"""
测试网页搜索功能
"""
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from deepsearcher.web_search import WebSearch
from deepsearcher import configuration


def test_web_search():
    """测试网页搜索功能"""
    print("=== 测试网页搜索功能 ===")

    # 初始化网页搜索
    web_search = WebSearch()

    # 测试查询
    test_query = "Milvus是什么"
    print(f"测试查询: {test_query}")

    # 执行搜索
    results = web_search.search_with_retry(test_query, size=4)

    if results:
        print(f"找到 {len(results)} 个搜索结果:")
        for i, result in enumerate(results, 1):
            print(f"\n--- 结果 {i} ---")
            print(f"标题: {result.metadata.get('title', 'N/A')}")
            print(f"链接: {result.reference}")
            print(f"分数: {result.score}")
            print(f"内容长度: {len(result.text)} 字符")
            print(f"内容预览: {result.text[:200]}...")
    else:
        print("未找到搜索结果")


def test_integration():
    """测试与DeepSearch的集成"""
    print("\n=== 测试与DeepSearch的集成 ===")

    # 初始化配置
    configuration.init_config(configuration.config)

    # 创建DeepSearch实例(启用网页搜索)
    from deepsearcher.agent.deep_search import DeepSearch

    searcher = DeepSearch(
        llm=configuration.llm,
        embedding_model=configuration.embedding_model,
        vector_db=configuration.vector_db,
        max_iter=2,
        enable_web_search=True
    )

    # 测试查询
    test_query = "Milvus是什么"
    print(f"测试查询: {test_query}")

    # 执行搜索
    results, sub_queries = searcher.retrieve(test_query, max_iter=2)

    print(f"生成的子问题: {sub_queries}")
    print(f"找到 {len(results)} 个搜索结果")

    # 显示结果统计
    web_results = [r for r in results if r.metadata and r.metadata.get("source") == "webpage"]
    vector_results = [r for r in results if not r.metadata or r.metadata.get("source") != "webpage"]
    print(f"网页搜索结果: {len(web_results)}")
    print(f"向量数据库结果: {len(vector_results)}")


if __name__ == "__main__":
    test_web_search()
    test_integration()