6 changed files with 335 additions and 68 deletions
@ -0,0 +1,113 @@ |
|||
import http.client |
|||
import json |
|||
import time |
|||
from deepsearcher.vector_db import RetrievalResult |
|||
from deepsearcher.utils import log |
|||
|
|||
|
|||
class WebSearch: |
|||
"""网页搜索类,用于调用metaso.cn API进行网页搜索""" |
|||
def __init__(self, api_key: str = "mk-CCEA085159C048597435780530A55403"): |
|||
""" |
|||
初始化网页搜索 |
|||
Args: |
|||
api_key (str): metaso.cn API密钥 |
|||
""" |
|||
self.api_key = api_key |
|||
self.base_url = "metaso.cn" |
|||
self.endpoint = "/api/v1/search" |
|||
|
|||
def search(self, query: str, size: int = 4) -> list[RetrievalResult]: |
|||
""" |
|||
执行网页搜索 |
|||
Args: |
|||
query (str): 搜索查询 |
|||
size (int): 返回结果数量,默认为4 |
|||
Returns: |
|||
List[RetrievalResult]: 搜索结果列表 |
|||
""" |
|||
try: |
|||
# 构建请求数据 |
|||
payload = json.dumps({ |
|||
"q": query, |
|||
"scope": "webpage", |
|||
"includeSummary": False, |
|||
"size": str(size), |
|||
"includeRawContent": True, |
|||
"conciseSnippet": True |
|||
}) |
|||
|
|||
headers = { |
|||
'Authorization': f'Bearer {self.api_key}', |
|||
'Accept': 'application/json', |
|||
'Content-Type': 'application/json' |
|||
} |
|||
|
|||
# 发送请求 |
|||
conn = http.client.HTTPSConnection(self.base_url) |
|||
conn.request("POST", self.endpoint, payload, headers) |
|||
res = conn.getresponse() |
|||
data = res.read() |
|||
|
|||
if res.status != 200: |
|||
log.error(f"网页搜索请求失败: {res.status} - {data.decode('utf-8')}") |
|||
return [] |
|||
|
|||
response_data = json.loads(data.decode("utf-8")) |
|||
|
|||
# 解析搜索结果 |
|||
results = [] |
|||
if "webpages" in response_data: |
|||
for i, webpage in enumerate(response_data["webpages"]): |
|||
# 使用content字段作为主要文本内容 |
|||
content = webpage.get("content", "") |
|||
if not content: |
|||
content = webpage.get("snippet", "") |
|||
|
|||
# 创建RetrievalResult对象 |
|||
result = RetrievalResult( |
|||
embedding=None, # 网页搜索结果没有向量 |
|||
text=content, |
|||
reference=webpage.get("link", ""), |
|||
score=1.0 - (i * (1 / size)), # 根据位置计算分数 |
|||
metadata={ |
|||
"title": webpage.get("title", ""), |
|||
"date": webpage.get("date", ""), |
|||
"authors": webpage.get("authors", []), |
|||
"position": webpage.get("position", i + 1), |
|||
"source": "webpage" |
|||
} |
|||
) |
|||
results.append(result) |
|||
|
|||
log.info(f"网页搜索成功,找到 {len(results)} 个结果") |
|||
return results |
|||
|
|||
except Exception as e: |
|||
log.error(f"网页搜索出错: {str(e)}") |
|||
return [] |
|||
finally: |
|||
if 'conn' in locals(): |
|||
conn.close() |
|||
|
|||
def search_with_retry(self, query: str, size: int = 4, max_retries: int = 3) -> list[RetrievalResult]: |
|||
""" |
|||
带重试机制的网页搜索 |
|||
Args: |
|||
query (str): 搜索查询 |
|||
size (int): 返回结果数量 |
|||
max_retries (int): 最大重试次数 |
|||
Returns: |
|||
List[RetrievalResult]: 搜索结果列表 |
|||
""" |
|||
for attempt in range(max_retries): |
|||
try: |
|||
results = self.search(query, size) |
|||
if results: |
|||
return results |
|||
except Exception as e: |
|||
log.warning(f"网页搜索第 {attempt + 1} 次尝试失败: {str(e)}") |
|||
if attempt < max_retries - 1: |
|||
time.sleep(1) # 等待1秒后重试 |
|||
log.error(f"网页搜索在 {max_retries} 次尝试后仍然失败") |
|||
return [] |
@ -0,0 +1,40 @@ |
|||
#!/usr/bin/env python3 |
|||
""" |
|||
只测试网页搜索功能 |
|||
""" |
|||
|
|||
import sys |
|||
import os |
|||
sys.path.append(os.path.dirname(os.path.abspath(__file__))) |
|||
|
|||
from deepsearcher.web_search import WebSearch |
|||
|
|||
def test_web_search():
    """Manual smoke test for the WebSearch client; prints results to stdout."""
    print("=== 测试网页搜索功能 ===")

    client = WebSearch()

    query = "Milvus是什么"
    print(f"测试查询: {query}")

    hits = client.search_with_retry(query, size=4)

    # Guard clause: bail out early when nothing came back.
    if not hits:
        print("❌ 未找到搜索结果")
        return

    print(f"✅ 成功找到 {len(hits)} 个搜索结果:")
    for rank, hit in enumerate(hits, 1):
        print(f"\n--- 结果 {rank} ---")
        print(f"标题: {hit.metadata.get('title', 'N/A')}")
        print(f"链接: {hit.reference}")
        print(f"分数: {hit.score}")
        print(f"内容长度: {len(hit.text)} 字符")
        print(f"内容预览: {hit.text[:200]}...")
        print(f"来源: {hit.metadata.get('source', 'N/A')}")
|||
|
|||
# Script entry point: run the standalone web-search smoke test.
if __name__ == "__main__":
    test_web_search()
@ -0,0 +1,75 @@ |
|||
#!/usr/bin/env python3 |
|||
""" |
|||
测试网页搜索功能 |
|||
""" |
|||
|
|||
import sys |
|||
import os |
|||
sys.path.append(os.path.dirname(os.path.abspath(__file__))) |
|||
|
|||
from deepsearcher.web_search import WebSearch |
|||
from deepsearcher import configuration |
|||
|
|||
def test_web_search():
    """Manual smoke test for the WebSearch client; prints results to stdout."""
    print("=== 测试网页搜索功能 ===")

    client = WebSearch()

    query = "Milvus是什么"
    print(f"测试查询: {query}")

    hits = client.search_with_retry(query, size=4)

    # Guard clause: bail out early when nothing came back.
    if not hits:
        print("未找到搜索结果")
        return

    print(f"找到 {len(hits)} 个搜索结果:")
    for rank, hit in enumerate(hits, 1):
        print(f"\n--- 结果 {rank} ---")
        print(f"标题: {hit.metadata.get('title', 'N/A')}")
        print(f"链接: {hit.reference}")
        print(f"分数: {hit.score}")
        print(f"内容长度: {len(hit.text)} 字符")
        print(f"内容预览: {hit.text[:200]}...")
|||
|
|||
def test_integration():
    """Manual integration test: run DeepSearch with web search enabled."""
    print("\n=== 测试与DeepSearch的集成 ===")

    # Initialize the project configuration (LLM, embedding model, vector DB).
    configuration.init_config(configuration.config)

    # Imported here so the standalone smoke test above can run without the
    # full agent stack being importable.
    from deepsearcher.agent.deep_search import DeepSearch

    agent = DeepSearch(
        llm=configuration.llm,
        embedding_model=configuration.embedding_model,
        vector_db=configuration.vector_db,
        max_iter=2,
        enable_web_search=True,
    )

    query = "Milvus是什么"
    print(f"测试查询: {query}")

    docs, sub_queries = agent.retrieve(query, max_iter=2)

    print(f"生成的子问题: {sub_queries}")
    print(f"找到 {len(docs)} 个搜索结果")

    # Split the hits by origin for a quick sanity check of the mix.
    from_web = [d for d in docs if d.metadata and d.metadata.get("source") == "webpage"]
    from_db = [d for d in docs if not (d.metadata and d.metadata.get("source") == "webpage")]

    print(f"网页搜索结果: {len(from_web)} 个")
    print(f"向量数据库结果: {len(from_db)} 个")
|||
|
|||
# Script entry point: run the standalone search test, then the DeepSearch
# integration test.
if __name__ == "__main__":
    test_web_search()
    test_integration()
Loading…
Reference in new issue