# Web search client for the metaso.cn search API.
import http.client
import json
import time

from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult


class WebSearch:
    """Web search client for the metaso.cn search API.

    Wraps the ``POST /api/v1/search`` endpoint and converts each returned
    webpage entry into a ``RetrievalResult``.
    """

    def __init__(self, api_key: str = "mk-CCEA085159C048597435780530A55403"):
        """
        Initialize the web search client.

        Args:
            api_key (str): metaso.cn API key.
                NOTE(security): a real-looking key is checked into source as
                the default — rotate it and load it from configuration or an
                environment variable instead of relying on this default.
        """
        self.api_key = api_key
        self.base_url = "metaso.cn"       # HTTPS host of the search service
        self.endpoint = "/api/v1/search"  # search endpoint path

    def search(self, query: str, size: int = 4) -> "list[RetrievalResult]":
        """
        Run a single web search.

        Args:
            query (str): Search query text.
            size (int): Number of results to request (default 4).

        Returns:
            list[RetrievalResult]: Parsed results; empty list on any error.
            This method never raises — all failures are logged and swallowed.
        """
        conn = None  # predeclare so the finally block can always test it
        try:
            # Build the request payload. The API expects "size" as a string.
            payload = json.dumps({
                "q": query,
                "scope": "webpage",
                "includeSummary": False,
                "size": str(size),
                "includeRawContent": True,
                "conciseSnippet": True
            })

            headers = {
                'Authorization': f'Bearer {self.api_key}',
                'Accept': 'application/json',
                'Content-Type': 'application/json'
            }

            # Send the request.
            conn = http.client.HTTPSConnection(self.base_url)
            conn.request("POST", self.endpoint, payload, headers)
            res = conn.getresponse()
            data = res.read()

            if res.status != 200:
                log.error(f"网页搜索请求失败: {res.status} - {data.decode('utf-8')}")
                return []

            response_data = json.loads(data.decode("utf-8"))

            # Parse search results.
            results = []
            for i, webpage in enumerate(response_data.get("webpages", [])):
                # Prefer the full page content; fall back to the snippet.
                content = webpage.get("content", "") or webpage.get("snippet", "")

                results.append(RetrievalResult(
                    embedding=None,  # web search results carry no vector
                    text=content,
                    reference=webpage.get("link", ""),
                    # Rank-derived score: earlier hits score higher.
                    # max(size, 1) guards against division by zero when size=0.
                    score=1.0 - (i * (1 / max(size, 1))),
                    metadata={
                        "title": webpage.get("title", ""),
                        "date": webpage.get("date", ""),
                        "authors": webpage.get("authors", []),
                        "position": webpage.get("position", i + 1),
                        "source": "webpage"
                    }
                ))

            log.info(f"网页搜索成功,找到 {len(results)} 个结果")
            return results

        except Exception as e:
            log.error(f"网页搜索出错: {str(e)}")
            return []
        finally:
            # Close the connection whether or not the request succeeded.
            if conn is not None:
                conn.close()

    def search_with_retry(self, query: str, size: int = 4, max_retries: int = 3) -> "list[RetrievalResult]":
        """
        Web search with a simple retry loop.

        Note: ``search()`` swallows its own exceptions and signals failure by
        returning an empty list, so a failed attempt is detected by the empty
        result rather than by an exception. The ``except`` clause is kept
        defensively in case ``search()`` ever raises.

        Args:
            query (str): Search query text.
            size (int): Number of results to request.
            max_retries (int): Maximum number of attempts.

        Returns:
            list[RetrievalResult]: Results of the first successful attempt,
            or an empty list if every attempt fails.
        """
        for attempt in range(max_retries):
            try:
                results = self.search(query, size)
                if results:
                    return results
            except Exception as e:
                log.warning(f"网页搜索第 {attempt + 1} 次尝试失败: {str(e)}")
            # BUGFIX: the backoff previously ran only when search() raised —
            # which it never does — so empty-result failures retried with no
            # delay. Now every failed attempt waits before the next try.
            if attempt < max_retries - 1:
                time.sleep(1)  # wait 1 second before retrying

        log.error(f"网页搜索在 {max_retries} 次尝试后仍然失败")
        return []