import http.client
import json
import time

from deepsearcher.vector_db import RetrievalResult
from deepsearcher.utils import log


class WebSearch:
    """Web search client that queries the metaso.cn search API."""

    # NOTE(security): an API key should not be hard-coded in source. The
    # default is kept only for backward compatibility — prefer passing the
    # key explicitly (e.g. from an environment variable).
    def __init__(self, api_key: str = "mk-CCEA085159C048597435780530A55403"):
        """
        Initialize the web search client.

        Args:
            api_key (str): metaso.cn API key.
        """
        self.api_key = api_key
        self.base_url = "metaso.cn"
        self.endpoint = "/api/v1/search"

    def search(self, query: str, size: int = 4) -> list[RetrievalResult]:
        """
        Perform one web search request.

        Args:
            query (str): Search query.
            size (int): Number of results to request (default 4).

        Returns:
            list[RetrievalResult]: Parsed results; an empty list on any
            HTTP or parsing failure (errors are logged, never raised).
        """
        conn = None  # created lazily so the finally clause can close it safely
        try:
            payload = json.dumps({
                "q": query,
                "scope": "webpage",
                "includeSummary": False,
                "size": str(size),  # API expects the size as a string
                "includeRawContent": True,
                "conciseSnippet": True
            })
            headers = {
                'Authorization': f'Bearer {self.api_key}',
                'Accept': 'application/json',
                'Content-Type': 'application/json'
            }

            conn = http.client.HTTPSConnection(self.base_url)
            conn.request("POST", self.endpoint, payload, headers)
            res = conn.getresponse()
            data = res.read()

            if res.status != 200:
                log.error(f"网页搜索请求失败: {res.status} - {data.decode('utf-8')}")
                return []

            response_data = json.loads(data.decode("utf-8"))

            results = []
            for i, webpage in enumerate(response_data.get("webpages", [])):
                # Prefer the full page content; fall back to the short snippet.
                content = webpage.get("content", "") or webpage.get("snippet", "")

                result = RetrievalResult(
                    embedding=None,  # web results carry no vector
                    text=content,
                    reference=webpage.get("link", ""),
                    # Rank-based score, clamped so positions past `size`
                    # cannot yield negative values.
                    score=max(0.0, 1.0 - i * (1 / size)),
                    metadata={
                        "title": webpage.get("title", ""),
                        "date": webpage.get("date", ""),
                        "authors": webpage.get("authors", []),
                        "position": webpage.get("position", i + 1),
                        "source": "webpage"
                    }
                )
                results.append(result)

            log.info(f"网页搜索成功,找到 {len(results)} 个结果")
            return results

        except Exception as e:
            log.error(f"网页搜索出错: {str(e)}")
            return []
        finally:
            if conn is not None:
                conn.close()

    def search_with_retry(self, query: str, size: int = 4,
                          max_retries: int = 3) -> list[RetrievalResult]:
        """
        Web search with a simple retry loop.

        Note: ``search()`` swallows its own exceptions and returns ``[]``,
        so this loop effectively retries on *empty* results as well as on
        errors; the except clause is kept as a defensive boundary.

        Args:
            query (str): Search query.
            size (int): Number of results to request.
            max_retries (int): Maximum number of attempts.

        Returns:
            list[RetrievalResult]: Results from the first non-empty attempt,
            or an empty list if every attempt fails.
        """
        for attempt in range(max_retries):
            try:
                results = self.search(query, size)
                if results:
                    return results
            except Exception as e:
                log.warning(f"网页搜索第 {attempt + 1} 次尝试失败: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(1)  # brief pause before the next attempt

        log.error(f"网页搜索在 {max_retries} 次尝试后仍然失败")
        return []