# Web search client for the metaso.cn search API.
import http.client
import json
import time

from deepsearcher.utils import log
from deepsearcher.vector_db import RetrievalResult


class WebSearch:
    """Web search client for the metaso.cn search API.

    Wraps the ``POST /api/v1/search`` endpoint and converts each returned
    webpage entry into a ``RetrievalResult``.
    """

    def __init__(self, api_key: str = "mk-CCEA085159C048597435780530A55403"):
        """
        Initialize the web search client.

        Args:
            api_key (str): metaso.cn API key.
                NOTE(security): a real-looking key is checked into source as
                the default — rotate it and load it from configuration or an
                environment variable instead of relying on this default.
        """
        self.api_key = api_key
        self.base_url = "metaso.cn"       # HTTPS host of the search service
        self.endpoint = "/api/v1/search"  # search endpoint path

    def search(self, query: str, size: int = 4) -> "list[RetrievalResult]":
        """
        Run a single web search.

        Args:
            query (str): Search query text.
            size (int): Number of results to request (default 4).

        Returns:
            list[RetrievalResult]: Parsed results; empty list on any error.
            This method never raises — all failures are logged and swallowed.
        """
        conn = None  # predeclare so the finally block can always test it
        try:
            # Build the request payload. The API expects "size" as a string.
            payload = json.dumps({
                "q": query,
                "scope": "webpage",
                "includeSummary": False,
                "size": str(size),
                "includeRawContent": True,
                "conciseSnippet": True
            })

            headers = {
                'Authorization': f'Bearer {self.api_key}',
                'Accept': 'application/json',
                'Content-Type': 'application/json'
            }

            # Send the request.
            conn = http.client.HTTPSConnection(self.base_url)
            conn.request("POST", self.endpoint, payload, headers)
            res = conn.getresponse()
            data = res.read()

            if res.status != 200:
                log.error(f"网页搜索请求失败: {res.status} - {data.decode('utf-8')}")
                return []

            response_data = json.loads(data.decode("utf-8"))

            # Parse search results.
            results = []
            for i, webpage in enumerate(response_data.get("webpages", [])):
                # Prefer the full page content; fall back to the snippet.
                content = webpage.get("content", "") or webpage.get("snippet", "")

                results.append(RetrievalResult(
                    embedding=None,  # web search results carry no vector
                    text=content,
                    reference=webpage.get("link", ""),
                    # Rank-derived score: earlier hits score higher.
                    # max(size, 1) guards against division by zero when size=0.
                    score=1.0 - (i * (1 / max(size, 1))),
                    metadata={
                        "title": webpage.get("title", ""),
                        "date": webpage.get("date", ""),
                        "authors": webpage.get("authors", []),
                        "position": webpage.get("position", i + 1),
                        "source": "webpage"
                    }
                ))

            log.info(f"网页搜索成功,找到 {len(results)} 个结果")
            return results

        except Exception as e:
            log.error(f"网页搜索出错: {str(e)}")
            return []
        finally:
            # Close the connection whether or not the request succeeded.
            if conn is not None:
                conn.close()

    def search_with_retry(self, query: str, size: int = 4, max_retries: int = 3) -> "list[RetrievalResult]":
        """
        Web search with a simple retry loop.

        Note: ``search()`` swallows its own exceptions and signals failure by
        returning an empty list, so a failed attempt is detected by the empty
        result rather than by an exception. The ``except`` clause is kept
        defensively in case ``search()`` ever raises.

        Args:
            query (str): Search query text.
            size (int): Number of results to request.
            max_retries (int): Maximum number of attempts.

        Returns:
            list[RetrievalResult]: Results of the first successful attempt,
            or an empty list if every attempt fails.
        """
        for attempt in range(max_retries):
            try:
                results = self.search(query, size)
                if results:
                    return results
            except Exception as e:
                log.warning(f"网页搜索第 {attempt + 1} 次尝试失败: {str(e)}")
            # BUGFIX: the backoff previously ran only when search() raised —
            # which it never does — so empty-result failures retried with no
            # delay. Now every failed attempt waits before the next try.
            if attempt < max_retries - 1:
                time.sleep(1)  # wait 1 second before retrying

        log.error(f"网页搜索在 {max_retries} 次尝试后仍然失败")
        return []