You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
1.8 KiB
62 lines
1.8 KiB
import os
|
|
from typing import List
|
|
|
|
import requests
|
|
from langchain_core.documents import Document
|
|
|
|
from deepsearcher.loader.web_crawler.base import BaseCrawler
|
|
|
|
|
|
class JinaCrawler(BaseCrawler):
    """
    Web crawler using Jina AI's rendering service.

    This crawler uses Jina AI's rendering service to crawl web pages and convert them
    into markdown format for further processing.
    """

    # Upper bound (in seconds) passed to ``requests.get`` so that a stalled
    # connection to the rendering service cannot block the caller forever.
    # Subclasses may override this value.
    REQUEST_TIMEOUT = 60

    def __init__(self, **kwargs):
        """
        Initialize the JinaCrawler.

        The API token is read from the ``JINA_API_TOKEN`` environment variable,
        falling back to ``JINAAI_API_KEY`` when the former is unset or empty.

        Args:
            **kwargs: Optional keyword arguments, forwarded to the base class.

        Raises:
            ValueError: If neither JINA_API_TOKEN nor JINAAI_API_KEY is set.
        """
        super().__init__(**kwargs)
        self.jina_api_token = os.getenv("JINA_API_TOKEN") or os.getenv("JINAAI_API_KEY")
        if not self.jina_api_token:
            raise ValueError("Missing JINA_API_TOKEN environment variable")

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL using Jina AI's rendering service.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata
            (the original URL under "reference", the HTTP status code, and the response headers).

        Raises:
            HTTPError: If the request to Jina AI's service fails.
            requests.Timeout: If the service does not respond within ``REQUEST_TIMEOUT`` seconds.
        """
        # The target URL is appended verbatim to the rendering endpoint.
        jina_url = f"https://r.jina.ai/{url}"
        headers = {
            "Authorization": f"Bearer {self.jina_api_token}",
            "X-Return-Format": "markdown",
        }

        # Without an explicit timeout ``requests`` waits indefinitely.
        response = requests.get(jina_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        markdown_content = response.text
        metadata = {
            "reference": url,
            "status_code": response.status_code,
            "headers": dict(response.headers),
        }

        return [Document(page_content=markdown_content, metadata=metadata)]
|
|
|