import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and
    convert them into markdown format for further processing. It supports both
    single-page crawling and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser
        configuration only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = (
                BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            )
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and
            metadata, or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []

            for result in results:
                markdown_content = result.markdown or ""

                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }

                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")

                documents.append(Document(page_content=markdown_content, metadata=metadata))

            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all
            URLs, or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []