import asyncio
from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class Crawl4AICrawler(BaseCrawler):
    """
    Web crawler using the Crawl4AI library.

    This crawler uses the Crawl4AI library to crawl web pages asynchronously and convert them
    into markdown format for further processing. It supports both single-page crawling
    and batch crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Crawl4AICrawler.

        Args:
            **kwargs: Optional keyword arguments.
                browser_config: Configuration for the browser used by Crawl4AI.
        """
        super().__init__(**kwargs)
        self.crawler = None  # Lazy init
        self.browser_config = kwargs.get("browser_config", None)
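
    # The browser_config value is expected to be a dict of BrowserConfig options
    # (e.g. {"headless": True}); it is passed to BrowserConfig.from_kwargs in
    # _lazy_init the first time a crawl runs. The accepted keys depend on the
    # installed crawl4ai version.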

    def _lazy_init(self):
        """
        Initialize the crawler lazily when needed.

        This method creates the AsyncWebCrawler instance with the provided browser configuration
        only when it's first needed, to avoid unnecessary initialization.
        """
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        if self.crawler is None:
            config = BrowserConfig.from_kwargs(self.browser_config) if self.browser_config else None
            self.crawler = AsyncWebCrawler(config=config)

    async def _async_crawl(self, url: str) -> Document:
        """
        Asynchronously crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A Document object with the markdown content and metadata from the URL.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            result = await crawler.arun(url)

            markdown_content = result.markdown or ""

            metadata = {
                "reference": url,
                "success": result.success,
                "status_code": result.status_code,
                "media": result.media,
                "links": result.links,
            }

            if hasattr(result, "metadata") and result.metadata:
                metadata["title"] = result.metadata.get("title", "")
                metadata["author"] = result.metadata.get("author", "")

            return Document(page_content=markdown_content, metadata=metadata)

    def crawl_url(self, url: str) -> List[Document]:
        """
        Crawl a single URL.

        Args:
            url: The URL to crawl.

        Returns:
            A list containing a single Document object with the markdown content and metadata,
            or an empty list if an error occurs.
        """
        try:
            document = asyncio.run(self._async_crawl(url))
            return [document]
        except Exception as e:
            log.error(f"Error during crawling {url}: {e}")
            return []
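
    # NOTE: crawl_url and crawl_urls rely on asyncio.run(), which starts a new event
    # loop and therefore cannot be called from code that is already running inside an
    # event loop; in that case, await _async_crawl / _async_crawl_many directly.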

    async def _async_crawl_many(self, urls: List[str]) -> List[Document]:
        """
        Asynchronously crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs.
        """
        if self.crawler is None:
            self._lazy_init()

        async with self.crawler as crawler:
            results = await crawler.arun_many(urls)
            documents = []
            for result in results:
                markdown_content = result.markdown or ""
                metadata = {
                    "reference": result.url,
                    "success": result.success,
                    "status_code": result.status_code,
                    "media": result.media,
                    "links": result.links,
                }
                if hasattr(result, "metadata") and result.metadata:
                    metadata["title"] = result.metadata.get("title", "")
                    metadata["author"] = result.metadata.get("author", "")
                documents.append(Document(page_content=markdown_content, metadata=metadata))
            return documents

    def crawl_urls(self, urls: List[str], **crawl_kwargs) -> List[Document]:
        """
        Crawl multiple URLs.

        Args:
            urls: A list of URLs to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.

        Returns:
            A list of Document objects with the markdown content and metadata from all URLs,
            or an empty list if an error occurs.
        """
        try:
            return asyncio.run(self._async_crawl_many(urls))
        except Exception as e:
            log.error(f"Error during crawling {urls}: {e}")
            return []
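

# Minimal usage sketch, assuming crawl4ai and langchain_core are installed; the URL
# and the browser_config keys below are illustrative placeholders, and the exact
# BrowserConfig options depend on the installed crawl4ai version.
if __name__ == "__main__":
    crawler = Crawl4AICrawler(browser_config={"headless": True})
    docs = crawler.crawl_url("https://example.com")
    for doc in docs:
        print(doc.metadata.get("reference"), doc.metadata.get("status_code"), len(doc.page_content))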