import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page
    scraping and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments passed through to BaseCrawler.
        """
        super().__init__(**kwargs)
        self.app = None
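
    # Note that the Firecrawl client is created per crawl_url call rather than
    # in __init__, so FIRECRAWL_API_KEY is read from the environment at request
    # time instead of at construction time.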
    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:

        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the
                URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazily create the client; the API key comes from the environment.
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # If the user supplies only the URL, scrape that single page.
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the new Firecrawl API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # Otherwise, crawl multiple pages using the caller's parameters,
        # falling back to defaults for any that were not provided.
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )
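        # With the SDK's synchronous crawl, this call blocks until the job
        # finishes, checking status every poll_interval seconds; the completed
        # job's "data" list holds one entry per crawled page.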
        items = crawl_response.model_dump().get("data", [])

        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))

        return documents
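

# A minimal usage sketch (illustration only, not part of the original module).
# It assumes FIRECRAWL_API_KEY is set in the environment and that
# https://example.com is reachable.
if __name__ == "__main__":
    crawler = FireCrawlCrawler()

    # No optional parameters: single-page scrape via scrape_url.
    docs = crawler.crawl_url("https://example.com")
    print(docs[0].metadata.get("reference"))

    # Any optional parameter switches to a recursive crawl via crawl_url.
    docs = crawler.crawl_url("https://example.com", max_depth=1, limit=5)
    print(f"crawled {len(docs)} pages")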