import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page
    scraping and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments.
        """
        super().__init__(**kwargs)
        self.app = None

    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:
        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazy init of the Firecrawl client from the FIRECRAWL_API_KEY env var
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # If the user just inputs a single URL with no other params,
        # scrape that single page.
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the Firecrawl scrape API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # Otherwise, crawl multiple pages based on the user's input params,
        # falling back to defaults where a param is not provided.
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )

        items = crawl_response.model_dump().get("data", [])
        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))
        return documents
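

# Usage sketch (illustrative only, not part of the module): assumes the
# FIRECRAWL_API_KEY environment variable is set and the placeholder URL below
# is reachable; swap in a real starting URL before running.
if __name__ == "__main__":
    crawler = FireCrawlCrawler()

    # Single-page mode: with no extra params, crawl_url() delegates to scrape_url.
    single = crawler.crawl_url("https://example.com")
    print(len(single), single[0].metadata.get("reference"))

    # Recursive mode: passing any of max_depth / limit / allow_backward_links
    # triggers a multi-page crawl with the stated defaults for missing params.
    pages = crawler.crawl_url("https://example.com", max_depth=2, limit=5)
    for doc in pages:
        print(doc.metadata.get("reference"))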