import os
from typing import List, Optional

from firecrawl import FirecrawlApp, ScrapeOptions
from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler


class FireCrawlCrawler(BaseCrawler):
    """
    Web crawler using the FireCrawl service.

    This crawler uses the FireCrawl service to crawl web pages and convert them
    into markdown format for further processing. It supports both single-page
    scraping and recursive crawling of multiple pages.
    """

    def __init__(self, **kwargs):
        """
        Initialize the FireCrawlCrawler.

        Args:
            **kwargs: Optional keyword arguments passed through to BaseCrawler.
        """
        super().__init__(**kwargs)
        self.app = None
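
    # Note that the Firecrawl client is created per crawl_url call rather than
    # in __init__, so FIRECRAWL_API_KEY is read from the environment at request
    # time instead of at construction time.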
    def crawl_url(
        self,
        url: str,
        max_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
    ) -> List[Document]:
        """
        Dynamically crawls a URL using either scrape_url or crawl_url:

        - Uses scrape_url for single-page extraction if no params are provided.
        - Uses crawl_url to recursively gather pages when any param is provided.

        Args:
            url (str): The starting URL to crawl.
            max_depth (Optional[int]): Maximum depth for recursive crawling (default: 2).
            limit (Optional[int]): Maximum number of pages to crawl (default: 20).
            allow_backward_links (Optional[bool]): Allow crawling pages outside the
                URL's children (default: False).

        Returns:
            List[Document]: List of Document objects with page content and metadata.
        """
        # Lazily create the client; the API key comes from the environment.
        self.app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

        # If the user supplies only the URL, scrape that single page.
        if max_depth is None and limit is None and allow_backward_links is None:
            # Call the new Firecrawl API, passing formats directly
            scrape_response = self.app.scrape_url(url=url, formats=["markdown"])
            data = scrape_response.model_dump()
            return [
                Document(
                    page_content=data.get("markdown", ""),
                    metadata={"reference": url, **data.get("metadata", {})},
                )
            ]

        # Otherwise, crawl multiple pages using the caller's parameters,
        # falling back to defaults for any that were not provided.
        crawl_response = self.app.crawl_url(
            url=url,
            limit=limit or 20,
            max_depth=max_depth or 2,
            allow_backward_links=allow_backward_links or False,
            scrape_options=ScrapeOptions(formats=["markdown"]),
            poll_interval=5,
        )
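        # With the SDK's synchronous crawl, this call blocks until the job
        # finishes, checking status every poll_interval seconds; the completed
        # job's "data" list holds one entry per crawled page.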
        items = crawl_response.model_dump().get("data", [])

        documents: List[Document] = []
        for item in items:
            # Support items that are either dicts or Pydantic sub-models
            item_dict = item.model_dump() if hasattr(item, "model_dump") else item
            md = item_dict.get("markdown", "")
            meta = item_dict.get("metadata", {})
            meta["reference"] = meta.get("url", url)
            documents.append(Document(page_content=md, metadata=meta))

        return documents
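

# A minimal usage sketch (illustration only, not part of the original module).
# It assumes FIRECRAWL_API_KEY is set in the environment and that
# https://example.com is reachable.
if __name__ == "__main__":
    crawler = FireCrawlCrawler()

    # No optional parameters: single-page scrape via scrape_url.
    docs = crawler.crawl_url("https://example.com")
    print(docs[0].metadata.get("reference"))

    # Any optional parameter switches to a recursive crawl via crawl_url.
    docs = crawler.crawl_url("https://example.com", max_depth=1, limit=5)
    print(f"crawled {len(docs)} pages")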