deep-searcher/deepsearcher/loader/web_crawler/docling_crawler.py


								from typing import List


								from langchain_core.documents import Document


								from deepsearcher.loader.web_crawler.base import BaseCrawler

								from deepsearcher.utils import log


								class DoclingCrawler(BaseCrawler):
								    """
								    Web crawler built on Docling's DocumentConverter and HierarchicalChunker.

								    Each URL is converted into a Docling document, split into chunks by the
								    hierarchical chunker, and returned as one ``Document`` per chunk.
								    """

								    def __init__(self, **kwargs):
								        """
								        Initialize the crawler and create its converter and chunker instances.

								        Args:
								            **kwargs: Optional keyword arguments, forwarded to the base class
								                initializer.
								        """
								        super().__init__(**kwargs)

								        # NOTE(review): imported here rather than at module level, presumably
								        # so docling is only required when this crawler is instantiated --
								        # confirm before moving these to the top of the file.
								        from docling.document_converter import DocumentConverter
								        from docling_core.transforms.chunker import HierarchicalChunker

								        self.converter = DocumentConverter()
								        self.chunker = HierarchicalChunker()

								    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
								        """
								        Convert a single URL with Docling and split the result into chunks.

								        Args:
								            url: The URL to crawl.
								            **crawl_kwargs: Optional keyword arguments for the crawling process.
								                Currently ignored by this implementation.

								        Returns:
								            A list of Document objects, one per chunk. Each document's
								            ``page_content`` is the chunk text; its metadata holds the source
								            URL under ``"reference"`` and the same chunk text under ``"text"``.

								        Raises:
								            IOError: If conversion or chunking fails for any reason. The
								                original exception is attached as ``__cause__``.
								        """
								        try:
								            # Use Docling to convert the URL to a document.
								            conversion_result = self.converter.convert(url)
								            docling_document = conversion_result.document

								            # Chunk hierarchically and wrap every chunk in a Document. The
								            # chunker output is consumed inside the ``try`` so failures raised
								            # while iterating it are reported the same way as conversion errors.
								            return [
								                Document(
								                    page_content=chunk.text,
								                    metadata={"reference": url, "text": chunk.text},
								                )
								                for chunk in self.chunker.chunk(docling_document)
								            ]
								        except Exception as e:
								            log.color_print(f"Error processing URL {url}: {e}")
								            # Chain the cause explicitly so the underlying failure stays
								            # visible in the traceback of the IOError callers receive.
								            raise IOError(f"Failed to process URL {url}: {e}") from e

								    @property
								    def supported_file_types(self) -> List[str]:
								        """
								        Return the list of file types and formats supported by Docling.

								        Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
								        - PDF
								        - Office formats: DOCX, XLSX, PPTX
								        - Markdown
								        - AsciiDoc
								        - HTML, XHTML
								        - CSV
								        - Images: PNG, JPEG, TIFF, BMP

								        Returns:
								            A new list of lowercase file extensions (without a leading dot)
								            supported by this crawler.
								        """
								        return [
								            "pdf",
								            "docx",
								            "xlsx",
								            "pptx",
								            "md",
								            "adoc",
								            "asciidoc",
								            "html",
								            "xhtml",
								            "csv",
								            "png",
								            "jpg",
								            "jpeg",
								            "tif",
								            "tiff",
								            "bmp",
								        ]