from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log


class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    This crawler leverages Docling's capabilities to convert web pages into structured
    documents and chunk them appropriately for further processing.
    """

    def __init__(self, **kwargs):
        """
        Initialize the DoclingCrawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments, forwarded to the base class initializer.
        """
        super().__init__(**kwargs)
        # Imported here rather than at module level, so importing this module
        # does not itself import docling; the import happens on instantiation.
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.
                They are accepted but not read by this implementation.

        Returns:
            A list of Document objects, each representing a chunk from the crawled URL.
            Each document carries the chunk text as ``page_content`` and a metadata
            dict with ``reference`` (the source URL) and ``text`` (the chunk text again).

        Raises:
            IOError: If there is an error processing the URL. The original exception
                is attached as ``__cause__``.
        """
        try:
            # Use Docling to convert the URL to a document
            docling_document = self.converter.convert(url).document

            # Chunk the document and wrap every chunk in a Document; the chunker
            # output is consumed directly instead of being copied into a list first.
            return [
                Document(
                    page_content=chunk.text,
                    metadata={"reference": url, "text": chunk.text},
                )
                for chunk in self.chunker.chunk(docling_document)
            ]

        except Exception as e:
            log.color_print(f"Error processing URL {url}: {e}")
            # Chain explicitly so the traceback shows the underlying failure as
            # the direct cause rather than as an error raised during handling.
            raise IOError(f"Failed to process URL {url}: {e}") from e

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation: https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of file extensions (lowercase, without a leading dot) supported
            by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]