You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

98 lines
3.0 KiB

from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.web_crawler.base import BaseCrawler
from deepsearcher.utils import log
class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    Each crawled URL is converted into a Docling document, split into chunks
    by the hierarchical chunker, and every chunk is returned as a LangChain
    ``Document``.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments, forwarded unchanged to the
                base crawler's constructor.
        """
        super().__init__(**kwargs)
        # Imported here rather than at module level, so the docling packages
        # are only needed once a DoclingCrawler is actually instantiated.
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.
                They are accepted for interface compatibility and are not used
                by this implementation.

        Returns:
            A list of Document objects, one per chunk. Each document carries
            the chunk text as ``page_content`` and a metadata dict with the
            source URL under ``"reference"`` and the same chunk text under
            ``"text"``.

        Raises:
            IOError: If conversion or chunking fails for any reason. The
                original exception is attached as ``__cause__``.
        """
        try:
            # Use Docling to convert the URL to a document.
            docling_document = self.converter.convert(url).document
            # Consume the chunker's output directly; building the documents
            # inside the ``try`` keeps chunking errors covered by the handler.
            return [
                Document(
                    page_content=chunk.text,
                    metadata={"reference": url, "text": chunk.text},
                )
                for chunk in self.chunker.chunk(docling_document)
            ]
        except Exception as e:
            log.color_print(f"Error processing URL {url}: {e}")
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise IOError(f"Failed to process URL {url}: {e}") from e

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation:
        https://docling-project.github.io/docling/usage/supported_formats/):
            - PDF
            - Office formats: DOCX, XLSX, PPTX
            - Markdown
            - AsciiDoc
            - HTML, XHTML
            - CSV
            - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of lowercase file extensions (without a leading dot)
            supported by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]