You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
98 lines
3.0 KiB
98 lines
3.0 KiB
from typing import List
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from deepsearcher.loader.web_crawler.base import BaseCrawler
|
|
from deepsearcher.utils import log
|
|
|
|
|
|
class DoclingCrawler(BaseCrawler):
    """
    Web crawler using Docling's DocumentConverter and HierarchicalChunker.

    Each URL is converted into a Docling document, split into chunks by the
    hierarchical chunker, and every chunk is returned as a LangChain
    ``Document``.
    """

    def __init__(self, **kwargs):
        """
        Initialize the crawler with DocumentConverter and HierarchicalChunker instances.

        Args:
            **kwargs: Optional keyword arguments forwarded to the base class.
        """
        super().__init__(**kwargs)

        # Imported here rather than at module level, so Docling is only
        # required when a DoclingCrawler is actually instantiated
        # (presumably because it is an optional dependency -- confirm).
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def crawl_url(self, url: str, **crawl_kwargs) -> List[Document]:
        """
        Crawl a single URL using Docling's conversion and perform hierarchical chunking.

        Args:
            url: The URL to crawl.
            **crawl_kwargs: Optional keyword arguments for the crawling process.
                They are accepted for interface compatibility but not used here.

        Returns:
            A list of Document objects, one per chunk. Each document carries the
            chunk text as ``page_content`` and a metadata dict with the source
            URL under ``"reference"`` and the chunk text under ``"text"``.

        Raises:
            IOError: If any step of converting or chunking the URL fails. The
                original exception is attached as ``__cause__``.
        """
        try:
            # Use Docling to convert the URL to a document.
            docling_document = self.converter.convert(url).document

            # The chunker output is consumed directly inside the ``try`` so that
            # errors raised while iterating it are still reported as IOError.
            return [
                Document(
                    page_content=chunk.text,
                    metadata={"reference": url, "text": chunk.text},
                )
                for chunk in self.chunker.chunk(docling_document)
            ]
        except Exception as e:
            log.color_print(f"Error processing URL {url}: {str(e)}")
            # Chain the cause explicitly so the underlying failure stays
            # visible in the traceback.
            raise IOError(f"Failed to process URL {url}: {str(e)}") from e

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file types and formats supported by Docling.

        Supported formats (refer to the official Docling documentation:
        https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP

        Returns:
            A list of file extensions (lowercase, without a leading dot)
            supported by this crawler.
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]
|
|
|