You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

117 lines
3.8 KiB

import os
from typing import List
from langchain_core.documents import Document
from deepsearcher.loader.file_loader.base import BaseLoader
from deepsearcher.utils import log
class DoclingLoader(BaseLoader):
    """
    Loader that utilizes Docling's DocumentConverter and HierarchicalChunker
    to convert and chunk files (e.g. Markdown or HTML) into Document objects.
    """

    def __init__(self):
        """
        Initialize the DoclingLoader with DocumentConverter and HierarchicalChunker instances.
        """
        # Imported here rather than at module level, so the docling packages
        # are only imported when a DoclingLoader is actually instantiated.
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def load_file(self, file_path: str) -> List[Document]:
        """
        Load a local file using docling's conversion and perform hierarchical chunking.

        The path must exist on the local filesystem: the ``os.path.exists``
        guard below rejects anything else (including URLs) before docling is
        ever called.

        Args:
            file_path: Path of the local file to be loaded.

        Returns:
            A list of Document objects, each representing a chunk. Every
            document carries the chunk text as ``page_content`` and a metadata
            dict with ``reference`` (the file path) and ``text`` (the chunk text).

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file type is not supported.
            IOError: If conversion or chunking fails; the original exception
                is attached as ``__cause__``.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Error: File '{file_path}' does not exist.")

        # Check if the file has a supported extension (case-insensitive,
        # compared without the leading dot).
        supported_types = self.supported_file_types
        file_extension = os.path.splitext(file_path)[1].lower().lstrip(".")
        if file_extension not in supported_types:
            supported_formats = ", ".join(supported_types)
            raise ValueError(
                f"Unsupported file type: '{file_extension}'. "
                f"Supported file types are: {supported_formats}"
            )

        try:
            conversion_result = self.converter.convert(file_path)
            docling_document = conversion_result.document
            # Iterate the chunker output directly; no intermediate list needed.
            return [
                Document(
                    page_content=chunk.text,
                    metadata={"reference": file_path, "text": chunk.text},
                )
                for chunk in self.chunker.chunk(docling_document)
            ]
        except Exception as e:
            # Any failure during conversion/chunking is logged and surfaced as
            # IOError, chained to the underlying error for debugging.
            log.color_print(f"Error processing file {file_path}: {str(e)}")
            raise IOError(f"Failed to process file {file_path}: {str(e)}") from e

    def load_directory(self, directory: str) -> List[Document]:
        """
        Load files from a directory.

        Validates that ``directory`` is an existing directory and then
        delegates the actual loading to ``BaseLoader.load_directory``.

        Args:
            directory: Path to the directory containing files to be loaded.

        Returns:
            The list of Document objects produced by the base-class
            implementation.

        Raises:
            NotADirectoryError: If the specified path is not a directory.
        """
        if not os.path.isdir(directory):
            raise NotADirectoryError(f"Error: '{directory}' is not a directory.")
        return super().load_directory(directory)

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file extensions supported by this loader.

        Extensions are lowercase and have no leading dot. A new list is
        returned on every access.

        Supported formats (refer to the official website:
        https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]