You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
118 lines
3.8 KiB
118 lines
3.8 KiB
2 weeks ago
|
import os
|
||
|
from typing import List
|
||
|
|
||
|
from langchain_core.documents import Document
|
||
|
|
||
|
from deepsearcher.loader.file_loader.base import BaseLoader
|
||
|
from deepsearcher.utils import log
|
||
|
|
||
|
|
||
|
class DoclingLoader(BaseLoader):
    """
    Loader that utilizes Docling's DocumentConverter and HierarchicalChunker
    to convert and chunk files (e.g. Markdown or HTML) into Document objects.

    Each chunk produced by the chunker becomes one ``Document`` whose
    ``page_content`` is the chunk text and whose metadata records the source
    path under ``"reference"`` and the chunk text under ``"text"``.
    """

    def __init__(self):
        """
        Initialize the DoclingLoader with DocumentConverter and HierarchicalChunker instances.
        """
        # Imported here rather than at module level so that importing this
        # module does not itself require docling; the dependency is only
        # needed once a DoclingLoader is actually constructed.
        from docling.document_converter import DocumentConverter
        from docling_core.transforms.chunker import HierarchicalChunker

        self.converter = DocumentConverter()
        self.chunker = HierarchicalChunker()

    def load_file(self, file_path: str) -> List[Document]:
        """
        Load a local file using docling's conversion and perform hierarchical chunking.

        The path must exist on the local filesystem: the existence check below
        rejects anything ``os.path.exists`` does not recognise, which includes
        URLs.

        Args:
            file_path: Path of the local file to be loaded.

        Returns:
            A list of Document objects, each representing a chunk.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file type is not supported.
            IOError: If conversion or chunking fails for any reason. The
                original exception is attached as ``__cause__``.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Error: File '{file_path}' does not exist.")

        # Check if the file has a supported extension (case-insensitive,
        # compared without the leading dot).
        file_extension = os.path.splitext(file_path)[1].lower().lstrip(".")
        if file_extension not in self.supported_file_types:
            supported_formats = ", ".join(self.supported_file_types)
            raise ValueError(
                f"Unsupported file type: '{file_extension}'. "
                f"Supported file types are: {supported_formats}"
            )

        try:
            conversion_result = self.converter.convert(file_path)
            docling_document = conversion_result.document

            # Build the documents directly from the chunk iterator; there is
            # no need to materialise an intermediate list of chunks first.
            return [
                Document(
                    page_content=chunk.text,
                    metadata={"reference": file_path, "text": chunk.text},
                )
                for chunk in self.chunker.chunk(docling_document)
            ]
        except Exception as e:
            log.color_print(f"Error processing file {file_path}: {e}")
            # Chain explicitly so the underlying failure stays visible in the
            # traceback as the direct cause of the IOError.
            raise IOError(f"Failed to process file {file_path}: {e}") from e

    def load_directory(self, directory: str) -> List[Document]:
        """
        Load all supported files from a directory.

        Validates that the path is a directory and then delegates the actual
        loading to the base class implementation.

        Args:
            directory: Path to the directory containing files to be loaded.

        Returns:
            A list of Document objects from all supported files in the directory.

        Raises:
            NotADirectoryError: If the specified path is not a directory.
        """
        if not os.path.isdir(directory):
            raise NotADirectoryError(f"Error: '{directory}' is not a directory.")

        return super().load_directory(directory)

    @property
    def supported_file_types(self) -> List[str]:
        """
        Return the list of file extensions supported by this loader.

        Extensions are lowercase and have no leading dot, matching the form
        ``load_file`` compares against.

        Supported formats (refer to the official website: https://docling-project.github.io/docling/usage/supported_formats/):
        - PDF
        - Office formats: DOCX, XLSX, PPTX
        - Markdown
        - AsciiDoc
        - HTML, XHTML
        - CSV
        - Images: PNG, JPEG, TIFF, BMP
        """
        return [
            "pdf",
            "docx",
            "xlsx",
            "pptx",
            "md",
            "adoc",
            "asciidoc",
            "html",
            "xhtml",
            "csv",
            "png",
            "jpg",
            "jpeg",
            "tif",
            "tiff",
            "bmp",
        ]
|