You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

70 lines
2.2 KiB

import os
from abc import ABC
from typing import List
from langchain_core.documents import Document
class BaseLoader(ABC):
    """
    Base class for file loaders.

    Defines the interface for turning files and directories into Document
    objects. Concrete loaders should subclass this and override ``load_file``
    and ``supported_file_types``; ``load_directory`` is implemented here in
    terms of those two members.
    """

    def __init__(self, **kwargs):
        """
        Initialize the loader.

        Args:
            **kwargs: Optional keyword arguments for specific loader
                implementations. The base class ignores them.
        """
        pass

    def load_file(self, file_path: str) -> List["Document"]:
        """
        Load a single file and convert it to Document objects.

        The base implementation is a stub that returns None; subclasses are
        expected to override it.

        Args:
            file_path: Path to the file to be loaded.

        Returns:
            A list of Document objects containing the text and metadata.

        Note:
            It is recommended to include a reference to the file in the
            metadata, e.g.
            ``return [Document(page_content=..., metadata={"reference": file_path})]``
        """
        pass

    def load_directory(self, directory: str) -> List["Document"]:
        """
        Load all supported files from a directory tree, recursively.

        A file is considered supported when its name ends with a dot followed
        by one of the entries of ``supported_file_types`` (entries that
        already start with a dot are used as they are). Matching is
        case-sensitive. Each matching file is passed to ``load_file`` exactly
        once and the results are concatenated in ``os.walk`` order.

        Args:
            directory: Path to the directory containing files to be loaded.

        Returns:
            A list of Document objects from all supported files in the
            directory and its subdirectories.

        Raises:
            TypeError: If ``supported_file_types`` has not been overridden
                (the base property returns None, which is not iterable).
        """
        # Anchor every extension to a leading dot so that an entry such as
        # "md" matches "notes.md" but not "script.cmd" or "readmemd".
        suffixes = tuple(
            suffix if suffix.startswith(".") else "." + suffix
            for suffix in self.supported_file_types
        )

        documents = []
        for root, _, files in os.walk(directory):
            for file in files:
                # str.endswith accepts a tuple: true if any suffix matches.
                if file.endswith(suffixes):
                    full_path = os.path.join(root, file)
                    documents.extend(self.load_file(full_path))
        return documents

    @property
    def supported_file_types(self) -> List[str]:
        """
        Get the list of file extensions supported by this loader.

        The base implementation is a stub that returns None; subclasses are
        expected to override it.

        Returns:
            A list of supported file extensions (without the dot).
        """
        pass