from typing import List

from langchain_core.documents import Document

from deepsearcher.loader.file_loader.base import BaseLoader


class PDFLoader(BaseLoader):
    """
    Loader for PDF files.

    This loader handles PDF files and also supports text files with extensions like .txt and .md,
    converting them into Document objects for further processing.
    """

    def __init__(self):
        """
        Initialize the PDFLoader.
        """
        pass

    def load_file(self, file_path: str) -> List[Document]:
        """
        Load a PDF (or plain-text) file and convert it to a Document object.

        The file type is chosen from the file extension, compared case-insensitively.
        For PDFs, the text of every page is extracted and the pages are joined with a
        blank line between them. For .txt and .md files, the content is read as UTF-8.

        Args:
            file_path: Path to the file to be loaded.

        Returns:
            A list containing a single Document object with the file content as
            ``page_content`` and the original ``file_path`` stored under the
            ``"reference"`` metadata key. An empty list is returned (and a warning
            is logged) when the file extension is not supported.

        Note:
            This loader also supports .txt and .md files for convenience.
        """
        # Compare on a lower-cased copy so that "REPORT.PDF" is handled like "report.pdf";
        # the original path is still what gets opened and recorded as the reference.
        lowered_path = file_path.lower()

        if lowered_path.endswith(".pdf"):
            # Imported lazily so that loading plain-text files does not require pdfplumber.
            import pdfplumber

            with pdfplumber.open(file_path) as pdf:
                # extract_text() can yield None for a page without extractable text
                # (e.g. scanned or blank pages in some pdfplumber versions); fall back
                # to "" so that str.join does not raise TypeError.
                page_content = "\n\n".join(page.extract_text() or "" for page in pdf.pages)
        elif lowered_path.endswith((".txt", ".md")):
            with open(file_path, "r", encoding="utf-8") as text_file:
                page_content = text_file.read()
        else:
            # Previously this case fell through and implicitly returned None, which
            # contradicts the declared return type. Return an empty (still falsy) list
            # and make the skipped file visible in the logs instead.
            import logging

            logging.getLogger(__name__).warning(
                "PDFLoader: unsupported file type, skipping: %s", file_path
            )
            return []

        return [Document(page_content=page_content, metadata={"reference": file_path})]

    @property
    def supported_file_types(self) -> List[str]:
        """
        Get the list of file extensions supported by this loader.

        Returns:
            A list of supported file extensions: ["pdf", "md", "txt"].
        """
        return ["pdf", "md", "txt"]