You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
55 lines
1.7 KiB
55 lines
1.7 KiB
2 weeks ago
|
from typing import List
|
||
|
|
||
|
from langchain_core.documents import Document
|
||
|
|
||
|
from deepsearcher.loader.file_loader.base import BaseLoader
|
||
|
|
||
|
|
||
|
class PDFLoader(BaseLoader):
    """
    Loader for PDF files.

    This loader handles PDF files and also supports plain-text files with the
    extensions .txt and .md, converting them into Document objects for further
    processing.
    """

    def __init__(self):
        """
        Initialize the PDFLoader.
        """
        pass

    def load_file(self, file_path: str) -> List[Document]:
        """
        Load a PDF (or plain-text) file and convert it to a Document object.

        Args:
            file_path: Path to the file to be loaded. The extension is matched
                case-insensitively against .pdf, .txt and .md.

        Returns:
            A list containing a single Document object whose ``page_content`` is
            the file's text and whose metadata holds the original ``file_path``
            under the ``"reference"`` key.

        Raises:
            ValueError: If the file extension is not one of .pdf, .txt or .md.

        Note:
            This loader also supports .txt and .md files for convenience.
        """
        # Compare on a lower-cased copy so that e.g. "REPORT.PDF" is accepted;
        # the original path is still what gets opened and recorded.
        lowered_path = file_path.lower()

        if lowered_path.endswith(".pdf"):
            # Imported lazily so text files can be loaded even when pdfplumber
            # is not installed.
            import pdfplumber

            with pdfplumber.open(file_path) as pdf:
                # extract_text() may return None for a page without any text
                # (seen in some pdfplumber releases); substitute an empty string
                # so the join cannot fail with a TypeError.
                page_content = "\n\n".join(page.extract_text() or "" for page in pdf.pages)
        elif lowered_path.endswith((".txt", ".md")):
            with open(file_path, "r", encoding="utf-8") as text_file:
                page_content = text_file.read()
        else:
            # Previously this case fell through and returned None, which
            # violated the declared return type and failed later in callers.
            raise ValueError(
                f"Unsupported file type for PDFLoader: {file_path!r}. "
                f"Supported extensions: {self.supported_file_types}"
            )

        return [Document(page_content=page_content, metadata={"reference": file_path})]

    @property
    def supported_file_types(self) -> List[str]:
        """
        Get the list of file extensions supported by this loader.

        Returns:
            A list of supported file extensions: ["pdf", "md", "txt"].
        """
        return ["pdf", "md", "txt"]
|