# Source code for automated_document_parser.loaders.pdf_load.base

"""
Base PDF loader class and type definitions.
"""

import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Literal

from langchain_core.documents import Document

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Closed set of string identifiers for PDF loading methods.
# NOTE(review): presumably each name selects one BasePDFLoader
# implementation -- confirm against the code that consumes this type.
PDFLoaderMethod = Literal[
    "pypdf", "unstructured", "amazon_textract",
    "mathpix", "pdfplumber", "pypdfium2",
    "pymupdf", "pymupdf4llm", "opendataloader",
]



[docs]
class BasePDFLoader(ABC):
    """
    Abstract base class for PDF loaders.

    All PDF loader implementations should inherit from this class and
    implement ``load`` and ``get_install_command``.

    Attributes:
        file_path: A ``pathlib.Path`` for local files, or the location as a
            plain string when it starts with one of the remote scheme
            prefixes (``http://``, ``https://``, ``s3://``).
        kwargs: Loader-specific keyword arguments captured at construction.
    """

    # Full scheme prefixes that mark a location as remote. The "://" suffix
    # is required so that local paths which merely begin with "http"
    # (e.g. "http_reports/a.pdf") are still treated as filesystem paths.
    _REMOTE_PREFIXES = ("http://", "https://", "s3://")

    def __init__(self, file_path: str | Path, **kwargs):
        """
        Initialize the PDF loader.

        Args:
            file_path: Path to PDF file or URL
            **kwargs: Additional loader-specific arguments
        """
        location = str(file_path)
        if location.startswith(self._REMOTE_PREFIXES):
            # Remote locations stay strings: wrapping a URL in Path would
            # collapse the "//" after the scheme.
            self.file_path: Path | str = location
        else:
            self.file_path = Path(file_path)
        self.kwargs = kwargs
        # Lazy %-style arguments: the message is only formatted when the
        # INFO level is actually enabled.
        logger.info(
            "Initialized %s for: %s", self.__class__.__name__, file_path
        )

    @abstractmethod
    def load(self) -> List[Document]:
        """
        Load PDF documents.

        Returns:
            List of LangChain Document objects

        Raises:
            ImportError: If required dependencies are not installed
            RuntimeError: If loading fails
        """

    @abstractmethod
    def get_install_command(self) -> str:
        """
        Get the pip install command for required dependencies.

        Returns:
            Install command string
        """