Source code for automated_document_parser.loaders.pdf_load.loader

"""
Main PDF loader with support for multiple backends.
"""

import logging
from pathlib import Path
from typing import List

from langchain_core.documents import Document

from .base import BasePDFLoader, PDFLoaderMethod
from .pypdf_loader import PyPDFLoaderImpl
from .unstructured_loader import UnstructuredPDFLoader
from .textract_loader import AmazonTextractPDFLoader
from .mathpix_loader import MathpixPDFLoader
from .pdfplumber_loader import PDFPlumberLoader
from .pypdfium2_loader import PyPDFium2Loader
from .pymupdf_loader import PyMuPDFLoader
from .pymupdf4llm_loader import PyMuPDF4LLMLoader
from .opendataloader_loader import OpenDataLoaderPDFLoader

logger = logging.getLogger(__name__)

# Registry of available PDF loaders
PDF_LOADER_REGISTRY = {
    "pypdf": PyPDFLoaderImpl,
    "unstructured": UnstructuredPDFLoader,
    "amazon_textract": AmazonTextractPDFLoader,
    "mathpix": MathpixPDFLoader,
    "pdfplumber": PDFPlumberLoader,
    "pypdfium2": PyPDFium2Loader,
    "pymupdf": PyMuPDFLoader,
    "pymupdf4llm": PyMuPDF4LLMLoader,
    "opendataloader": OpenDataLoaderPDFLoader,
}



[docs]
class PDFLoader:
    """
    Flexible PDF loader supporting multiple parsing backends.

    By default, uses PyPDF for standard PDF parsing. Users can specify
    alternative methods like 'unstructured' for advanced parsing or
    'amazon_textract' for OCR capabilities.

    Users can also provide custom loader classes that inherit from BasePDFLoader.
    """


[docs]
    def __init__(
        self,
        file_path: str | Path,
        method: PDFLoaderMethod | str = "pypdf",
        loader_class: type[BasePDFLoader] | None = None,
        **kwargs,
    ):
        """
        Initialize PDF loader with specified method or custom loader.

        Args:
            file_path: Path to PDF file or URL (for amazon_textract)
            method: Loading method - 'pypdf' (default), 'unstructured', or 'amazon_textract'
                   Can also be a custom string if loader_class is provided
            loader_class: Optional custom loader class inheriting from BasePDFLoader.
                         If provided, this takes precedence over the method parameter.
            **kwargs: Additional arguments passed to the specific loader
                For amazon_textract:
                    - client: boto3 Textract client (optional)
                    - region_name: AWS region (default: 'us-east-2')
                For unstructured:
                    - api_key: Unstructured API key (or set UNSTRUCTURED_API_KEY env var)
                For mathpix:
                    - mathpix_api_key: Mathpix API key (or set MATHPIX_API_KEY env var)

        Raises:
            ValueError: If method is not supported and no loader_class is provided
            TypeError: If loader_class doesn't inherit from BasePDFLoader

        Examples:
            >>> # Default PyPDF loader
            >>> loader = PDFLoader("document.pdf")
            >>> docs = loader.load()

            >>> # Use Unstructured
            >>> loader = PDFLoader("document.pdf", method="unstructured")
            >>> docs = loader.load()

            >>> # Use custom loader class
            >>> from my_loaders import CustomPDFLoader
            >>> loader = PDFLoader("document.pdf", loader_class=CustomPDFLoader)
            >>> docs = loader.load()
        """
        self.file_path = file_path
        self.method = method
        self.kwargs = kwargs

        # Use custom loader class if provided
        if loader_class is not None:
            if not issubclass(loader_class, BasePDFLoader):
                raise TypeError(
                    f"loader_class must inherit from BasePDFLoader, got {loader_class}"
                )
            self.loader_impl = loader_class(file_path, **kwargs)
            logger.info(f"Using custom PDF loader: {loader_class.__name__}")
        else:
            # Use registered loader based on method
            if method not in PDF_LOADER_REGISTRY:
                available = ", ".join(PDF_LOADER_REGISTRY.keys())
                raise ValueError(
                    f"Unsupported PDF loader method: {method}. "
                    f"Choose from: {available}, or provide a custom loader_class"
                )

            loader_class = PDF_LOADER_REGISTRY[method]
            self.loader_impl = loader_class(file_path, **kwargs)
            logger.info(f"Initialized PDFLoader with method: {method}")



[docs]
    def load(self) -> List[Document]:
        """
        Load PDF documents using the specified method or loader.

        Returns:
            List of LangChain Document objects

        Raises:
            ImportError: If required dependencies are not installed
            RuntimeError: If loading fails
        """
        return self.loader_impl.load()



[docs]
    def get_install_command(self) -> str:
        """Get pip install command for the current loader's dependencies."""
        return self.loader_impl.get_install_command()





[docs]
def load_pdf(
    file_path: str | Path,
    method: PDFLoaderMethod | str = "pypdf",
    loader_class: type[BasePDFLoader] | None = None,
    **kwargs,
) -> List[Document]:
    """
    Convenience function to load a PDF document.

    Args:
        file_path: Path to PDF file or URL
        method: Loading method - 'pypdf' (default), 'unstructured', or 'amazon_textract'
        loader_class: Optional custom loader class inheriting from BasePDFLoader
        **kwargs: Additional arguments for the loader

    Returns:
        List of LangChain Document objects

    Examples:
        >>> # Basic usage with PyPDF (default)
        >>> docs = load_pdf("paper.pdf")

        >>> # Use Unstructured API
        >>> docs = load_pdf("paper.pdf", method="unstructured")

        >>> # Use Amazon Textract with URL
        >>> docs = load_pdf(
        ...     "https://example.com/document.pdf",
        ...     method="amazon_textract"
        ... )

        >>> # Use Amazon Textract with S3
        >>> import boto3
        >>> client = boto3.client("textract", region_name="us-east-2")
        >>> docs = load_pdf(
        ...     "s3://bucket/document.pdf",
        ...     method="amazon_textract",
        ...     client=client
        ... )

        >>> # Use custom loader
        >>> from my_loaders import MyCustomLoader
        >>> docs = load_pdf("paper.pdf", loader_class=MyCustomLoader)
    """
    loader = PDFLoader(file_path, method=method, loader_class=loader_class, **kwargs)
    return loader.load()