"""
Main PDF loader with support for multiple backends.
"""
import logging
from pathlib import Path
from typing import List
from langchain_core.documents import Document
from .base import BasePDFLoader, PDFLoaderMethod
from .pypdf_loader import PyPDFLoaderImpl
from .unstructured_loader import UnstructuredPDFLoader
from .textract_loader import AmazonTextractPDFLoader
from .mathpix_loader import MathpixPDFLoader
from .pdfplumber_loader import PDFPlumberLoader
from .pypdfium2_loader import PyPDFium2Loader
from .pymupdf_loader import PyMuPDFLoader
from .pymupdf4llm_loader import PyMuPDF4LLMLoader
from .opendataloader_loader import OpenDataLoaderPDFLoader
logger = logging.getLogger(__name__)
# Registry of available PDF loaders
PDF_LOADER_REGISTRY = {
"pypdf": PyPDFLoaderImpl,
"unstructured": UnstructuredPDFLoader,
"amazon_textract": AmazonTextractPDFLoader,
"mathpix": MathpixPDFLoader,
"pdfplumber": PDFPlumberLoader,
"pypdfium2": PyPDFium2Loader,
"pymupdf": PyMuPDFLoader,
"pymupdf4llm": PyMuPDF4LLMLoader,
"opendataloader": OpenDataLoaderPDFLoader,
}
[docs]
class PDFLoader:
"""
Flexible PDF loader supporting multiple parsing backends.
By default, uses PyPDF for standard PDF parsing. Users can specify
alternative methods like 'unstructured' for advanced parsing or
'amazon_textract' for OCR capabilities.
Users can also provide custom loader classes that inherit from BasePDFLoader.
"""
[docs]
def __init__(
self,
file_path: str | Path,
method: PDFLoaderMethod | str = "pypdf",
loader_class: type[BasePDFLoader] | None = None,
**kwargs,
):
"""
Initialize PDF loader with specified method or custom loader.
Args:
file_path: Path to PDF file or URL (for amazon_textract)
method: Loading method - 'pypdf' (default), 'unstructured', or 'amazon_textract'
Can also be a custom string if loader_class is provided
loader_class: Optional custom loader class inheriting from BasePDFLoader.
If provided, this takes precedence over the method parameter.
**kwargs: Additional arguments passed to the specific loader
For amazon_textract:
- client: boto3 Textract client (optional)
- region_name: AWS region (default: 'us-east-2')
For unstructured:
- api_key: Unstructured API key (or set UNSTRUCTURED_API_KEY env var)
For mathpix:
- mathpix_api_key: Mathpix API key (or set MATHPIX_API_KEY env var)
Raises:
ValueError: If method is not supported and no loader_class is provided
TypeError: If loader_class doesn't inherit from BasePDFLoader
Examples:
>>> # Default PyPDF loader
>>> loader = PDFLoader("document.pdf")
>>> docs = loader.load()
>>> # Use Unstructured
>>> loader = PDFLoader("document.pdf", method="unstructured")
>>> docs = loader.load()
>>> # Use custom loader class
>>> from my_loaders import CustomPDFLoader
>>> loader = PDFLoader("document.pdf", loader_class=CustomPDFLoader)
>>> docs = loader.load()
"""
self.file_path = file_path
self.method = method
self.kwargs = kwargs
# Use custom loader class if provided
if loader_class is not None:
if not issubclass(loader_class, BasePDFLoader):
raise TypeError(
f"loader_class must inherit from BasePDFLoader, got {loader_class}"
)
self.loader_impl = loader_class(file_path, **kwargs)
logger.info(f"Using custom PDF loader: {loader_class.__name__}")
else:
# Use registered loader based on method
if method not in PDF_LOADER_REGISTRY:
available = ", ".join(PDF_LOADER_REGISTRY.keys())
raise ValueError(
f"Unsupported PDF loader method: {method}. "
f"Choose from: {available}, or provide a custom loader_class"
)
loader_class = PDF_LOADER_REGISTRY[method]
self.loader_impl = loader_class(file_path, **kwargs)
logger.info(f"Initialized PDFLoader with method: {method}")
[docs]
def load(self) -> List[Document]:
"""
Load PDF documents using the specified method or loader.
Returns:
List of LangChain Document objects
Raises:
ImportError: If required dependencies are not installed
RuntimeError: If loading fails
"""
return self.loader_impl.load()
[docs]
def get_install_command(self) -> str:
"""Get pip install command for the current loader's dependencies."""
return self.loader_impl.get_install_command()
[docs]
def load_pdf(
file_path: str | Path,
method: PDFLoaderMethod | str = "pypdf",
loader_class: type[BasePDFLoader] | None = None,
**kwargs,
) -> List[Document]:
"""
Convenience function to load a PDF document.
Args:
file_path: Path to PDF file or URL
method: Loading method - 'pypdf' (default), 'unstructured', or 'amazon_textract'
loader_class: Optional custom loader class inheriting from BasePDFLoader
**kwargs: Additional arguments for the loader
Returns:
List of LangChain Document objects
Examples:
>>> # Basic usage with PyPDF (default)
>>> docs = load_pdf("paper.pdf")
>>> # Use Unstructured API
>>> docs = load_pdf("paper.pdf", method="unstructured")
>>> # Use Amazon Textract with URL
>>> docs = load_pdf(
... "https://example.com/document.pdf",
... method="amazon_textract"
... )
>>> # Use Amazon Textract with S3
>>> import boto3
>>> client = boto3.client("textract", region_name="us-east-2")
>>> docs = load_pdf(
... "s3://bucket/document.pdf",
... method="amazon_textract",
... client=client
... )
>>> # Use custom loader
>>> from my_loaders import MyCustomLoader
>>> docs = load_pdf("paper.pdf", loader_class=MyCustomLoader)
"""
loader = PDFLoader(file_path, method=method, loader_class=loader_class, **kwargs)
return loader.load()