Source code for automated_document_parser.loaders.pdf_load.pypdf_loader

"""
PyPDF loader implementation.

Reference: https://docs.langchain.com/oss/python/integrations/document_loaders/pypdfloader
"""

import logging
from typing import List

from langchain_core.documents import Document

from .base import BasePDFLoader

logger = logging.getLogger(__name__)



[docs]
class PyPDFLoaderImpl(BasePDFLoader):
    """
    PDF loader using PyPDF backend.

    Fast and simple PDF parsing suitable for most standard PDFs.
    This is the default loader method.

    Loading is delegated to ``PyPDFLoader`` from
    ``langchain_community.document_loaders``, which is imported inside
    ``load()`` rather than at module import time.
    """


[docs]
    def load(self) -> List[Document]:
        """
        Read the PDF at ``self.file_path`` through LangChain's ``PyPDFLoader``.

        The backend is imported inside the method, so a missing dependency
        surfaces here as an ``ImportError`` that carries an install hint.

        Returns:
            The list of Document objects produced by ``PyPDFLoader.load()``

        Raises:
            ImportError: If an import fails while loading; the message
                includes the command from ``get_install_command()``.
            RuntimeError: If any other exception occurs while loading; the
                original exception is chained as the cause.
        """
        try:
            from langchain_community.document_loaders import PyPDFLoader

            pdf_path = str(self.file_path)
            pages = PyPDFLoader(pdf_path).load()
            logger.info(f"Loaded {len(pages)} pages with PyPDF")
            return pages
        except ImportError as import_err:
            # Log the underlying cause first, then re-raise with the hint.
            logger.error(f"PyPDF dependencies not installed: {import_err}")
            install_hint = (
                f"Required package not installed for pypdf. "
                f"Install with: {self.get_install_command()}"
            )
            raise ImportError(install_hint) from import_err
        except Exception as load_err:
            logger.error(f"Error loading PDF with PyPDF: {load_err}")
            failure = f"Failed to load PDF: {load_err}"
            raise RuntimeError(failure) from load_err



[docs]
    def get_install_command(self) -> str:
        """Return the pip command that installs the PyPDF loader's packages."""
        install_command = "pip install langchain-community pypdf"
        return install_command