# Source code for automated_document_parser.loaders.pdf_load.pypdf_loader

"""
PyPDF loader implementation.

Reference: https://docs.langchain.com/oss/python/integrations/document_loaders/pypdfloader
"""

import logging
from typing import List

from langchain_core.documents import Document

from .base import BasePDFLoader

logger = logging.getLogger(__name__)


class PyPDFLoaderImpl(BasePDFLoader):
    """
    PDF loader backed by PyPDF.

    Offers fast, straightforward parsing that works for most standard
    PDFs. This is the default loader method.
    """

    def load(self) -> List[Document]:
        """
        Read the PDF at ``self.file_path`` through LangChain's PyPDFLoader.

        The backend is imported lazily inside the method so that a missing
        optional dependency surfaces as an actionable error at call time.

        Returns:
            List of LangChain Document objects, as produced by
            ``PyPDFLoader.load()``.

        Raises:
            ImportError: If importing or using the backend raises
                ImportError; the message includes the install command.
            RuntimeError: If any other exception occurs while loading;
                the original exception is chained as the cause.
        """
        # NOTE(review): ``self.file_path`` is presumably set by
        # BasePDFLoader -- confirm against the base class.
        try:
            from langchain_community.document_loaders import PyPDFLoader

            pages = PyPDFLoader(str(self.file_path)).load()
            logger.info(f"Loaded {len(pages)} pages with PyPDF")
            return pages
        except ImportError as exc:
            logger.error(f"PyPDF dependencies not installed: {exc}")
            install_hint = self.get_install_command()
            raise ImportError(
                "Required package not installed for pypdf. "
                f"Install with: {install_hint}"
            ) from exc
        except Exception as exc:
            # Any non-import failure is wrapped so callers deal with a
            # single exception type for load errors.
            logger.error(f"Error loading PDF with PyPDF: {exc}")
            raise RuntimeError(f"Failed to load PDF: {exc}") from exc

    def get_install_command(self) -> str:
        """Return the pip command that installs the PyPDF dependencies."""
        return "pip install langchain-community pypdf"