Source code for automated_document_parser.loaders.pdf_load.pymupdf4llm_loader

"""
PyMuPDF4LLM loader implementation.

Reference: https://python.langchain.com/docs/integrations/document_loaders/pymupdf4llm/
"""

import logging
from typing import List

from langchain_core.documents import Document

from .base import BasePDFLoader

logger = logging.getLogger(__name__)



[docs]
class PyMuPDF4LLMLoader(BasePDFLoader):
    """
    PDF loader using PyMuPDF4LLM backend.

    Optimized for LLM processing with enhanced text extraction and formatting.
    No API key required - works locally.

    Loading is delegated to ``langchain_pymupdf4llm.PyMuPDF4LLMLoader``, which
    is imported when ``load`` is called rather than at module import time.
    Install the backend with ``pip install langchain-pymupdf4llm``.
    """


[docs]
    def load(self) -> List[Document]:
        """
        Load the PDF at ``self.file_path`` using PyMuPDF4LLM.

        The ``langchain_pymupdf4llm`` backend is imported inside this method
        rather than at module import time.

        Returns:
            List of LangChain Document objects returned by the backend loader

        Raises:
            ImportError: If langchain-pymupdf4llm is not installed. Any
                ImportError raised inside the try body (including one coming
                from the backend itself) is reported this way; the original
                error is attached as ``__cause__``.
            Exception: Any other error raised while loading is logged and
                re-raised unchanged.
        """
        try:
            from langchain_pymupdf4llm import PyMuPDF4LLMLoader as LCPyMuPDF4LLMLoader

            # The backend loader is given the path as a plain string.
            loader = LCPyMuPDF4LLMLoader(str(self.file_path))
            return loader.load()

        except ImportError as e:
            logger.error(f"Failed to import PyMuPDF4LLMLoader: {e}")
            # Chain explicitly so the traceback shows the underlying import
            # failure as the direct cause of the friendlier error.
            raise ImportError(
                f"langchain-pymupdf4llm is required for PyMuPDF4LLM loader. {self.get_install_command()}"
            ) from e
        except Exception as e:
            logger.error(f"Error loading PDF with PyMuPDF4LLM: {e}")
            raise



[docs]
    @staticmethod
    def get_install_command() -> str:
        """
        Return the command to install required dependencies.

        Returns:
            Installation command string
        """
        return "pip install langchain-pymupdf4llm"