# Source code for automated_document_parser.loaders.file_loaders

"""Local file system loaders (pdf, txt, csv, etc.)."""

import logging
from pathlib import Path
from typing import List

from langchain_core.documents import Document

from ..config import LOADER_CONFIG
from ..utils import detect_file_type, validate_file_path
from .file_load import (
    CSVFileLoader,
    DOCXFileLoader,
    HTMLFileLoader,
    JSONFileLoader,
    TextFileLoader,
)
from .pdf_load import PDFLoader, PDFLoaderMethod

logger = logging.getLogger(__name__)



class FileLoader:
    """Automated file loader that detects file type and loads documents.

    The file type is resolved once, at construction time. ``load()`` then
    builds the matching loader and delegates to it: PDF files go through
    ``PDFLoader`` with a configurable method, every other supported type
    uses a fixed loader class.
    """

    def __init__(
        self,
        file_path: str | Path,
        pdf_loader_method: PDFLoaderMethod = "pypdf",
        **pdf_loader_kwargs,
    ):
        """
        Initialize the FileLoader.

        Args:
            file_path: Path to the file to load
            pdf_loader_method: Method to use for PDF loading
                ('pypdf', 'unstructured', 'amazon_textract')
            **pdf_loader_kwargs: Additional keyword arguments for the PDF
                loader (e.g., client, api_key). They are stored as given and
                only forwarded when the detected file type is "pdf"; for any
                other file type they are never used.

        Raises:
            FileNotFoundError: If file doesn't exist (expected to come from
                ``validate_file_path``, which is not defined in this module)
            ValueError: If file type is unsupported, i.e. ``detect_file_type``
                returned ``None``
        """
        self.file_path = validate_file_path(file_path)
        self.file_type = detect_file_type(self.file_path)
        self.pdf_loader_method = pdf_loader_method
        self.pdf_loader_kwargs = pdf_loader_kwargs

        # Fail at construction time so an unsupported file never reaches load().
        if self.file_type is None:
            raise ValueError(
                f"Unsupported file type: {self.file_path.suffix}. "
                f"File: {self.file_path}"
            )

        logger.info(f"Initialized loader for {self.file_type} file: {self.file_path}")
        if self.file_type == "pdf":
            logger.info(f"PDF loader method: {pdf_loader_method}")

    def load(self) -> List[Document]:
        """
        Load documents from the file.

        Returns:
            List of LangChain Document objects

        Raises:
            RuntimeError: If loading fails. Any exception raised while
                building or running the underlying loader is logged and
                re-raised as ``RuntimeError`` with the original exception
                chained as its cause.
        """
        try:
            loader = self._get_loader()
            documents = loader.load()
            logger.info(
                f"Successfully loaded {len(documents)} documents from {self.file_path}"
            )
            return documents
        except Exception as e:
            # Single failure type for callers; the cause stays reachable
            # through exception chaining.
            logger.error(f"Error loading file {self.file_path}: {e}")
            raise RuntimeError(f"Failed to load file: {e}") from e

    def _get_loader(self):
        """
        Get appropriate LangChain loader based on file type.

        Returns:
            LangChain document loader instance

        Raises:
            ValueError: If no loader is registered for ``self.file_type``
        """
        # Special handling for PDF files with configurable method
        if self.file_type == "pdf":
            return PDFLoader(
                self.file_path,
                method=self.pdf_loader_method,
                **self.pdf_loader_kwargs,
            )

        # Other file type loaders. The values are zero-argument factories so
        # that only the loader for the detected type is ever instantiated.
        factories = {
            "text": lambda: TextFileLoader(
                self.file_path, encoding=LOADER_CONFIG["text"]["encoding"]
            ),
            "csv": lambda: CSVFileLoader(
                self.file_path, encoding=LOADER_CONFIG["csv"]["encoding"]
            ),
            # NOTE(review): jq_schema="." presumably selects the whole JSON
            # document -- confirm against JSONFileLoader.
            "json": lambda: JSONFileLoader(
                self.file_path, jq_schema=".", text_content=False
            ),
            "docx": lambda: DOCXFileLoader(self.file_path),
            "html": lambda: HTMLFileLoader(self.file_path),
        }

        factory = factories.get(self.file_type)
        if factory is None:
            raise ValueError(f"No loader available for file type: {self.file_type}")

        return factory()




def load_document(
    file_path: str | Path,
    pdf_loader_method: PDFLoaderMethod = "pypdf",
    **pdf_loader_kwargs,
) -> List[Document]:
    """
    Convenience function to load a document from a file.

    Builds a one-off ``FileLoader`` for ``file_path`` and returns the result
    of its ``load()`` call.

    Args:
        file_path: Path to the file
        pdf_loader_method: Method to use for PDF loading ('pypdf', 'unstructured', 'amazon_textract')
        **pdf_loader_kwargs: Additional keyword arguments for PDF loader

    Returns:
        List of LangChain Document objects

    Raises:
        FileNotFoundError: If the file doesn't exist (as documented by
            ``FileLoader.__init__``)
        ValueError: If the file type is unsupported
        RuntimeError: If loading fails

    Examples:
        >>> # Load a text file
        >>> documents = load_document("path/to/file.txt")

        >>> # Load a PDF with default PyPDF
        >>> documents = load_document("path/to/file.pdf")

        >>> # Load a PDF with Unstructured
        >>> documents = load_document("path/to/file.pdf", pdf_loader_method="unstructured")

        >>> # Load a PDF with Amazon Textract
        >>> import boto3
        >>> client = boto3.client("textract", region_name="us-east-2")
        >>> documents = load_document(
        ...     "s3://bucket/file.pdf",
        ...     pdf_loader_method="amazon_textract",
        ...     client=client
        ... )

    NOTE(review): ``FileLoader`` passes ``file_path`` through
    ``validate_file_path`` before any loader is chosen; confirm that it
    accepts ``s3://`` URIs, otherwise the Textract example cannot work as
    written.
    """
    return FileLoader(
        file_path, pdf_loader_method=pdf_loader_method, **pdf_loader_kwargs
    ).load()