# Source code for automated_document_parser.loaders.file_loaders

"""Local file system loaders (pdf, txt, csv, etc.)."""

import logging
from pathlib import Path
from typing import List

from langchain_core.documents import Document

from ..config import LOADER_CONFIG
from ..utils import detect_file_type, validate_file_path
from .file_load import (
    CSVFileLoader,
    DOCXFileLoader,
    HTMLFileLoader,
    JSONFileLoader,
    TextFileLoader,
)
from .pdf_load import PDFLoader, PDFLoaderMethod

logger = logging.getLogger(__name__)


class FileLoader:
    """Load a local file with a loader chosen from its detected file type."""

    def __init__(
        self,
        file_path: str | Path,
        pdf_loader_method: PDFLoaderMethod = "pypdf",
        **pdf_loader_kwargs,
    ):
        """
        Validate the path, detect its file type and remember the PDF options.

        Args:
            file_path: Location of the file to load.
            pdf_loader_method: Backend handed to ``PDFLoader`` when the file is
                a PDF ('pypdf', 'unstructured', 'amazon_textract').
            **pdf_loader_kwargs: Extra keyword arguments handed to ``PDFLoader``
                (e.g., client, api_key). They are stored for every file type
                but only used on the PDF path.

        Raises:
            FileNotFoundError: If the file doesn't exist (expected to originate
                in ``validate_file_path``, which is defined outside this module).
            ValueError: If ``detect_file_type`` returns ``None`` for the path.
        """
        # PDF options are independent of path handling, so record them first.
        self.pdf_loader_method = pdf_loader_method
        self.pdf_loader_kwargs = pdf_loader_kwargs

        self.file_path = validate_file_path(file_path)
        self.file_type = detect_file_type(self.file_path)

        # A ``None`` file type means detection found nothing usable.
        if self.file_type is None:
            raise ValueError(
                f"Unsupported file type: {self.file_path.suffix}. "
                f"File: {self.file_path}"
            )

        logger.info(f"Initialized loader for {self.file_type} file: {self.file_path}")
        if self.file_type == "pdf":
            logger.info(f"PDF loader method: {pdf_loader_method}")

    def load(self) -> List[Document]:
        """
        Build the matching loader and return whatever its ``load()`` yields.

        Returns:
            List of LangChain Document objects

        Raises:
            RuntimeError: If anything fails while selecting the loader, loading,
                or logging the result; the original exception is chained as
                the cause.
        """
        try:
            documents = self._get_loader().load()
            logger.info(
                f"Successfully loaded {len(documents)} documents from {self.file_path}"
            )
        except Exception as e:
            # Log first, then surface a single exception type to callers.
            logger.error(f"Error loading file {self.file_path}: {e}")
            raise RuntimeError(f"Failed to load file: {e}") from e
        else:
            return documents

    def _get_loader(self):
        """
        Construct the loader instance that corresponds to ``self.file_type``.

        Returns:
            A ``PDFLoader`` for PDFs, otherwise one of the ``*FileLoader``
            classes imported by this module.

        Raises:
            ValueError: If no loader is registered for ``self.file_type``.
        """
        # PDFs are the only type whose loader takes caller-supplied options.
        if self.file_type == "pdf":
            return PDFLoader(
                self.file_path,
                method=self.pdf_loader_method,
                **self.pdf_loader_kwargs,
            )

        # Factories are lambdas so that only the selected loader is built and
        # only its LOADER_CONFIG entry is read.
        factories = {
            "text": lambda: TextFileLoader(
                self.file_path, encoding=LOADER_CONFIG["text"]["encoding"]
            ),
            "csv": lambda: CSVFileLoader(
                self.file_path, encoding=LOADER_CONFIG["csv"]["encoding"]
            ),
            "json": lambda: JSONFileLoader(
                self.file_path, jq_schema=".", text_content=False
            ),
            "docx": lambda: DOCXFileLoader(self.file_path),
            "html": lambda: HTMLFileLoader(self.file_path),
        }

        factory = factories.get(self.file_type)
        if factory is None:
            raise ValueError(f"No loader available for file type: {self.file_type}")
        return factory()
def load_document(
    file_path: str | Path,
    pdf_loader_method: PDFLoaderMethod = "pypdf",
    **pdf_loader_kwargs,
) -> List[Document]:
    """
    Load a file in one call by building a ``FileLoader`` and running ``load()``.

    Args:
        file_path: Path to the file
        pdf_loader_method: Method to use for PDF loading
            ('pypdf', 'unstructured', 'amazon_textract')
        **pdf_loader_kwargs: Additional keyword arguments for PDF loader

    Returns:
        List of LangChain Document objects

    Examples:
        >>> # Load a text file
        >>> documents = load_document("path/to/file.txt")

        >>> # Load a PDF with default PyPDF
        >>> documents = load_document("path/to/file.pdf")

        >>> # Load a PDF with Unstructured
        >>> documents = load_document("path/to/file.pdf", pdf_loader_method="unstructured")

        >>> # Load a PDF with Amazon Textract
        >>> import boto3
        >>> client = boto3.client("textract", region_name="us-east-2")
        >>> documents = load_document(
        ...     "s3://bucket/file.pdf",
        ...     pdf_loader_method="amazon_textract",
        ...     client=client
        ... )
    """
    # All arguments are passed straight through to FileLoader.
    return FileLoader(
        file_path,
        pdf_loader_method=pdf_loader_method,
        **pdf_loader_kwargs,
    ).load()