Source code for automated_document_parser.loaders.pdf_load.textract_loader

"""
Amazon Textract loader implementation.

Reference: https://docs.langchain.com/oss/python/integrations/document_loaders/amazon_textract
"""

import logging
from typing import List

from langchain_core.documents import Document

from .base import BasePDFLoader

logger = logging.getLogger(__name__)


class AmazonTextractPDFLoader(BasePDFLoader):
    """
    PDF loader using Amazon Textract backend.

    OCR service for extracting text from scanned documents and images.
    Supports local files, HTTP/HTTPS URLs, and S3 URIs.
    Requires AWS credentials to be configured.
    """

    def load(self) -> List[Document]:
        """
        Load PDF using Amazon Textract.

        Optional entries read from ``self.kwargs``:
            client: Pre-built Textract client handed to the underlying
                LangChain loader.
            region_name: Region used when a client is created here for an
                ``s3://`` path and no ``client`` was supplied
                (defaults to ``"us-east-2"``).

        Returns:
            List of LangChain Document objects

        Raises:
            ImportError: If boto3 or amazon-textract-caller is not installed
            RuntimeError: If any other error occurs while loading the document
        """
        try:
            # Imported lazily so the package stays importable without the
            # optional Textract dependencies installed.
            from langchain_community.document_loaders import (
                AmazonTextractPDFLoader as TextractLoader,
            )

            source = str(self.file_path)

            # Get optional boto3 client
            textract_client = self.kwargs.get("client")
            if textract_client is None and source.startswith("s3://"):
                # For S3 files, create a client with the specified region.
                import boto3

                # NOTE(review): an explicit region_name takes precedence over
                # the region configured in the AWS environment; confirm that
                # the "us-east-2" fallback is intended.
                region = self.kwargs.get("region_name", "us-east-2")
                textract_client = boto3.client("textract", region_name=region)
                logger.info("Created Textract client for region: %s", region)

            # Only forward ``client`` when one is available; otherwise the
            # underlying loader is constructed without it.
            loader_kwargs = {"client": textract_client} if textract_client else {}
            loader = TextractLoader(source, **loader_kwargs)

            docs = loader.load()
            logger.info("Loaded %s documents with Amazon Textract", len(docs))
            return docs

        except ImportError as e:
            logger.error("Amazon Textract dependencies not installed: %s", e)
            raise ImportError(
                "Required package not installed for amazon_textract. "
                f"Install with: {self.get_install_command()}"
            ) from e
        except Exception as e:
            logger.error("Error loading PDF with Amazon Textract: %s", e)
            raise RuntimeError(f"Failed to load PDF: {e}") from e

    def get_install_command(self) -> str:
        """Get pip install command for Amazon Textract dependencies."""
        # The version specifier must be quoted: an unquoted ``>`` is treated by
        # the shell as output redirection, which would drop the version
        # constraint and write pip's output to a file named ``=0.2.0``.
        return 'pip install boto3 "amazon-textract-caller>=0.2.0"'