# Source code for automated_document_parser.utils

"""Helper functions for document parsing."""

import logging
from pathlib import Path
from typing import Optional

from .config import FILE_EXTENSION_MAPPING, SUPPORTED_EXTENSIONS

# Module-level logger named after this module. No handlers or levels are
# configured here; this line only obtains the logger object.
logger = logging.getLogger(__name__)


[docs] def detect_file_type(file_path: str | Path) -> Optional[str]: """ Detect file type based on extension. Args: file_path: Path to the file Returns: Loader type string or None if unsupported Raises: FileNotFoundError: If file doesn't exist """ path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") if not path.is_file(): raise ValueError(f"Path is not a file: {file_path}") extension = path.suffix.lower() if extension not in FILE_EXTENSION_MAPPING: logger.warning(f"Unsupported file extension: {extension}") return None return FILE_EXTENSION_MAPPING[extension]
def is_supported_file(file_path: str | Path) -> bool:
    """
    Report whether a path's extension is among the supported extensions.

    Only the suffix of the path is examined; it is lower-cased and tested
    for membership in ``SUPPORTED_EXTENSIONS``.

    Args:
        file_path: Path whose extension should be checked

    Returns:
        True when the lower-cased suffix is in ``SUPPORTED_EXTENSIONS``;
        False otherwise, including when the check itself raises (the error
        is logged rather than propagated)
    """
    supported = False
    try:
        supported = Path(file_path).suffix.lower() in SUPPORTED_EXTENSIONS
    except Exception as exc:
        # Best-effort check: any failure is logged and reported as "unsupported".
        logger.error(f"Error checking file support: {exc}")
    return supported
[docs] def validate_file_path(file_path: str | Path) -> Path: """ Validate and normalize file path. Args: file_path: Path to validate Returns: Normalized Path object Raises: FileNotFoundError: If file doesn't exist ValueError: If path is not a file """ path = Path(file_path).resolve() if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") if not path.is_file(): raise ValueError(f"Path is not a file: {file_path}") return path
[docs] def get_file_info(file_path: str | Path) -> dict: """ Get basic file information. Args: file_path: Path to the file Returns: Dictionary with file metadata """ path = Path(file_path) stat = path.stat() return { "name": path.name, "extension": path.suffix.lower(), "size_bytes": stat.st_size, "absolute_path": str(path.resolve()), }