Source code for automated_document_parser.core

"""Main DocumentParser class for automated document loading."""

import logging
from pathlib import Path
from typing import List, Union

from langchain_core.documents import Document

from .loaders.file_loaders import FileLoader
from .utils import get_file_info, is_supported_file

logger = logging.getLogger(__name__)


class DocumentParser:
    """
    Main class for automated document parsing.

    Automatically detects file type and loads documents using appropriate
    loaders. Designed for seamless integration with LangChain RAG pipelines.

    Attributes:
        loaded_files: Resolved absolute paths (as strings) of every file that
            ``parse`` completed successfully, in call order. A file parsed
            more than once appears more than once.
    """

    def __init__(self) -> None:
        """Initialize the DocumentParser."""
        self.loaded_files: List[str] = []

    def parse(
        self, file_path: str | Path, pdf_loader_method: str = "pypdf", **kwargs
    ) -> List[Document]:
        """
        Parse a document from file path.

        The file is loaded through ``FileLoader``; every returned document
        then has its ``source``, ``file_name`` and ``file_type`` metadata
        keys overwritten with values from ``get_file_info``.

        Args:
            file_path: Path to the document file
            pdf_loader_method: Method to use for PDF files (default: 'pypdf').
                Options: 'pypdf', 'unstructured', 'amazon_textract',
                'mathpix', 'pdfplumber', 'pypdfium2', 'pymupdf',
                'pymupdf4llm', 'opendataloader'
            **kwargs: Additional keyword arguments for the loader
                (e.g., encoding, api_key)

        Returns:
            List of LangChain Document objects

        Raises:
            ValueError: If file type is unsupported
            FileNotFoundError: If file doesn't exist (not raised directly
                here; expected to propagate from the helpers/loader)
            RuntimeError: If parsing fails (not raised directly here;
                expected to propagate from the loader)

        Example:
            >>> parser = DocumentParser()
            >>> # Basic usage with auto-detection
            >>> docs = parser.parse("document.pdf")
            >>> # Specify PDF loading method
            >>> docs = parser.parse("document.pdf", pdf_loader_method="pdfplumber")
            >>> # Pass additional parameters
            >>> docs = parser.parse("math.pdf", pdf_loader_method="mathpix",
            ...                     mathpix_app_id="id", mathpix_app_key="key")
        """
        if not is_supported_file(file_path):
            path = Path(file_path)
            # NOTE: the extension list below is hard-coded; keep it in sync
            # with whatever is_supported_file() actually accepts.
            raise ValueError(
                f"Unsupported file type: {path.suffix}. "
                "Supported types: .txt, .pdf, .csv, .json, .docx, .html, .md"
            )

        loader = FileLoader(file_path, pdf_loader_method=pdf_loader_method, **kwargs)
        documents = loader.load()

        # Add file metadata to documents
        file_info = get_file_info(file_path)
        file_metadata = {
            "source": file_info["absolute_path"],
            "file_name": file_info["name"],
            "file_type": file_info["extension"],
        }
        for doc in documents:
            doc.metadata.update(file_metadata)

        # Track the file only once loading AND metadata enrichment succeeded,
        # so loaded_files never lists a file whose parse() call raised.
        self.loaded_files.append(str(Path(file_path).resolve()))

        logger.info("Parsed %d documents from %s", len(documents), file_path)
        return documents

    def parse_multiple(
        self,
        file_paths: List[Union[str, Path]],
        pdf_loader_method: str = "pypdf",
        **kwargs,
    ) -> dict[str, List[Document]]:
        """
        Parse multiple documents with automatic file type detection.

        This is a best-effort batch operation: a failure on one file is
        logged and does not stop the remaining files from being parsed.

        Args:
            file_paths: List of file paths
            pdf_loader_method: Method to use for PDF files (default: 'pypdf').
                Options: 'pypdf', 'unstructured', 'amazon_textract',
                'mathpix', 'pdfplumber', 'pypdfium2', 'pymupdf',
                'pymupdf4llm', 'opendataloader'
            **kwargs: Additional keyword arguments for loaders
                (e.g., encoding, api_key)

        Returns:
            Dictionary mapping each file path (``str(file_path)`` exactly as
            passed in, not resolved) to its loaded documents. A file that
            failed to parse maps to an empty list. If the same path is given
            more than once, the last result wins.

        Example:
            >>> parser = DocumentParser()
            >>> # Auto-detect all file types with default settings
            >>> results = parser.parse_multiple(["doc1.pdf", "doc2.txt", "data.csv"])
            >>> # Specify PDF method for all PDFs
            >>> results = parser.parse_multiple(
            ...     ["doc1.pdf", "doc2.pdf", "data.csv"],
            ...     pdf_loader_method="pdfplumber"
            ... )
            >>> for file, docs in results.items():
            ...     print(f"{file}: {len(docs)} documents")
        """
        results: dict[str, List[Document]] = {}

        for file_path in file_paths:
            try:
                documents = self.parse(file_path, pdf_loader_method, **kwargs)
            except Exception as e:
                # Deliberate catch-all: one bad file must not abort the batch.
                logger.error("Failed to parse %s: %s", file_path, e)
                results[str(file_path)] = []
            else:
                results[str(file_path)] = documents

        return results

    def get_loaded_files(self) -> List[str]:
        """
        Get list of successfully loaded files.

        Returns:
            List of file paths that were successfully loaded. This is a
            copy, so mutating it does not affect the parser's own record.
        """
        return self.loaded_files.copy()