# Source code for automated_document_parser.core

"""Main DocumentParser class for automated document loading."""

import logging
from pathlib import Path
from typing import List, Union

from langchain_core.documents import Document

from .loaders.file_loaders import FileLoader
from .utils import get_file_info, is_supported_file

logger = logging.getLogger(__name__)



class DocumentParser:
    """
    Main class for automated document parsing.

    Checks that a file is supported, loads it through ``FileLoader``, and
    stamps file-level metadata onto every returned document.
    Designed for seamless integration with LangChain RAG pipelines.
    """

    def __init__(self):
        """Initialize the DocumentParser with an empty load history."""
        # Resolved absolute paths of files that were parsed successfully,
        # in the order they were parsed (repeats are kept).
        self.loaded_files: List[str] = []

    def parse(
        self,
        file_path: Union[str, Path],
        pdf_loader_method: str = "pypdf",
        **kwargs,
    ) -> List[Document]:
        """
        Parse a document from file path.

        Args:
            file_path: Path to the document file
            pdf_loader_method: Method to use for PDF files (default: 'pypdf').
                Options: 'pypdf', 'unstructured', 'amazon_textract', 'mathpix',
                'pdfplumber', 'pypdfium2', 'pymupdf', 'pymupdf4llm', 'opendataloader'
            **kwargs: Additional keyword arguments forwarded unchanged to
                ``FileLoader`` (e.g., encoding, api_key)

        Returns:
            List of LangChain Document objects whose metadata has been updated
            with ``source``, ``file_name`` and ``file_type``.

        Raises:
            ValueError: If ``is_supported_file`` rejects the path.
            FileNotFoundError: Presumably when the file does not exist; not
                raised directly here -- verify against the helpers/loader.
            RuntimeError: Presumably when the underlying loader fails; not
                raised directly here -- verify against ``FileLoader``.

        Example:
            >>> parser = DocumentParser()
            >>> # Basic usage with auto-detection
            >>> docs = parser.parse("document.pdf")
            >>> # Specify PDF loading method
            >>> docs = parser.parse("document.pdf", pdf_loader_method="pdfplumber")
            >>> # Pass additional parameters
            >>> docs = parser.parse("math.pdf", pdf_loader_method="mathpix",
            ...                     mathpix_app_id="id", mathpix_app_key="key")
        """
        if not is_supported_file(file_path):
            path = Path(file_path)
            raise ValueError(
                f"Unsupported file type: {path.suffix}. "
                f"Supported types: .txt, .pdf, .csv, .json, .docx, .html, .md"
            )

        loader = FileLoader(file_path, pdf_loader_method=pdf_loader_method, **kwargs)
        documents = loader.load()

        # Add file metadata to documents. The values are copied into each
        # document's own metadata dict by ``update``, so building the mapping
        # once is safe.
        file_info = get_file_info(file_path)
        file_metadata = {
            "source": file_info["absolute_path"],
            "file_name": file_info["name"],
            "file_type": file_info["extension"],
        }
        for doc in documents:
            doc.metadata.update(file_metadata)

        # Track the file only once loading *and* metadata enrichment have
        # succeeded, so ``loaded_files`` never lists a file whose parse raised.
        self.loaded_files.append(str(Path(file_path).resolve()))

        logger.info("Parsed %d documents from %s", len(documents), file_path)
        return documents

    def parse_multiple(
        self,
        file_paths: List[Union[str, Path]],
        pdf_loader_method: str = "pypdf",
        **kwargs,
    ) -> dict[str, List[Document]]:
        """
        Parse multiple documents with automatic file type detection.

        Parsing is best-effort: a file that fails to parse is logged and
        mapped to an empty list instead of aborting the whole batch.

        Args:
            file_paths: List of file paths. A single ``str`` or ``Path`` is
                also accepted and treated as a one-element list.
            pdf_loader_method: Method to use for PDF files (default: 'pypdf').
                Options: 'pypdf', 'unstructured', 'amazon_textract', 'mathpix',
                'pdfplumber', 'pypdfium2', 'pymupdf', 'pymupdf4llm', 'opendataloader'
            **kwargs: Additional keyword arguments for loaders (e.g., encoding, api_key)

        Returns:
            Dictionary mapping ``str(file_path)`` (as given, not resolved) to
            the loaded documents. If the same path appears more than once,
            the result of the last occurrence wins.

        Example:
            >>> parser = DocumentParser()
            >>> # Auto-detect all file types with default settings
            >>> results = parser.parse_multiple(["doc1.pdf", "doc2.txt", "data.csv"])
            >>> # Specify PDF method for all PDFs
            >>> results = parser.parse_multiple(
            ...     ["doc1.pdf", "doc2.pdf", "data.csv"],
            ...     pdf_loader_method="pdfplumber"
            ... )
            >>> for file, docs in results.items():
            ...     print(f"{file}: {len(docs)} documents")
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(file_paths, (str, Path)):
            file_paths = [file_paths]

        results: dict[str, List[Document]] = {}
        for file_path in file_paths:
            try:
                documents = self.parse(
                    file_path, pdf_loader_method=pdf_loader_method, **kwargs
                )
            except Exception as exc:
                # Deliberately broad: one bad file must not stop the batch.
                logger.error("Failed to parse %s: %s", file_path, exc)
                documents = []
            results[str(file_path)] = documents

        return results

    def get_loaded_files(self) -> List[str]:
        """
        Get list of successfully loaded files.

        Returns:
            A copy of the recorded file paths, so callers can modify the
            returned list without affecting the parser's own history.
        """
        return list(self.loaded_files)