Source code for langchain_upstage.layout_analysis

import os
from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union

from langchain_core.document_loaders import BaseLoader, Blob
from langchain_core.documents import Document

from .layout_analysis_parsers import UpstageLayoutAnalysisParser

DEFAULT_PAGE_BATCH_SIZE = 10

OutputType = Literal["text", "html"]
SplitType = Literal["none", "element", "page"]


[docs]def validate_api_key(api_key: str) -> None: """ Validates the provided API key. Args: api_key (str): The API key to be validated. Raises: ValueError: If the API key is empty or None. Returns: None """ if not api_key: raise ValueError("API Key is required for Upstage Document Loader")
[docs]def validate_file_path(file_path: Union[str, Path, List[str], List[Path]]) -> None: """ Validates if a file exists at the given file path. Args: file_path (Union[str, Path, List[str], List[Path]): The file path(s) to be validated. Raises: FileNotFoundError: If the file or any of the files in the list do not exist. """ if isinstance(file_path, list): for path in file_path: validate_file_path(path) return if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}")
[docs]def get_from_param_or_env( key: str, param: Optional[str] = None, env_key: Optional[str] = None, default: Optional[str] = None, ) -> str: """Get a value from a param or an environment variable.""" if param is not None: return param elif env_key and env_key in os.environ and os.environ[env_key]: return os.environ[env_key] elif default is not None: return default else: raise ValueError( f"Did not find {key}, please add an environment variable" f" `{env_key}` which contains it, or pass" f" `{key}` as a named parameter." )
[docs]class UpstageLayoutAnalysisLoader(BaseLoader): """Upstage Layout Analysis. To use, you should have the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY` set with your API key or pass it as a named parameter to the constructor. Example: .. code-block:: python from langchain_upstage import UpstageLayoutAnalysis file_path = "/PATH/TO/YOUR/FILE.pdf" loader = UpstageLayoutAnalysis( file_path, split="page", output_type="text" ) """
[docs] def __init__( self, file_path: Union[str, Path, List[str], List[Path]], output_type: Union[OutputType, dict] = "html", split: SplitType = "none", api_key: Optional[str] = None, use_ocr: bool = False, exclude: list = ["header", "footer"], ): """ Initializes an instance of the Upstage document loader. Args: file_path (Union[str, Path, List[str], List[Path]): The path to the document to be loaded. output_type (Union[OutputType, dict], optional): The type of output to be generated by the parser. Defaults to "html". split (SplitType, optional): The type of splitting to be applied. Defaults to "none" (no splitting). api_key (str, optional): The API key for accessing the Upstage API. Defaults to None, in which case it will be fetched from the environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`. use_ocr (bool, optional): Extract text from images in the document. Defaults to False. (Use text info in PDF file) exclude (list, optional): Exclude specific elements from the output. Defaults to ["header", "footer"]. """ self.file_path = file_path self.output_type = output_type self.split = split self.api_key = get_from_param_or_env( "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY" ) self.use_ocr = use_ocr self.exclude = exclude validate_file_path(self.file_path) validate_api_key(self.api_key)
[docs] def load(self) -> List[Document]: """ Loads and parses the document using the UpstageLayoutAnalysisParser. Returns: A list of Document objects representing the parsed layout analysis. """ if isinstance(self.file_path, list): result = [] for file_path in self.file_path: blob = Blob.from_path(file_path) parser = UpstageLayoutAnalysisParser( self.api_key, split=self.split, output_type=self.output_type, use_ocr=self.use_ocr, exclude=self.exclude, ) result.extend(list(parser.lazy_parse(blob, is_batch=True))) return result else: blob = Blob.from_path(self.file_path) parser = UpstageLayoutAnalysisParser( self.api_key, split=self.split, output_type=self.output_type, use_ocr=self.use_ocr, exclude=self.exclude, ) return list(parser.lazy_parse(blob, is_batch=True))
[docs] def lazy_load(self) -> Iterator[Document]: """ Lazily loads and parses the document using the UpstageLayoutAnalysisParser. Returns: An iterator of Document objects representing the parsed layout analysis. """ if isinstance(self.file_path, list): for file_path in self.file_path: blob = Blob.from_path(file_path) parser = UpstageLayoutAnalysisParser( self.api_key, split=self.split, output_type=self.output_type, use_ocr=self.use_ocr, exclude=self.exclude, ) yield from parser.lazy_parse(blob, is_batch=True) else: blob = Blob.from_path(self.file_path) parser = UpstageLayoutAnalysisParser( self.api_key, split=self.split, output_type=self.output_type, use_ocr=self.use_ocr, exclude=self.exclude, ) yield from parser.lazy_parse(blob)