Source code for langchain_upstage.layout_analysis_parsers

import io
import json
import os
from typing import Dict, Iterator, List, Literal, Optional, Union

import fitz  # type: ignore
import requests
from fitz import Document as fitzDocument
from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document

# Endpoint that UpstageLayoutAnalysisParser._get_response POSTs documents to.
LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis"

# Default number of PDF pages bundled into a single request when a document
# is split into chunks (see UpstageLayoutAnalysisParser._split_and_request).
DEFAULT_NUMBER_OF_PAGE = 10

# Element field that parse_output reads for an element's content.
OutputType = Literal["text", "html"]
# How lazy_parse groups results: one document for the whole file ("none"),
# one per element ("element"), or one per page ("page").
SplitType = Literal["none", "element", "page"]


def validate_api_key(api_key: str) -> None:
    """
    Ensure that an API key has been supplied.

    Args:
        api_key (str): The API key to check.

    Raises:
        ValueError: If the API key is falsy (empty string or None).

    Returns:
        None
    """
    if api_key:
        return
    raise ValueError("API Key is required for Upstage Document Loader")
def validate_file_path(file_path: str) -> None:
    """
    Ensure that something exists on disk at the given path.

    Args:
        file_path (str): The path to check.

    Raises:
        FileNotFoundError: If `os.path.exists` reports that the path
            does not exist.
    """
    if os.path.exists(file_path):
        return
    raise FileNotFoundError(f"File not found: {file_path}")
def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str:
    """
    Pick the content of a single element according to the output type.

    Args:
        data (dict): The element to read from. It is indexed by
            "category" (for dict output types) and by the chosen
            output field.
        output_type (Union[OutputType, dict]): Either "text" or "html",
            or a mapping from element category to the field to read for
            that category. Categories missing from the mapping fall
            back to the "text" field.

    Returns:
        str: The selected content of the element.

    Raises:
        ValueError: If the output type is neither a dict nor one of the
            supported strings.
    """
    if isinstance(output_type, dict):
        # Per-category override; anything not listed uses plain text.
        field = output_type.get(data["category"], "text")
        return data[field]

    if isinstance(output_type, str) and output_type in ("text", "html"):
        # The supported string values are also the element's field names.
        return data[output_type]

    raise ValueError(f"Invalid output type: {output_type}")
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a setting from an explicit value, the environment, or a default.

    Args:
        key (str): Name of the setting, used only in the error message.
        param (Optional[str]): Explicit value; returned as-is whenever it
            is not None (an empty string is returned too).
        env_key (Optional[str]): Environment variable consulted when
            `param` is None. An unset or empty variable is ignored.
        default (Optional[str]): Fallback used when neither of the above
            produced a value.

    Returns:
        str: The first value found, in the order param, environment,
        default.

    Raises:
        ValueError: If no source provides a value.
    """
    if param is not None:
        return param

    env_value = os.environ.get(env_key) if env_key else None
    if env_value:
        return env_value

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
class UpstageLayoutAnalysisParser(BaseBlobParser):
    """Upstage Layout Analysis Parser.

    To use, you should have the environment variable
    `UPSTAGE_DOCUMENT_AI_API_KEY` set with your API key or pass it as a
    named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_upstage import UpstageLayoutAnalysisParser

            loader = UpstageLayoutAnalysisParser(split="page", output_type="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        output_type: Union[OutputType, dict] = "html",
        split: SplitType = "none",
        use_ocr: bool = False,
        exclude: Optional[list] = None,
    ):
        """
        Initializes an instance of the Upstage layout analysis parser.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                Defaults to None, in which case it will be fetched from the
                environment variable `UPSTAGE_DOCUMENT_AI_API_KEY`.
            output_type (Union[OutputType, dict], optional): The type of
                output to be generated by the parser. Defaults to "html".
            split (SplitType, optional): The type of splitting to be applied.
                Defaults to "none" (no splitting).
            use_ocr (bool, optional): Extract text from images in the
                document. Defaults to False. (Use text info in PDF file)
            exclude (list, optional): Element categories to drop from the
                output. Defaults to None, meaning nothing is excluded.

        Raises:
            ValueError: If no API key is passed and none is found in the
                environment, or if the resolved key is empty.
        """
        self.api_key = get_from_param_or_env(
            "UPSTAGE_DOCUMENT_AI_API_KEY", api_key, "UPSTAGE_DOCUMENT_AI_API_KEY"
        )
        self.output_type = output_type
        self.split = split
        self.use_ocr = use_ocr
        # A fresh list per instance: a `[]` default in the signature would be
        # one shared object across every parser created without `exclude`.
        self.exclude = [] if exclude is None else exclude

        validate_api_key(self.api_key)

    def _get_response(self, files: Dict) -> List:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the elements of the response that are not excluded.

        Args:
            files (dict): A dictionary containing the files to be sent in the
                request.

        Returns:
            List: The entries of the response's "elements" field whose
            "category" is not listed in `self.exclude`.

        Raises:
            ValueError: If the request fails, the server answers with an
                error status, or the response body is not valid JSON.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}
        options = {"ocr": self.use_ocr}

        try:
            response = requests.post(
                LAYOUT_ANALYSIS_URL, headers=headers, files=files, data=options
            )
            response.raise_for_status()
            result = response.json().get("elements", [])
        except json.JSONDecodeError as json_err:
            # Checked before RequestException: recent `requests` releases raise
            # a decode error that is also a RequestException, and the more
            # specific message is the useful one.
            raise ValueError(
                f"Failed to decode JSON response: {json_err}"
            ) from json_err
        except requests.RequestException as req_err:
            # Previously this was only printed, after which the code fell
            # through to an unbound `result` and crashed with UnboundLocalError.
            raise ValueError(f"Failed to send request: {req_err}") from req_err

        return [
            element for element in result if element["category"] not in self.exclude
        ]

    def _split_and_request(
        self,
        full_docs: fitzDocument,
        start_page: int,
        num_pages: int = DEFAULT_NUMBER_OF_PAGE,
    ) -> List:
        """
        Copies a range of pages of the full PDF into a temporary PDF and sends
        that chunk to the server.

        Args:
            full_docs (fitzDocument): The full document to take pages from.
            start_page (int): Zero-based index of the first page of the chunk.
            num_pages (int, optional): The number of pages to put in the
                chunk. Defaults to DEFAULT_NUMBER_OF_PAGE.

        Returns:
            List: The filtered elements returned for the chunk.
        """
        with fitz.open() as chunk_pdf:
            chunk_pdf.insert_pdf(
                full_docs,
                from_page=start_page,
                to_page=start_page + num_pages - 1,
            )
            pdf_bytes = chunk_pdf.write()

        with io.BytesIO(pdf_bytes) as f:
            return self._get_response({"document": f})

    def _iter_pdf_chunks(self, full_docs: fitzDocument, num_pages: int) -> Iterator:
        """
        Lazily requests the document chunk by chunk.

        Args:
            full_docs (fitzDocument): The opened PDF document.
            num_pages (int): Number of pages per request.

        Yields:
            tuple: `(start_page, elements)` for each chunk, where
            `start_page` is the zero-based index of the chunk's first page.
        """
        for start_page in range(0, full_docs.page_count, num_pages):
            yield start_page, self._split_and_request(full_docs, start_page, num_pages)

    def _element_document(self, elements: Dict, start_page: int = 0) -> Document:
        """
        Converts a single element into a Document object.

        Args:
            elements (Dict): The element to convert.
            start_page (int, optional): Offset added to the element's page
                number. Defaults to 0.

        Returns:
            Document: A document holding the element's content and metadata.
        """
        return Document(
            page_content=parse_output(elements, self.output_type),
            metadata={
                # NOTE(review): the service only receives the chunk, so its
                # page numbers are presumably relative to the chunk; the
                # offset maps them back onto the original document.
                "page": elements["page"] + start_page,
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
            },
        )

    def _page_document(self, elements: List, start_page: int = 0) -> List[Document]:
        """
        Combines elements with the same page number into a single Document
        object.

        Args:
            elements (List): A list of elements containing page numbers.
            start_page (int, optional): Offset added to each page number.
                Defaults to 0.

        Returns:
            List[Document]: One Document per page, ordered by page number,
            whose content is the page's elements joined with a space.
        """
        # Single pass grouping; element order within a page is preserved.
        elements_by_page: Dict = {}
        for element in elements:
            elements_by_page.setdefault(element["page"], []).append(element)

        return [
            Document(
                page_content=" ".join(
                    parse_output(element, self.output_type)
                    for element in elements_by_page[page]
                ),
                metadata={
                    "page": page + start_page,
                    "type": self.output_type,
                    "split": self.split,
                },
            )
            for page in sorted(elements_by_page)
        ]

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the
        specified split type.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to send PDF pages in batches of
                DEFAULT_NUMBER_OF_PAGE instead of one page per request.
                Defaults to False. Ignored when `split` is "none", which
                always uses batches.

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If an invalid split type is provided, if a non-PDF
                blob has no path, or if a request to the API fails.
        """
        if self.split not in ("none", "element", "page"):
            raise ValueError(f"Invalid split type: {self.split}")

        if self.split == "none" or is_batch:
            num_pages = DEFAULT_NUMBER_OF_PAGE
        else:
            num_pages = 1

        # The context manager releases the document once the generator is
        # exhausted or closed; it must stay open while chunks are produced.
        with fitz.open(blob.path) as full_docs:
            number_of_pages = full_docs.page_count

            if full_docs.is_pdf:
                chunks = self._iter_pdf_chunks(full_docs, num_pages)
            else:
                if not blob.path:
                    raise ValueError("Blob path is required for non-PDF files.")
                # Non-PDF files are uploaded whole, as a single chunk.
                with open(blob.path, "rb") as f:
                    chunks = iter([(0, self._get_response({"document": f}))])

            if self.split == "none":
                content = "".join(
                    parse_output(element, self.output_type)
                    for _, elements in chunks
                    for element in elements
                )
                yield Document(
                    page_content=content,
                    metadata={
                        "total_pages": number_of_pages,
                        "type": self.output_type,
                        "split": self.split,
                    },
                )
            elif self.split == "element":
                for start_page, elements in chunks:
                    for element in elements:
                        yield self._element_document(element, start_page)
            else:  # self.split == "page"
                for start_page, elements in chunks:
                    yield from self._page_document(elements, start_page)