Module llmflex.TextSplitters.base_text_splitter

Expand source code
from ..Schemas.documents import Document
from typing import List
from abc import ABC, abstractmethod

class BaseTextSplitter(ABC):
    """Base class for text splitters.

    Subclasses must implement `split_text`; `split_documents` is built on
    top of it.
    """

    def __init__(self, chunk_size: int = 400, chunk_overlap: int = 40) -> None:
        """Initialise the TextSplitter.

        Args:
            chunk_size (int, optional): Maximum number of tokens per text chunk. Defaults to 400.
            chunk_overlap (int, optional): Number of tokens that overlap between consecutive chunks. Defaults to 40.
        """
        # Only stored here; nothing in this base class reads them back.
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split the given text.

        Args:
            text (str): Text to split.

        Returns:
            List[str]: List of split texts.
        """
        pass

    def split_documents(self, docs: List[Document]) -> List[Document]:
        """Split the list of given documents.

        Each document's `index` is passed to `split_text`, and one new
        `Document` is created per returned chunk. Chunks keep the order of
        their source documents.

        Args:
            docs (List[Document]): Documents to split.

        Returns:
            List[Document]: List of split documents.
        """
        split_docs = []
        for doc in docs:
            text = doc.index
            metadata = doc.metadata
            # Grow one accumulator in place instead of flattening with
            # sum(lists, []), which re-copies the result for every document.
            # Each chunk is handed the source document's metadata object
            # itself; no copy is made in this method.
            split_docs.extend(
                Document(index=chunk, metadata=metadata)
                for chunk in self.split_text(text)
            )
        return split_docs

Classes

class BaseTextSplitter (chunk_size: int = 400, chunk_overlap: int = 40)

Base class for text splitters.

Initialise the TextSplitter.

Args

chunk_size : int, optional
Maximum number of tokens per text chunk. Defaults to 400.
chunk_overlap : int, optional
Number of tokens that overlap between consecutive chunks. Defaults to 40.
Expand source code
class BaseTextSplitter(ABC):
    """Base class for text splitters.

    Subclasses must implement `split_text`; `split_documents` is built on
    top of it.
    """

    def __init__(self, chunk_size: int = 400, chunk_overlap: int = 40) -> None:
        """Initialise the TextSplitter.

        Args:
            chunk_size (int, optional): Maximum number of tokens per text chunk. Defaults to 400.
            chunk_overlap (int, optional): Number of tokens that overlap between consecutive chunks. Defaults to 40.
        """
        # Only stored here; nothing in this base class reads them back.
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split the given text.

        Args:
            text (str): Text to split.

        Returns:
            List[str]: List of split texts.
        """
        pass

    def split_documents(self, docs: List[Document]) -> List[Document]:
        """Split the list of given documents.

        Each document's `index` is passed to `split_text`, and one new
        `Document` is created per returned chunk. Chunks keep the order of
        their source documents.

        Args:
            docs (List[Document]): Documents to split.

        Returns:
            List[Document]: List of split documents.
        """
        split_docs = []
        for doc in docs:
            text = doc.index
            metadata = doc.metadata
            # Grow one accumulator in place instead of flattening with
            # sum(lists, []), which re-copies the result for every document.
            # Each chunk is handed the source document's metadata object
            # itself; no copy is made in this method.
            split_docs.extend(
                Document(index=chunk, metadata=metadata)
                for chunk in self.split_text(text)
            )
        return split_docs

Ancestors

  • abc.ABC

Subclasses

Methods

def split_documents(self, docs: List[Document]) ‑> List[Document]

Split the list of given documents.

Args

docs : List[Document]
Documents to split.

Returns

List[Document]
List of split documents.
Expand source code
def split_documents(self, docs: List[Document]) -> List[Document]:
    """Split the list of given documents.

    Each document's `index` is passed to `split_text`, and one new
    `Document` is created per returned chunk. Chunks keep the order of
    their source documents.

    Args:
        docs (List[Document]): Documents to split.

    Returns:
        List[Document]: List of split documents.
    """
    split_docs = []
    for doc in docs:
        text = doc.index
        metadata = doc.metadata
        # Grow one accumulator in place instead of flattening with
        # sum(lists, []), which re-copies the result for every document.
        # Each chunk is handed the source document's metadata object
        # itself; no copy is made in this method.
        split_docs.extend(
            Document(index=chunk, metadata=metadata)
            for chunk in self.split_text(text)
        )
    return split_docs
def split_text(self, text: str) ‑> List[str]

Split the given text.

Args

text : str
Text to split.

Returns

List[str]
List of split texts.
Expand source code
@abstractmethod
def split_text(self, text: str) -> List[str]:
    """Split the given text into a list of strings.

    This is the abstract hook of the splitter: the body is intentionally
    empty and concrete subclasses supply the actual splitting logic.

    Args:
        text (str): Text to split.

    Returns:
        List[str]: List of split texts.
    """