Module llmflex.Embeddings.base_embeddings

Expand source code
from ..TextSplitters.base_text_splitter import BaseTextSplitter
from ..Schemas.tokenizer import Tokenizer
from langchain.embeddings.base import Embeddings
from abc import ABC, abstractmethod
import numpy as np
from typing import List, Type

class BaseEmbeddings(ABC):
    """Base class for embeddings model.
    """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed list of texts.

        Args:
            texts (List[str]): List of texts to embed.

        Returns:
            List[List[float]]: List of embedded vectors.
        """
        pass

    def embed_query(self, text: str) -> List[float]:
        """Embed one string.

        Args:
            text (str): String to embed.

        Returns:
            List[float]: embeddings of the string. 
        """
        return self.embed_documents([text])[0]

class LangchainEmbeddings(Embeddings):
    """Class for langchain compatible embeddings.
    """
    def __init__(self, model: Type[BaseEmbeddings]) -> None:
        self._model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self._model.embed_documents(texts=texts)
    
    def embed_query(self, text: str) -> List[float]:
        return self._model.embed_query(text=text)

class BaseEmbeddingsToolkit:
    """Base class for storing the embedding model and the text splitter.
    """
    def __init__(self, embedding_model: Type[BaseEmbeddings], text_splitter: Type[BaseTextSplitter], tokenizer: Tokenizer,
                 name: str, type: str, embedding_size: int, max_seq_length: int) -> None:
        from ..utils import validate_type
        self._embedding_model = validate_type(embedding_model, BaseEmbeddings)
        self._text_splitter = validate_type(text_splitter, BaseTextSplitter)
        self._tokenizer = validate_type(tokenizer, Tokenizer)
        self._name = validate_type(name, str)
        self._type = validate_type(type, str)
        self._embedding_size = validate_type(embedding_size, int)
        self._max_seq_length = validate_type(max_seq_length, int)

    @property
    def embedding_model(self) -> BaseEmbeddings:
        """The embedding model.

        Returns:
            BaseEmbeddings: The embedding model.
        """
        return self._embedding_model
    
    @property
    def text_splitter(self) -> BaseTextSplitter:
        """The text splitter.

        Returns:
            BaseTextSplitter: The text splitter.
        """
        return self._text_splitter
    
    @property
    def tokenizer(self) -> Tokenizer:
        """Tokenizer of the embedding model.

        Returns:
            Tokenizer: Tokenizer of the embedding model.
        """
        return self._tokenizer
    
    @property
    def embedding_size(self) -> int:
        """The embedding model's output dimensions.

        Returns:
            int: The embedding model's output dimensions.
        """
        return self._embedding_size
    
    @property
    def max_seq_length(self) -> int:
        """Maximum number of tokens used in each embedding vector.

        Returns:
            int: Maximum number of tokens used in each embedding vector.
        """
        self._max_seq_length
    
    @property
    def type(self) -> str:
        """Type of the embedding toolkit.

        Returns:
            str: Type of the embedding toolkit.
        """
        return self._type
    
    @property
    def name(self) -> str:
        """Name of the embedding model.

        Returns:
            str: Name of the embedding model.
        """
        return self._name
    
    @property
    def langchain_embeddings(self) -> LangchainEmbeddings:
        """Langchain compatible embeddings model.

        Returns:
            LangchainEmbeddings: Langchain compatible embeddings model.
        """
        return LangchainEmbeddings(self.embedding_model)
    
    def batch_embed(self, texts: List[str]) -> np.ndarray[np.float32]:
        """Embed list of texts.

        Args:
            texts (List[str]): List of text to embed.

        Returns:
            np.ndarray[np.float32]: Array of embedding vectors of the list of texts.
        """
        vectors = self.embedding_model.embed_documents(texts=texts)
        return np.array(vectors, dtype=np.float32)
    
    def embed(self, text: str) -> np.ndarray[np.float32]:
        """Embed a single string.

        Args:
            text (str): String to embed.

        Returns:
            np.ndarray[np.float32]: Vector of the embedded stirng.
        """
        return self.batch_embed([text])[0]

Classes

class BaseEmbeddings

Base class for embeddings model.

Expand source code
class BaseEmbeddings(ABC):
    """Base class for embeddings model.
    """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed list of texts.

        Args:
            texts (List[str]): List of texts to embed.

        Returns:
            List[List[float]]: List of embedded vectors.
        """
        pass

    def embed_query(self, text: str) -> List[float]:
        """Embed one string.

        Args:
            text (str): String to embed.

        Returns:
            List[float]: embeddings of the string. 
        """
        return self.embed_documents([text])[0]

Ancestors

  • abc.ABC

Subclasses

Methods

def embed_documents(self, texts: List[str]) ‑> List[List[float]]

Embed list of texts.

Args

texts : List[str]
List of texts to embed.

Returns

List[List[float]]
List of embedded vectors.
Expand source code
@abstractmethod
def embed_documents(self, texts: List[str]) -> List[List[float]]:
    """Embed list of texts.

    Args:
        texts (List[str]): List of texts to embed.

    Returns:
        List[List[float]]: List of embedded vectors.
    """
    pass
def embed_query(self, text: str) ‑> List[float]

Embed one string.

Args

text : str
String to embed.

Returns

List[float]
embeddings of the string.
Expand source code
def embed_query(self, text: str) -> List[float]:
    """Embed one string.

    Args:
        text (str): String to embed.

    Returns:
        List[float]: embeddings of the string. 
    """
    return self.embed_documents([text])[0]
class BaseEmbeddingsToolkit (embedding_model: Type[BaseEmbeddings], text_splitter: Type[BaseTextSplitter], tokenizer: Tokenizer, name: str, type: str, embedding_size: int, max_seq_length: int)

Base class for storing the embedding model and the text splitter.

Expand source code
class BaseEmbeddingsToolkit:
    """Base class for storing the embedding model and the text splitter.
    """
    def __init__(self, embedding_model: Type[BaseEmbeddings], text_splitter: Type[BaseTextSplitter], tokenizer: Tokenizer,
                 name: str, type: str, embedding_size: int, max_seq_length: int) -> None:
        from ..utils import validate_type
        self._embedding_model = validate_type(embedding_model, BaseEmbeddings)
        self._text_splitter = validate_type(text_splitter, BaseTextSplitter)
        self._tokenizer = validate_type(tokenizer, Tokenizer)
        self._name = validate_type(name, str)
        self._type = validate_type(type, str)
        self._embedding_size = validate_type(embedding_size, int)
        self._max_seq_length = validate_type(max_seq_length, int)

    @property
    def embedding_model(self) -> BaseEmbeddings:
        """The embedding model.

        Returns:
            BaseEmbeddings: The embedding model.
        """
        return self._embedding_model
    
    @property
    def text_splitter(self) -> BaseTextSplitter:
        """The text splitter.

        Returns:
            BaseTextSplitter: The text splitter.
        """
        return self._text_splitter
    
    @property
    def tokenizer(self) -> Tokenizer:
        """Tokenizer of the embedding model.

        Returns:
            Tokenizer: Tokenizer of the embedding model.
        """
        return self._tokenizer
    
    @property
    def embedding_size(self) -> int:
        """The embedding model's output dimensions.

        Returns:
            int: The embedding model's output dimensions.
        """
        return self._embedding_size
    
    @property
    def max_seq_length(self) -> int:
        """Maximum number of tokens used in each embedding vector.

        Returns:
            int: Maximum number of tokens used in each embedding vector.
        """
        self._max_seq_length
    
    @property
    def type(self) -> str:
        """Type of the embedding toolkit.

        Returns:
            str: Type of the embedding toolkit.
        """
        return self._type
    
    @property
    def name(self) -> str:
        """Name of the embedding model.

        Returns:
            str: Name of the embedding model.
        """
        return self._name
    
    @property
    def langchain_embeddings(self) -> LangchainEmbeddings:
        """Langchain compatible embeddings model.

        Returns:
            LangchainEmbeddings: Langchain compatible embeddings model.
        """
        return LangchainEmbeddings(self.embedding_model)
    
    def batch_embed(self, texts: List[str]) -> np.ndarray[np.float32]:
        """Embed list of texts.

        Args:
            texts (List[str]): List of text to embed.

        Returns:
            np.ndarray[np.float32]: Array of embedding vectors of the list of texts.
        """
        vectors = self.embedding_model.embed_documents(texts=texts)
        return np.array(vectors, dtype=np.float32)
    
    def embed(self, text: str) -> np.ndarray[np.float32]:
        """Embed a single string.

        Args:
            text (str): String to embed.

        Returns:
            np.ndarray[np.float32]: Vector of the embedded stirng.
        """
        return self.batch_embed([text])[0]

Subclasses

Instance variables

var embedding_modelBaseEmbeddings

The embedding model.

Returns

BaseEmbeddings
The embedding model.
Expand source code
@property
def embedding_model(self) -> BaseEmbeddings:
    """The embedding model.

    Returns:
        BaseEmbeddings: The embedding model.
    """
    return self._embedding_model
var embedding_size : int

The embedding model's output dimensions.

Returns

int
The embedding model's output dimensions.
Expand source code
@property
def embedding_size(self) -> int:
    """The embedding model's output dimensions.

    Returns:
        int: The embedding model's output dimensions.
    """
    return self._embedding_size
var langchain_embeddingsLangchainEmbeddings

Langchain compatible embeddings model.

Returns

LangchainEmbeddings
Langchain compatible embeddings model.
Expand source code
@property
def langchain_embeddings(self) -> LangchainEmbeddings:
    """Langchain compatible embeddings model.

    Returns:
        LangchainEmbeddings: Langchain compatible embeddings model.
    """
    return LangchainEmbeddings(self.embedding_model)
var max_seq_length : int

Maximum number of tokens used in each embedding vector.

Returns

int
Maximum number of tokens used in each embedding vector.
Expand source code
@property
def max_seq_length(self) -> int:
    """Maximum number of tokens used in each embedding vector.

    Returns:
        int: Maximum number of tokens used in each embedding vector.
    """
    self._max_seq_length
var name : str

Name of the embedding model.

Returns

str
Name of the embedding model.
Expand source code
@property
def name(self) -> str:
    """Name of the embedding model.

    Returns:
        str: Name of the embedding model.
    """
    return self._name
var text_splitterBaseTextSplitter

The text splitter.

Returns

BaseTextSplitter
The text splitter.
Expand source code
@property
def text_splitter(self) -> BaseTextSplitter:
    """The text splitter.

    Returns:
        BaseTextSplitter: The text splitter.
    """
    return self._text_splitter
var tokenizerTokenizer

Tokenizer of the embedding model.

Returns

Tokenizer
Tokenizer of the embedding model.
Expand source code
@property
def tokenizer(self) -> Tokenizer:
    """Tokenizer of the embedding model.

    Returns:
        Tokenizer: Tokenizer of the embedding model.
    """
    return self._tokenizer
var type : str

Type of the embedding toolkit.

Returns

str
Type of the embedding toolkit.
Expand source code
@property
def type(self) -> str:
    """Type of the embedding toolkit.

    Returns:
        str: Type of the embedding toolkit.
    """
    return self._type

Methods

def batch_embed(self, texts: List[str]) ‑> numpy.ndarray[numpy.float32]

Embed list of texts.

Args

texts : List[str]
List of text to embed.

Returns

np.ndarray[np.float32]
Array of embedding vectors of the list of texts.
Expand source code
def batch_embed(self, texts: List[str]) -> np.ndarray[np.float32]:
    """Embed list of texts.

    Args:
        texts (List[str]): List of text to embed.

    Returns:
        np.ndarray[np.float32]: Array of embedding vectors of the list of texts.
    """
    vectors = self.embedding_model.embed_documents(texts=texts)
    return np.array(vectors, dtype=np.float32)
def embed(self, text: str) ‑> numpy.ndarray[numpy.float32]

Embed a single string.

Args

text : str
String to embed.

Returns

np.ndarray[np.float32]
Vector of the embedded stirng.
Expand source code
def embed(self, text: str) -> np.ndarray[np.float32]:
    """Embed a single string.

    Args:
        text (str): String to embed.

    Returns:
        np.ndarray[np.float32]: Vector of the embedded stirng.
    """
    return self.batch_embed([text])[0]
class LangchainEmbeddings (model: Type[BaseEmbeddings])

Class for langchain compatible embeddings.

Expand source code
class LangchainEmbeddings(Embeddings):
    """Class for langchain compatible embeddings.
    """
    def __init__(self, model: Type[BaseEmbeddings]) -> None:
        self._model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self._model.embed_documents(texts=texts)
    
    def embed_query(self, text: str) -> List[float]:
        return self._model.embed_query(text=text)

Ancestors

  • langchain_core.embeddings.embeddings.Embeddings
  • abc.ABC

Methods

def embed_documents(self, texts: List[str]) ‑> List[List[float]]

Embed search docs.

Expand source code
def embed_documents(self, texts: List[str]) -> List[List[float]]:
    return self._model.embed_documents(texts=texts)
def embed_query(self, text: str) ‑> List[float]

Embed query text.

Expand source code
def embed_query(self, text: str) -> List[float]:
    return self._model.embed_query(text=text)