Module llmflex.VectorDBs.faiss_vectordb

Expand source code
from __future__ import annotations
from ..Embeddings.base_embeddings import BaseEmbeddingsToolkit
from ..TextSplitters.base_text_splitter import BaseTextSplitter
from ..Schemas.documents import Document
from .base_vectordb import BaseVectorDatabase
from typing import List, Any, Type, Optional, Tuple
import os, numpy as np

class FaissVectorDatabase(BaseVectorDatabase):
    """Vector database backed by a flat FAISS L2 index.

    Vectors are L2-normalised before they are added or searched. A document id
    is the position of its vector in the index: ids are assigned sequentially
    on insertion and renumbered from 0 after a deletion.
    """

    def __init__(self, embeddings: Type[BaseEmbeddingsToolkit], name: Optional[str] = None, vectordb_dir: Optional[str] = None,
                 text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> None:
        """Initialise a vector database.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Default text splitter for the vector database. If None is given, the embeddings toolkit text splitter will be used. Defaults to None.
        """
        # NOTE(review): text_splitter and **kwargs are accepted but not forwarded
        # to the base class, so a custom text splitter passed here is ignored by
        # this constructor. Confirm against BaseVectorDatabase.__init__ before
        # forwarding them.
        super().__init__(embeddings=embeddings, name=name, vectordb_dir=vectordb_dir)

    def _get_empty_index(self) -> Any:
        """Return an empty index.

        Returns:
            Any: An empty faiss IndexFlatL2 sized to the embeddings toolkit's embedding size.
        """
        # faiss is imported lazily inside each method rather than at module level.
        from faiss import IndexFlatL2
        return IndexFlatL2(self.embeddings.embedding_size)

    @property
    def _index_filename(self) -> str:
        """Base name of the file for the index in the vector database directory.

        Returns:
            str: Base name of the file for the index in the vector database directory.
        """
        return 'index.faiss'

    def _save_index(self) -> None:
        """Save the index of the vector database.

        The index is written to the file named by _index_filename inside self.db_dir.
        """
        from faiss import write_index
        write_index(self.index, os.path.join(self.db_dir, self._index_filename))

    def _load_index(self, index_dir: str) -> Any:
        """Load the index from an existing saved file.

        Args:
            index_dir (str): Path handed directly to faiss read_index. Despite the parameter name, this is presumably the path of the index file itself (verify against the caller).

        Returns:
            Any: The index read from disk.
        """
        from faiss import read_index
        return read_index(index_dir)

    def _add(self, vectors: np.ndarray[np.float32], docs: List[Document]) -> None:
        """Core method to add documents into the vector database.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors created by the indexes of the documents, one row per document. The array is L2-normalised in place.
            docs (List[Document]): List of documents to add.

        Raises:
            ValueError: If the number of documents differs from the number of vectors.
        """
        from faiss import normalize_L2
        docs = list(docs)
        add_size = vectors.shape[0]
        # zip() would silently drop the surplus and leave the index and the
        # document mapping out of sync, so refuse mismatched input up front.
        if len(docs) != add_size:
            raise ValueError(f'Number of documents ({len(docs)}) does not match number of vectors ({add_size}).')
        current_size = self.index.ntotal
        # New ids continue from the current index size, one per added vector.
        add_doc_dict = dict(zip(range(current_size, current_size + add_size), docs))
        normalize_L2(vectors)
        self.index.add(vectors)
        self._data.update(add_doc_dict)

    def _delete(self, ids: List[int]) -> None:
        """Core method to remove records by ids.

        The remaining documents are renumbered from 0 in their existing order.

        Args:
            ids (List[int]): Ids to remove.

        Raises:
            ValueError: If any id is negative or not smaller than the index size.
        """
        # FAISS ids are 64-bit integers.
        ids = np.array(ids, dtype=np.int64)
        # Valid ids are 0 .. ntotal - 1, so ntotal itself is already out of range.
        if np.any((ids < 0) | (ids >= self.index.ntotal)):
            raise ValueError('Non-existence ids provided in the list of ids.')
        # A set gives constant-time membership tests while filtering.
        removed = set(ids.tolist())
        kept_docs = [doc for key, doc in self.data.items() if key not in removed]
        self.index.remove_ids(ids)
        self._data = dict(zip(range(self.index.ntotal), kept_docs))

    def _batch_search_with_scores(self, vectors: np.ndarray[np.float32], k: int = 5, ids_scope: Optional[List[int]] = None) -> Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]:
        """Batch similarity search with multiple vectors.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors for the search. The array is L2-normalised in place.
            k (int, optional): Maximum results for each vector. Defaults to 5.
            ids_scope (Optional[List[int]], optional): The list of allowed ids to return for the similarity search. Defaults to None.

        Returns:
            Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]: Tuple of scores and ids. Both matrices must be in the same shape.
        """
        from faiss import normalize_L2, SearchParametersIVF, IDSelectorArray
        normalize_L2(vectors)
        if ids_scope is None:
            scores, ids = self.index.search(vectors, k=k)
        else:
            # Restrict the search to the allowed ids; there cannot be more
            # results than ids in scope.
            id_selector = IDSelectorArray(ids_scope)
            k = min(k, len(ids_scope))
            scores, ids = self.index.search(vectors, k=k, params=SearchParametersIVF(sel=id_selector))
        # Convert the values returned by the index into scores where larger
        # means more similar.
        # NOTE(review): IndexFlatL2 is documented to return squared L2
        # distances, so this score is not confined to [0, 1]; confirm that the
        # sqrt(2) scaling is the intended one before relying on absolute values.
        scores = 1 - scores / (2 ** 0.5)
        return scores, ids

    def _get_vectors_by_ids(self, ids: List[int]) -> np.ndarray[np.float32]:
        """Get the array of vectors by ids.

        Args:
            ids (List[int]): Document ids.

        Returns:
            np.ndarray[np.float32]: Array of vectors, one row per id, in the order given.

        Raises:
            ValueError: If any id is negative or not smaller than the index size.
        """
        ids_norm = np.array(ids, dtype=np.int64)
        # Valid ids are 0 .. ntotal - 1, so ntotal itself is already out of range.
        if np.any((ids_norm < 0) | (ids_norm >= self.index.ntotal)):
            raise ValueError('Non-existence ids provided in the list of ids.')
        vectors = [self.index.reconstruct(int(doc_id)) for doc_id in ids_norm]
        # The reshape keeps the result two-dimensional even when no ids are given.
        return np.array(vectors, dtype=np.float32).reshape(len(vectors), self.index.d)

Classes

class FaissVectorDatabase (embeddings: Type[BaseEmbeddingsToolkit], name: Optional[str] = None, vectordb_dir: Optional[str] = None, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs)

Vector database backed by a FAISS index; a concrete implementation of the base class for vector databases.

Initialise a vector database.

Args

embeddings : Type[BaseEmbeddingsToolkit]
Embeddings toolkit to use.
name : Optional[str], optional
Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
vectordb_dir : Optional[str], optional
Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
text_splitter : Optional[Type[BaseTextSplitter]], optional
Default text splitter for the vector database. If None is given, the embeddings toolkit text splitter will be used. Defaults to None.
Expand source code
class FaissVectorDatabase(BaseVectorDatabase):
    """Vector database backed by a flat FAISS L2 index.

    Vectors are L2-normalised before they are added or searched. A document id
    is the position of its vector in the index: ids are assigned sequentially
    on insertion and renumbered from 0 after a deletion.
    """

    def __init__(self, embeddings: Type[BaseEmbeddingsToolkit], name: Optional[str] = None, vectordb_dir: Optional[str] = None,
                 text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> None:
        """Initialise a vector database.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Default text splitter for the vector database. If None is given, the embeddings toolkit text splitter will be used. Defaults to None.
        """
        # NOTE(review): text_splitter and **kwargs are accepted but not forwarded
        # to the base class, so a custom text splitter passed here is ignored by
        # this constructor. Confirm against BaseVectorDatabase.__init__ before
        # forwarding them.
        super().__init__(embeddings=embeddings, name=name, vectordb_dir=vectordb_dir)

    def _get_empty_index(self) -> Any:
        """Return an empty index.

        Returns:
            Any: An empty faiss IndexFlatL2 sized to the embeddings toolkit's embedding size.
        """
        # faiss is imported lazily inside each method rather than at module level.
        from faiss import IndexFlatL2
        return IndexFlatL2(self.embeddings.embedding_size)

    @property
    def _index_filename(self) -> str:
        """Base name of the file for the index in the vector database directory.

        Returns:
            str: Base name of the file for the index in the vector database directory.
        """
        return 'index.faiss'

    def _save_index(self) -> None:
        """Save the index of the vector database.

        The index is written to the file named by _index_filename inside self.db_dir.
        """
        from faiss import write_index
        write_index(self.index, os.path.join(self.db_dir, self._index_filename))

    def _load_index(self, index_dir: str) -> Any:
        """Load the index from an existing saved file.

        Args:
            index_dir (str): Path handed directly to faiss read_index. Despite the parameter name, this is presumably the path of the index file itself (verify against the caller).

        Returns:
            Any: The index read from disk.
        """
        from faiss import read_index
        return read_index(index_dir)

    def _add(self, vectors: np.ndarray[np.float32], docs: List[Document]) -> None:
        """Core method to add documents into the vector database.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors created by the indexes of the documents, one row per document. The array is L2-normalised in place.
            docs (List[Document]): List of documents to add.

        Raises:
            ValueError: If the number of documents differs from the number of vectors.
        """
        from faiss import normalize_L2
        docs = list(docs)
        add_size = vectors.shape[0]
        # zip() would silently drop the surplus and leave the index and the
        # document mapping out of sync, so refuse mismatched input up front.
        if len(docs) != add_size:
            raise ValueError(f'Number of documents ({len(docs)}) does not match number of vectors ({add_size}).')
        current_size = self.index.ntotal
        # New ids continue from the current index size, one per added vector.
        add_doc_dict = dict(zip(range(current_size, current_size + add_size), docs))
        normalize_L2(vectors)
        self.index.add(vectors)
        self._data.update(add_doc_dict)

    def _delete(self, ids: List[int]) -> None:
        """Core method to remove records by ids.

        The remaining documents are renumbered from 0 in their existing order.

        Args:
            ids (List[int]): Ids to remove.

        Raises:
            ValueError: If any id is negative or not smaller than the index size.
        """
        # FAISS ids are 64-bit integers.
        ids = np.array(ids, dtype=np.int64)
        # Valid ids are 0 .. ntotal - 1, so ntotal itself is already out of range.
        if np.any((ids < 0) | (ids >= self.index.ntotal)):
            raise ValueError('Non-existence ids provided in the list of ids.')
        # A set gives constant-time membership tests while filtering.
        removed = set(ids.tolist())
        kept_docs = [doc for key, doc in self.data.items() if key not in removed]
        self.index.remove_ids(ids)
        self._data = dict(zip(range(self.index.ntotal), kept_docs))

    def _batch_search_with_scores(self, vectors: np.ndarray[np.float32], k: int = 5, ids_scope: Optional[List[int]] = None) -> Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]:
        """Batch similarity search with multiple vectors.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors for the search. The array is L2-normalised in place.
            k (int, optional): Maximum results for each vector. Defaults to 5.
            ids_scope (Optional[List[int]], optional): The list of allowed ids to return for the similarity search. Defaults to None.

        Returns:
            Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]: Tuple of scores and ids. Both matrices must be in the same shape.
        """
        from faiss import normalize_L2, SearchParametersIVF, IDSelectorArray
        normalize_L2(vectors)
        if ids_scope is None:
            scores, ids = self.index.search(vectors, k=k)
        else:
            # Restrict the search to the allowed ids; there cannot be more
            # results than ids in scope.
            id_selector = IDSelectorArray(ids_scope)
            k = min(k, len(ids_scope))
            scores, ids = self.index.search(vectors, k=k, params=SearchParametersIVF(sel=id_selector))
        # Convert the values returned by the index into scores where larger
        # means more similar.
        # NOTE(review): IndexFlatL2 is documented to return squared L2
        # distances, so this score is not confined to [0, 1]; confirm that the
        # sqrt(2) scaling is the intended one before relying on absolute values.
        scores = 1 - scores / (2 ** 0.5)
        return scores, ids

    def _get_vectors_by_ids(self, ids: List[int]) -> np.ndarray[np.float32]:
        """Get the array of vectors by ids.

        Args:
            ids (List[int]): Document ids.

        Returns:
            np.ndarray[np.float32]: Array of vectors, one row per id, in the order given.

        Raises:
            ValueError: If any id is negative or not smaller than the index size.
        """
        ids_norm = np.array(ids, dtype=np.int64)
        # Valid ids are 0 .. ntotal - 1, so ntotal itself is already out of range.
        if np.any((ids_norm < 0) | (ids_norm >= self.index.ntotal)):
            raise ValueError('Non-existence ids provided in the list of ids.')
        vectors = [self.index.reconstruct(int(doc_id)) for doc_id in ids_norm]
        # The reshape keeps the result two-dimensional even when no ids are given.
        return np.array(vectors, dtype=np.float32).reshape(len(vectors), self.index.d)

Ancestors

Inherited members