Module llmflex.VectorDBs.base_vectordb
Expand source code
from __future__ import annotations
from ..Embeddings.base_embeddings import BaseEmbeddingsToolkit
from ..TextSplitters.base_text_splitter import BaseTextSplitter
from ..Schemas.documents import Document
from abc import abstractmethod, ABC
from typing import List, Dict, Union, Any, Type, Optional, Sequence, Callable, Tuple
import os, numpy as np
def default_vectordb_dir() -> str:
    """Default home directory of vector databases.

    The directory is ``<package_home>/vector_databases``, where ``package_home``
    is read from the package configuration. It is created if it does not exist yet.

    Returns:
        str: Default home directory of vector databases.
    """
    from ..utils import get_config
    home = os.path.join(get_config()['package_home'], 'vector_databases')
    # exist_ok=True replaces the former "check, then create" sequence, which could
    # fail if another process created the directory between the two calls.
    os.makedirs(home, exist_ok=True)
    return home
def list_vectordbs(vectordb_dir: Optional[str] = None) -> List[str]:
    """List all the vector databases in the given directory.

    A vector database is any sub-directory that contains an ``info.json`` file.

    Args:
        vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, or the given directory does not exist, the default_vectordb_dir will be used. Defaults to None.

    Returns:
        List[str]: Names of the vector databases found in the directory.
    """
    # Fall back to the default home both for "not given" and for "does not exist".
    if vectordb_dir is None or not os.path.exists(vectordb_dir):
        vectordb_dir = default_vectordb_dir()
    found = []
    for entry in os.listdir(vectordb_dir):
        candidate = os.path.join(vectordb_dir, entry)
        if os.path.isdir(candidate) and os.path.exists(os.path.join(candidate, 'info.json')):
            found.append(entry)
    return found
def name_checker(name: str) -> str:
    """Validate a name by rejecting spaces, newline characters and tab characters.

    Args:
        name (str): String to check.

    Raises:
        ValueError: If the string contains a space, ``\\n``, ``\\r`` or a tab.

    Returns:
        str: The given string, unchanged, if it passes all the checks.
    """
    # Checked in this order, so the first offending kind decides the message.
    forbidden = (
        (' ', 'Spaces cannot be in the name.'),
        ('\n', 'Newline characters cannot be in the name.'),
        ('\r', 'Newline characters cannot be in the name.'),
        ('\t', 'Tab characters cannot be in the name.'),
    )
    for char, message in forbidden:
        if char in name:
            raise ValueError(message)
    return name
class BaseVectorDatabase(ABC):
    """Base class for vector databases.

    Subclasses provide the index backend by implementing the abstract hooks
    (`_get_empty_index`, `_index_filename`, `_save_index`, `_load_index`, `_add`,
    `_delete`, `_batch_search_with_scores`, `_get_vectors_by_ids`). This class
    handles embedding of texts, metadata filtering and persistence of the
    documents (`data.pkl`) and database information (`info.json`).
    """

    def __init__(self, embeddings: Type[BaseEmbeddingsToolkit], name: Optional[str] = None, vectordb_dir: Optional[str] = None,
                 text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> None:
        """Initialise a vector database.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Default text splitter for the vector database. If None is given, the embeddings toolkit text splitter will be used. Defaults to None.

        Raises:
            ValueError: If the name contains spaces, newline characters or tab characters.
        """
        self._embeddings = embeddings
        self._name = name_checker(name) if name is not None else None
        vectordb_dir = default_vectordb_dir() if vectordb_dir is None else vectordb_dir
        # Without a name there is no directory, which makes every save a no-op.
        self._db_dir = os.path.join(vectordb_dir, self.name) if self.name is not None else None
        if self.db_dir is not None:
            os.makedirs(self.db_dir, exist_ok=True)
        self._index = self._get_empty_index()
        self._data = dict()
        self._text_splitter = self.embeddings.text_splitter if text_splitter is None else text_splitter

    @property
    def embeddings(self) -> BaseEmbeddingsToolkit:
        """Embeddings toolkit used in the vector database.

        Returns:
            BaseEmbeddingsToolkit: Embeddings toolkit used in the vector database.
        """
        return self._embeddings

    @property
    def text_splitter(self) -> BaseTextSplitter:
        """Default text splitter for the vector database.

        Returns:
            BaseTextSplitter: Default text splitter for the vector database.
        """
        return self._text_splitter

    @property
    def index(self) -> Any:
        """Index of the vector database.

        Returns:
            Any: Index of the vector database.
        """
        return self._index

    @property
    def name(self) -> Optional[str]:
        """Name of the vector database.

        Returns:
            Optional[str]: Name of the vector database.
        """
        return self._name

    @property
    def info(self) -> Dict[str, Any]:
        """Information of the vector database.

        Loaded lazily on first access: read from `info.json` in the database
        directory when that file exists, otherwise created with the embeddings
        name and the current time.

        Returns:
            Dict[str, Any]: Information of the vector database.
        """
        if not hasattr(self, '_info'):
            from ..utils import current_time, read_json
            info_dir = None if self.db_dir is None else os.path.join(self.db_dir, 'info.json')
            if info_dir is not None and os.path.exists(info_dir):
                self._info = read_json(info_dir)
            else:
                self._info = dict(embeddings=self.embeddings.name, last_update=current_time())
        return self._info

    @property
    def db_dir(self) -> Optional[str]:
        """Directory of the vector database.

        Returns:
            Optional[str]: Directory of the vector database. None if the database has no name.
        """
        return self._db_dir

    @property
    def data(self) -> Dict[int, Document]:
        """Dictionary of all the documents in the vector database.

        Returns:
            Dict[int, Document]: Dictionary of all the documents in the vector database.
        """
        return self._data

    @property
    def size(self) -> int:
        """Number of documents in the vector database.

        Returns:
            int: Number of documents in the vector database.
        """
        return len(self.data)

    @abstractmethod
    def _get_empty_index(self) -> Any:
        """Return an empty index.

        Returns:
            Any: An empty index.
        """
        pass

    @property
    @abstractmethod
    def _index_filename(self) -> str:
        """Base name of the file for the index in the vector database directory.

        Returns:
            str: Base name of the file for the index in the vector database directory.
        """
        pass

    @abstractmethod
    def _save_index(self) -> None:
        """Save the index of the vector database.
        """
        pass

    @abstractmethod
    def _load_index(self, index_dir: str) -> Any:
        """Load the index from an existing saved file.

        Args:
            index_dir (str): Path of the saved index file.

        Returns:
            Any: The loaded index.
        """
        pass

    @abstractmethod
    def _add(self, vectors: np.ndarray[np.float32], docs: List[Document]) -> None:
        """Core method to add documents into the vector database.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors created by the indexes of the documents.
            docs (List[Document]): List of documents to add.
        """
        pass

    @abstractmethod
    def _delete(self, ids: List[int]) -> None:
        """Core method to remove records by ids.

        Args:
            ids (List[int]): Ids to remove.
        """
        pass

    @abstractmethod
    def _batch_search_with_scores(self, vectors: np.ndarray[np.float32], k: int = 5, ids_scope: Optional[List[int]] = None) -> Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]:
        """Batch similarity search with multiple vectors.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors for the search.
            k (int, optional): Maximum results for each vector. Defaults to 5.
            ids_scope (Optional[List[int]], optional): The list of allowed ids to return for the similarity search. Defaults to None.

        Returns:
            Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]: Tuple of scores and ids. Both matrices must be in the same shape.
        """
        pass

    @abstractmethod
    def _get_vectors_by_ids(self, ids: List[int]) -> np.ndarray[np.float32]:
        """Get the array of vectors by ids.

        Args:
            ids (List[int]): Document ids.

        Returns:
            np.ndarray[np.float32]: Array of vectors.
        """
        pass

    @classmethod
    def from_exist(cls, embeddings: Type[BaseEmbeddingsToolkit], name: str, vectordb_dir: Optional[str] = None,
                   text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase:
        """Load the vector database from an existing vector database.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            name (str): Name of the existing database.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.

        Raises:
            ValueError: If the name is invalid or no database with that name exists.
            FileExistsError: If neither `data.pkl` nor the old-format `index.pkl` is present.

        Returns:
            BaseVectorDatabase: The initialised vector database.
        """
        vectordbs_dir = default_vectordb_dir() if vectordb_dir is None else vectordb_dir
        name = name_checker(name)
        existing_dbs = list_vectordbs(vectordb_dir=vectordbs_dir)
        if name not in existing_dbs:
            raise ValueError(f'The vector database "{name}" does not exist.')

        from ..utils import read_json
        import pickle
        db_info_dir = os.path.join(vectordbs_dir, name, 'info.json')
        db_info = read_json(db_info_dir)
        db_embeddings_name = db_info.get('embeddings', None)
        vdb = cls(embeddings, name, vectordbs_dir, text_splitter)

        # NOTE(security): pickle can execute arbitrary code while loading. Only open
        # vector database directories that come from a trusted source.
        data_dir = os.path.join(vdb.db_dir, 'data.pkl')
        if os.path.exists(data_dir):
            with open(data_dir, 'rb') as f:
                vdb._data = pickle.load(f)
        else:  # recovering from old format
            print('Trying to recover from old format...')
            old_data_dir = os.path.join(vdb.db_dir, 'index.pkl')
            if os.path.exists(old_data_dir):
                with open(old_data_dir, 'rb') as f:
                    data = pickle.load(f)
                # Old layout: the first pickled element holds a `_dict` whose values
                # expose `page_content` and `metadata`.
                data = [Document(index=x.page_content, metadata=x.metadata) for x in data[0]._dict.values()]
                vdb.add_documents(data, split_text=False)
            else:
                # NOTE(review): FileNotFoundError would describe this better; the type
                # is kept because existing callers may catch FileExistsError.
                raise FileExistsError(f'No raw data has been saved. Vector database cannot be recovered.')

        if db_embeddings_name == embeddings.name:
            vdb._index = vdb._load_index(os.path.join(vdb.db_dir, vdb._index_filename))
        else:
            print(f'You are using a different embeddings model. Switching from embedding model {db_embeddings_name} to {embeddings.name}.')
            # NOTE(review): the documents are already in vdb.data when they are re-added
            # here; whether that duplicates records depends on the subclass `_add` - confirm.
            vdb.add_documents(list(vdb.data.values()), split_text=False)
            vdb.info['embeddings'] = embeddings.name
            vdb.save()
        return vdb

    @classmethod
    def from_documents(cls, embeddings: Type[BaseEmbeddingsToolkit], docs: List[Document],
                       name: Optional[str] = None, vectordb_dir: Optional[str] = None,
                       split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase:
        """Load the vector database from existing documents.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            docs (List[Document]): List of documents to use.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            split_text (bool, optional): Whether to split the documents with the text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.

        Returns:
            BaseVectorDatabase: The initialised vector database.
        """
        vdb = cls(embeddings, name, vectordb_dir, text_splitter)
        vdb.add_documents(docs=docs, split_text=split_text, text_splitter=text_splitter)
        vdb.save()  # In case an empty list of docs is given.
        return vdb

    @classmethod
    def from_texts(cls, embeddings: Type[BaseEmbeddingsToolkit], texts: List[str], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
                   name: Optional[str] = None, vectordb_dir: Optional[str] = None,
                   split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase:
        """Load the vector database from existing texts.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            texts (List[str]): List of texts to add.
            metadata (Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional): Metadata to add along with the texts. Defaults to None.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            split_text (bool, optional): Whether to split the documents with the text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.

        Returns:
            BaseVectorDatabase: The initialised vector database.
        """
        vdb = cls(embeddings, name, vectordb_dir, text_splitter)
        vdb.add_texts(texts=texts, metadata=metadata, split_text=split_text, text_splitter=text_splitter)
        vdb.save()  # In case an empty list of texts is given.
        return vdb

    def _save_data(self) -> None:
        """Save the documents in the vector database as `data.pkl` in the database directory.
        """
        if self.db_dir is not None:
            import pickle
            with open(os.path.join(self.db_dir, 'data.pkl'), 'wb') as f:
                pickle.dump(self.data, f)

    def _save_info(self) -> None:
        """Refresh the `last_update` entry and save the information as `info.json`.
        """
        from ..utils import save_json, current_time
        self.info['last_update'] = current_time()
        if self.db_dir is not None:
            save_json(self.info, os.path.join(self.db_dir, 'info.json'))

    def save(self) -> None:
        """Save the vector database. Does nothing if the database has no directory.
        """
        if self.db_dir is not None:
            self._save_data()
            self._save_index()
            self._save_info()

    def _filter_items(self, filter_fn: Optional[Callable[[Document], bool]], metadata_filters: Dict[str, Any]) -> List[Tuple[int, Document]]:
        """Return the (id, document) pairs that pass filter_fn and every metadata equality filter.

        Args:
            filter_fn (Optional[Callable[[Document], bool]]): Predicate on the document. Ignored if falsy.
            metadata_filters (Dict[str, Any]): Metadata keys with the values they must equal.

        Returns:
            List[Tuple[int, Document]]: Matching pairs, in the order of `self.data`.
        """
        # Evaluated eagerly in a single pass, so every key/value pair is applied.
        return [
            (doc_id, doc) for doc_id, doc in self.data.items()
            if (not filter_fn or filter_fn(doc))
            and all(doc.metadata.get(key) == value for key, value in metadata_filters.items())
        ]

    def add_documents(self, docs: List[Document], split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None) -> None:
        """Add documents into the vector database.

        Args:
            docs (List[Document]): List of documents to add.
            split_text (bool, optional): Whether to split the documents with the text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the default text splitter of the vector database will be used. Defaults to None.
        """
        if len(docs) == 0:
            return
        text_splitter = self.text_splitter if text_splitter is None else text_splitter
        docs = text_splitter.split_documents(docs) if split_text else docs
        # The `index` of each document is the text that gets embedded.
        texts = [doc.index for doc in docs]
        vectors = np.array(self.embeddings.batch_embed(texts), dtype=np.float32)
        self._add(vectors=vectors, docs=docs)
        self.save()

    def add_docs_with_vectors(self, vectors: Sequence[Sequence[float]], docs: List[Document]) -> None:
        """Add documents with pre-embedded vectors into the vector database.

        Args:
            vectors (Sequence[Sequence[float]]): Pre-embedded vectors.
            docs (List[Document]): List of documents.

        Raises:
            ValueError: If vectors are given and their number differs from the number of documents.
        """
        len_vec = len(vectors)
        len_doc = len(docs)
        if len_vec == 0:
            return
        if len_vec != len_doc:
            raise ValueError(f'{len_vec} vectors are given but the number of documents given is {len_doc}. Make sure the number of documents match with the number of vectors given.')
        vectors = np.array(vectors, dtype=np.float32)
        self._add(vectors, docs)
        self.save()

    def add_texts(self, texts: List[str], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
                  split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None) -> None:
        """Add texts into the vector database.

        Args:
            texts (List[str]): List of texts to add.
            metadata (Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional): Metadata to add along with the texts. A single dictionary is applied to every text; a list must have one entry per text. Defaults to None.
            split_text (bool, optional): Whether to split the documents with the text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the default text splitter of the vector database will be used. Defaults to None.

        Raises:
            ValueError: If a list of metadata is given and its length differs from the number of texts.
        """
        if len(texts) == 0:
            return
        if metadata is None:
            # One fresh dict per text: `[dict()] * n` would make all documents share one object.
            metadata = [dict() for _ in texts]
        elif isinstance(metadata, list):
            if len(metadata) != len(texts):
                raise ValueError('Number of texts does not match with number of metadata.')
        else:
            # Copy per text so editing one document's metadata cannot leak into the
            # others or into the caller's dictionary.
            metadata = [dict(metadata) for _ in texts]
        docs = [Document(index=text, metadata=meta) for text, meta in zip(texts, metadata)]
        self.add_documents(docs=docs, split_text=split_text, text_splitter=text_splitter)

    def batch_search(self, queries: List[str], top_k: int = 5, index_only: bool = True,
                     batch_size: int = 100, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> List[List[Union[str, Dict[str, Any]]]]:
        """Batch similarity search on multiple queries.

        Args:
            queries (List[str]): List of queries.
            top_k (int, optional): Maximum number of results for each query. Defaults to 5.
            index_only (bool, optional): Whether to return the list of indexes only. Defaults to True.
            batch_size (int, optional): Batch size to perform similarity search. Defaults to 100.
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function to limit the scope of similarity search. Defaults to None.
            **kwargs: Metadata filters; only documents whose metadata equals every given key/value pair are searched.

        Raises:
            ValueError: If batch_size is smaller than 1.

        Returns:
            List[List[Union[str, Dict[str, Any]]]]: List of list of search results. Each result is the index string, or a dictionary with the keys "index", "score", "id" and "metadata" if index_only is False.
        """
        import gc
        if len(queries) == 0:
            return []

        # Filtering the scope of search
        ids_scope = None
        if filter_fn or kwargs:
            ids_scope = [doc_id for doc_id, _ in self._filter_items(filter_fn, kwargs)]
        top_k = min(self.size, top_k) if ids_scope is None else min(self.size, top_k, len(ids_scope))
        if top_k == 0:
            # A separate list per query; `[[]] * n` would alias a single list.
            return [[] for _ in queries]
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1.')

        scores = list()
        ids = list()
        for start in range(0, len(queries), batch_size):
            # Same conversion as in add_documents, so the backend always gets float32 arrays.
            qvecs = np.asarray(self.embeddings.batch_embed(queries[start:start + batch_size]), dtype=np.float32)
            batch_scores, batch_ids = self._batch_search_with_scores(vectors=qvecs, k=top_k, ids_scope=ids_scope)
            scores.append(batch_scores)
            ids.append(batch_ids)
            del qvecs
            gc.collect()
        scores = np.concatenate(scores, axis=0)
        ids = np.concatenate(ids, axis=0)

        # tolist() yields plain Python floats and ints, one row per query.
        results = list()
        for score_row, id_row in zip(scores.tolist(), ids.tolist()):
            row = list()
            for score, doc_id in zip(score_row, id_row):
                doc = self.data[doc_id]
                if index_only:
                    row.append(doc.index)
                else:
                    row.append(dict(index=doc.index, score=score, id=doc_id, metadata=doc.metadata))
            results.append(row)
        return results

    def search(self, query: str, top_k: int = 5, index_only: bool = True, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> List[Union[str, Dict[str, Any]]]:
        """Similarity search on the given query.

        Args:
            query (str): Query for similarity search.
            top_k (int, optional): Maximum number of results. Defaults to 5.
            index_only (bool, optional): Whether to return the list of indexes only. Defaults to True.
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function to limit the scope of similarity search. Defaults to None.
            **kwargs: Metadata filters, see batch_search.

        Returns:
            List[Union[str, Dict[str, Any]]]: List of search results.
        """
        return self.batch_search(queries=[query], top_k=top_k, index_only=index_only, filter_fn=filter_fn, **kwargs)[0]

    def search_by_metadata(self, ids_only: bool = False, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> Union[List[int], Dict[int, Document]]:
        """Search documents or ids by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.

        Args:
            ids_only (bool, optional): Whether to return a list of ids or a dictionary with the ids as keys and documents as values. Defaults to False.
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function. Defaults to None.
            **kwargs: Metadata keys with the values they must equal.

        Returns:
            Union[List[int], Dict[int, Document]]: List of ids or dictionary with the ids as keys and documents as values.
        """
        matches = self._filter_items(filter_fn, kwargs)
        if ids_only:
            return [doc_id for doc_id, _ in matches]
        return dict(matches)

    def delete_by_metadata(self, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> None:
        """Remove records by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.

        Args:
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function. Defaults to None.
            **kwargs: Metadata keys with the values they must equal.

        Raises:
            ValueError: If neither keyword arguments nor filter_fn are given.
        """
        # Refuse an unfiltered delete; wiping everything has to go through clear().
        if ((not kwargs) and (not filter_fn)):
            raise ValueError('No keyword arguments or filter_fn are passed. Use the "clear" method to clear the entire database.')
        ids = self.search_by_metadata(ids_only=True, filter_fn=filter_fn, **kwargs)
        self._delete(ids)
        self.save()

    def clear(self) -> None:
        """Clear the entire vector database. Use it with caution.
        """
        import gc
        # Drop the old index and documents first so they can be collected before
        # the replacements are created.
        del self._index
        del self._data
        gc.collect()
        self._index = self._get_empty_index()
        self._data = dict()
        self.save()
Functions
def default_vectordb_dir() ‑> str-
Default home directory of vector databases.
Returns
str- Default home directory of vector databases.
Expand source code
def default_vectordb_dir() -> str: """Default home directory of vector databases. Returns: str: Default home directory of vector databases. """ from ..utils import get_config home = os.path.join(get_config()['package_home'], 'vector_databases') if not os.path.exists(home): os.makedirs(home) return home def list_vectordbs(vectordb_dir: Optional[str] = None) ‑> List[str]-
List all the vector databases in the given directory.
Args
vectordb_dir:Optional[str], optional- Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
Returns
List[str]- List all the vector databases in the given directory.
Expand source code
def list_vectordbs(vectordb_dir: Optional[str] = None) -> List[str]: """List all the vector databases in the given directory. Args: vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None. Returns: List[str]: List all the vector databases in the given directory. """ vectordb_dir = default_vectordb_dir() if vectordb_dir is None else vectordb_dir vectordb_dir = vectordb_dir if os.path.exists(vectordb_dir) else default_vectordb_dir() dbs = list(filter(lambda x: os.path.isdir(os.path.join(vectordb_dir, x)), os.listdir(vectordb_dir))) dbs = list(filter(lambda x: os.path.exists(os.path.join(vectordb_dir, x, 'info.json')), dbs)) return dbs def name_checker(name: str) ‑> str-
Raise error if the given string has space, newline characters, or tab characters.
Args
name:str- String to check.
Returns
str- Return the given text if it passes all the checks.
Expand source code
def name_checker(name: str) -> str: """Raise error if the given string has space, newline characters, or tab characters. Args: name (str): String to check. Returns: str: Return the given text if it passes all the checkes. """ if ' ' in name: raise ValueError(f'Spaces cannot be in the name.') if '\n' in name: raise ValueError(f'Newline characters cannot be in the name.') if '\r' in name: raise ValueError(f'Newline characters cannot be in the name.') if '\t' in name: raise ValueError(f'Tab characters cannot be in the name.') return name
Classes
class BaseVectorDatabase (embeddings: Type[BaseEmbeddingsToolkit], name: Optional[str] = None, vectordb_dir: Optional[str] = None, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs)-
Base class for vector databases.
Initialise a vector database.
Args
embeddings:Type[BaseEmbeddingsToolkit]- Embeddings toolkit to use.
name:Optional[str], optional- Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
vectordb_dir:Optional[str], optional- Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
text_splitter:Optional[Type[BaseTextSplitter]], optional- Default text splitter for the vector database. If None is given, the embeddings toolkit text splitter will be used. Defaults to None.
Expand source code
class BaseVectorDatabase(ABC):
    """Base class for vector databases.

    Stores documents in a dictionary keyed by integer id, persists them next to an
    index file and an ``info.json`` file, and exposes the public add / search /
    delete API. Subclasses provide the actual index through the abstract ``_*`` hooks.
    """

    def __init__(self, embeddings: Type[BaseEmbeddingsToolkit], name: Optional[str] = None,
                 vectordb_dir: Optional[str] = None,
                 text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> None:
        """Initialise a vector database.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Default text splitter for the vector database. If None is given, the embeddings toolkit text splitter will be used. Defaults to None.
            **kwargs: Not used by this base initialiser.

        Raises:
            ValueError: If the name contains spaces, newline or tab characters.
        """
        self._embeddings = embeddings
        self._name = name_checker(name) if name is not None else None
        vectordb_dir = default_vectordb_dir() if vectordb_dir is None else vectordb_dir
        # Without a name there is no directory, which is what disables saving everywhere else.
        self._db_dir = os.path.join(vectordb_dir, self.name) if self.name is not None else None
        if self.db_dir is not None:
            os.makedirs(self.db_dir, exist_ok=True)
        self._index = self._get_empty_index()
        self._data = dict()
        self._text_splitter = self.embeddings.text_splitter if text_splitter is None else text_splitter

    @property
    def embeddings(self) -> BaseEmbeddingsToolkit:
        """Embeddings toolkit used in the vector database.

        Returns:
            BaseEmbeddingsToolkit: Embeddings toolkit used in the vector database.
        """
        return self._embeddings

    @property
    def text_splitter(self) -> BaseTextSplitter:
        """Default text splitter for the vector database.

        Returns:
            BaseTextSplitter: Default text splitter for the vector database.
        """
        return self._text_splitter

    @property
    def index(self) -> Any:
        """Index of the vector database.

        Returns:
            Any: Index of the vector database.
        """
        return self._index

    @property
    def name(self) -> Optional[str]:
        """Name of the vector database.

        Returns:
            Optional[str]: Name of the vector database.
        """
        return self._name

    @property
    def info(self) -> Dict[str, Any]:
        """Information of the vector database.

        Built lazily on first access: read from ``info.json`` in the database directory
        when that file exists, otherwise created with the embeddings name and the current time.

        Returns:
            Dict[str, Any]: Information of the vector database.
        """
        if not hasattr(self, '_info'):
            from ..utils import current_time, read_json
            info_dir = None if self.db_dir is None else os.path.join(self.db_dir, 'info.json')
            if info_dir is not None and os.path.exists(info_dir):
                self._info = read_json(info_dir)
            else:
                self._info = dict(embeddings=self.embeddings.name, last_update=current_time())
        return self._info

    @property
    def db_dir(self) -> Optional[str]:
        """Directory of the vector database.

        Returns:
            Optional[str]: Directory of the vector database, or None if the database has no name.
        """
        return self._db_dir

    @property
    def data(self) -> Dict[int, Document]:
        """Dictionary of all the documents in the vector database.

        Returns:
            Dict[int, Document]: Dictionary of all the documents in the vector database.
        """
        return self._data

    @property
    def size(self) -> int:
        """Number of documents in the vector database.

        Returns:
            int: Number of documents in the vector database.
        """
        return len(self.data)

    @abstractmethod
    def _get_empty_index(self) -> Any:
        """Return an empty index.

        Returns:
            Any: An empty index.
        """
        pass

    @property
    @abstractmethod
    def _index_filename(self) -> str:
        """Base name of the file for the index in the vector database directory.

        Returns:
            str: Base name of the file for the index in the vector database directory.
        """
        pass

    @abstractmethod
    def _save_index(self) -> None:
        """Save the index of the vector database.
        """
        pass

    @abstractmethod
    def _load_index(self, index_dir: str) -> Any:
        """Load the index from an existing saved file.

        Args:
            index_dir (str): Path of the saved index file.

        Returns:
            Any: The loaded index.
        """
        pass

    @abstractmethod
    def _add(self, vectors: np.ndarray[np.float32], docs: List[Document]) -> None:
        """Core method to add documents into the vector database.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors created by the indexes of the documents.
            docs (List[Document]): List of documents to add.
        """
        pass

    @abstractmethod
    def _delete(self, ids: List[int]) -> None:
        """Core method to remove records by ids.

        Args:
            ids (List[int]): Ids to remove.
        """
        pass

    @abstractmethod
    def _batch_search_with_scores(self, vectors: np.ndarray[np.float32], k: int = 5,
                                  ids_scope: Optional[List[int]] = None) -> Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]:
        """Batch similarity search with multiple vectors.

        Args:
            vectors (np.ndarray[np.float32]): Array of vectors for the search.
            k (int, optional): Maximum results for each vector. Defaults to 5.
            ids_scope (Optional[List[int]], optional): The list of allowed ids to return for the similarity search. Defaults to None.

        Returns:
            Tuple[np.ndarray[np.float32], np.ndarray[np.int64]]: Tuple of scores and ids. Both matrices must be in the same shape.
        """
        pass

    @abstractmethod
    def _get_vectors_by_ids(self, ids: List[int]) -> np.ndarray[np.float32]:
        """Get the array of vectors by ids.

        Args:
            ids (List[int]): Document ids.

        Returns:
            np.ndarray[np.float32]: Array of vectors.
        """
        pass

    @classmethod
    def from_exist(cls, embeddings: Type[BaseEmbeddingsToolkit], name: str, vectordb_dir: Optional[str] = None,
                   text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase:
        """Load the vector database from an existing vector database.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            name (str): Name of the existing database.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
            **kwargs: Not used.

        Raises:
            ValueError: If the name is invalid or no database with that name exists.
            FileExistsError: If neither 'data.pkl' nor the old-format 'index.pkl' is present.

        Returns:
            BaseVectorDatabase: The initialised vector database.
        """
        vectordbs_dir = default_vectordb_dir() if vectordb_dir is None else vectordb_dir
        name = name_checker(name)
        existing_dbs = list_vectordbs(vectordb_dir=vectordbs_dir)
        if name not in existing_dbs:
            raise ValueError(f'The vector database "{name}" does not exist.')
        from ..utils import read_json
        import pickle
        db_info_dir = os.path.join(vectordbs_dir, name, 'info.json')
        db_info = read_json(db_info_dir)
        db_embeddings_name = db_info.get('embeddings', None)
        vdb = cls(embeddings, name, vectordbs_dir, text_splitter)
        data_dir = os.path.join(vdb.db_dir, 'data.pkl')
        # NOTE: pickle.load executes arbitrary code from the file; only load databases you trust.
        if os.path.exists(data_dir):
            with open(data_dir, 'rb') as f:
                vdb._data = pickle.load(f)
        else:
            # recovering from old format
            print('Trying to recover from old format...')
            old_data_dir = os.path.join(vdb.db_dir, 'index.pkl')
            if not os.path.exists(old_data_dir):
                # Exception type kept as-is so existing callers catching it keep working.
                raise FileExistsError('No raw data has been saved. Vector database cannot be recovered.')
            with open(old_data_dir, 'rb') as f:
                data = pickle.load(f)
            data = [Document(index=old.page_content, metadata=old.metadata) for old in data[0]._dict.values()]
            vdb.add_documents(data, split_text=False)
        if db_embeddings_name == embeddings.name:
            vdb._index = vdb._load_index(os.path.join(vdb.db_dir, vdb._index_filename))
        else:
            print(f'You are using a different embeddings model. Switching from embedding model {db_embeddings_name} to {embeddings.name}.')
            # NOTE(review): _data is already populated at this point; whether _add replaces or
            # duplicates these entries depends on the subclass implementation - confirm.
            vdb.add_documents(list(vdb.data.values()), split_text=False)
            vdb.info['embeddings'] = embeddings.name
            vdb.save()
        return vdb

    @classmethod
    def from_documents(cls, embeddings: Type[BaseEmbeddingsToolkit], docs: List[Document], name: Optional[str] = None,
                       vectordb_dir: Optional[str] = None, split_text: bool = True,
                       text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase:
        """Load the vector database from existing documents.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            docs (List[Document]): List of documents to use.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            split_text (bool, optional): Whether to split the documents with the embeddings toolkit text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
            **kwargs: Not used.

        Returns:
            BaseVectorDatabase: The initialised vector database.
        """
        vdb = cls(embeddings, name, vectordb_dir, text_splitter)
        vdb.add_documents(docs=docs, split_text=split_text, text_splitter=text_splitter)
        vdb.save()  # In case an empty list of docs is given.
        return vdb

    @classmethod
    def from_texts(cls, embeddings: Type[BaseEmbeddingsToolkit], texts: List[str],
                   metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, name: Optional[str] = None,
                   vectordb_dir: Optional[str] = None, split_text: bool = True,
                   text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase:
        """Load the vector database from existing texts.

        Args:
            embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
            texts (List[str]): List of texts to add.
            metadata (Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional): Metadata to add along with the texts. Defaults to None.
            name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
            vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
            split_text (bool, optional): Whether to split the documents with the embeddings toolkit text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
            **kwargs: Not used.

        Returns:
            BaseVectorDatabase: The initialised vector database.
        """
        vdb = cls(embeddings, name, vectordb_dir, text_splitter)
        vdb.add_texts(texts=texts, metadata=metadata, split_text=split_text, text_splitter=text_splitter)
        vdb.save()  # In case an empty list of texts is given.
        return vdb

    def _save_data(self) -> None:
        """Save the documents in the vector database.

        Writes ``data.pkl`` in the database directory; does nothing when the database has no directory.
        """
        if self.db_dir is not None:
            import pickle
            with open(os.path.join(self.db_dir, 'data.pkl'), 'wb') as f:
                pickle.dump(self.data, f)

    def _save_info(self) -> None:
        """Save information about the vector database.

        The 'last_update' entry is refreshed even when there is no directory to write to.
        """
        from ..utils import save_json, current_time
        self.info['last_update'] = current_time()
        if self.db_dir is not None:
            save_json(self.info, os.path.join(self.db_dir, 'info.json'))

    def save(self) -> None:
        """Save the vector database.

        Does nothing when the database has no directory (i.e. it was created without a name).
        """
        if self.db_dir is not None:
            self._save_data()
            self._save_index()
            self._save_info()

    def add_documents(self, docs: List[Document], split_text: bool = True,
                      text_splitter: Optional[Type[BaseTextSplitter]] = None) -> None:
        """Add documents into the vector database.

        Args:
            docs (List[Document]): List of documents to add.
            split_text (bool, optional): Whether to split the documents with the text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the default text splitter of the vector database will be used. Defaults to None.
        """
        if len(docs) == 0:
            return
        text_splitter = self.text_splitter if text_splitter is None else text_splitter
        docs = text_splitter.split_documents(docs) if split_text else docs
        # The "index" of a document is the text that gets embedded.
        vectors = self.embeddings.batch_embed([doc.index for doc in docs])
        vectors = np.array(vectors, dtype=np.float32)
        self._add(vectors=vectors, docs=docs)
        self.save()

    def add_docs_with_vectors(self, vectors: Sequence[Sequence[float]], docs: List[Document]) -> None:
        """Add documents with pre-embedded vectors into the vector database.

        Nothing is added when no vectors are given.

        Args:
            vectors (Sequence[Sequence[float]]): Pre-embedded vectors.
            docs (List[Document]): List of documents.

        Raises:
            ValueError: If at least one vector is given and the number of documents differs from the number of vectors.
        """
        len_vec = len(vectors)
        len_doc = len(docs)
        if len_vec == 0:
            return
        if len_vec != len_doc:
            raise ValueError(f'{len_vec} vectors are given but the number of documents given is {len_doc}. Make sure the number of documents match with the number of vectors given.')
        vectors = np.array(vectors, dtype=np.float32)
        self._add(vectors, docs)
        self.save()

    def add_texts(self, texts: List[str], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
                  split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None) -> None:
        """Add texts into the vector database.

        Args:
            texts (List[str]): List of texts to add.
            metadata (Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional): Metadata to add along with the texts. A single dictionary is applied to every text; a list must have one entry per text. Defaults to None.
            split_text (bool, optional): Whether to split the documents with the text splitter. Defaults to True.
            text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the default text splitter of the vector database will be used. Defaults to None.

        Raises:
            ValueError: If a list of metadata is given whose length differs from the number of texts.
        """
        if len(texts) == 0:
            return
        if metadata is None:
            # One fresh dict per text, so documents never share a metadata object.
            metadata = [dict() for _ in texts]
        elif isinstance(metadata, list):
            if len(metadata) != len(texts):
                raise ValueError('Number of texts does not match with number of metadata.')
        else:
            # Copy the shared dict per text, so editing one document's metadata cannot leak into the others.
            metadata = [dict(metadata) for _ in texts]
        docs = [Document(index=text, metadata=meta) for text, meta in zip(texts, metadata)]
        self.add_documents(docs=docs, split_text=split_text, text_splitter=text_splitter)

    def batch_search(self, queries: List[str], top_k: int = 5, index_only: bool = True, batch_size: int = 100,
                     filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> List[List[Union[str, Dict[str, Any]]]]:
        """Batch similarity search on multiple queries.

        Args:
            queries (List[str]): List of queries.
            top_k (int, optional): Maximum number of results for each query. Defaults to 5.
            index_only (bool, optional): Whether to return the list of indexes only. Defaults to True.
            batch_size (int, optional): Batch size to perform similarity search. Defaults to 100.
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function to limit the scope of similarity search. Defaults to None.
            **kwargs: Metadata filters; a document is in scope only if ``metadata.get(key) == value`` holds for every keyword given.

        Raises:
            ValueError: If batch_size is smaller than 1.

        Returns:
            List[List[Union[str, Dict[str, Any]]]]: List of list of search results. With index_only=False each result is a dictionary with the keys 'index', 'score', 'id' and 'metadata'.
        """
        import gc
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer.')
        if len(queries) == 0:
            return []

        # Filtering the scope of search. The scope is built eagerly: stacking lazy filters whose
        # lambdas close over the loop variables would make every filter test the last keyword only.
        ids_scope = None
        if filter_fn or kwargs:
            ids_scope = [
                doc_id for doc_id, doc in self.data.items()
                if (not filter_fn or filter_fn(doc))
                and all(doc.metadata.get(key) == value for key, value in kwargs.items())
            ]

        top_k = min(self.size, top_k) if ids_scope is None else min(self.size, top_k, len(ids_scope))
        if top_k == 0:
            # A separate list per query, so callers can mutate one result without touching the others.
            return [[] for _ in queries]

        scores = list()
        ids = list()
        for start in range(0, len(queries), batch_size):
            qvecs = self.embeddings.batch_embed(queries[start:start + batch_size])
            batch_scores, batch_ids = self._batch_search_with_scores(vectors=qvecs, k=top_k, ids_scope=ids_scope)
            scores.append(batch_scores)
            ids.append(batch_ids)
            # Free the query vectors of this batch before embedding the next one.
            del qvecs
            gc.collect()
        scores = np.concatenate(scores, axis=0)
        ids = np.concatenate(ids, axis=0)

        # tolist() turns the numpy scalars into plain Python floats and ints.
        results = list()
        for row_scores, row_ids in zip(scores.tolist(), ids.tolist()):
            row = list()
            for score, doc_id in zip(row_scores, row_ids):
                doc = self.data[doc_id]
                if index_only:
                    row.append(doc.index)
                else:
                    row.append(dict(index=doc.index, score=score, id=doc_id, metadata=doc.metadata))
            results.append(row)
        return results

    def search(self, query: str, top_k: int = 5, index_only: bool = True,
               filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> List[Union[str, Dict[str, Any]]]:
        """Similarity search on the given query.

        Args:
            query (str): Query for similarity search.
            top_k (int, optional): Maximum number of results. Defaults to 5.
            index_only (bool, optional): Whether to return the list of indexes only. Defaults to True.
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function to limit the scope of similarity search. Defaults to None.
            **kwargs: Metadata filters, forwarded to batch_search.

        Returns:
            List[Union[str, Dict[str, Any]]]: List of search results.
        """
        return self.batch_search(queries=[query], top_k=top_k, index_only=index_only, filter_fn=filter_fn, **kwargs)[0]

    def search_by_metadata(self, ids_only: bool = False, filter_fn: Optional[Callable[[Document], bool]] = None,
                           **kwargs) -> Union[List[int], Dict[int, Document]]:
        """Search documents or ids by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.

        Args:
            ids_only (bool, optional): Whether to return a list of ids or a dictionary with the ids as keys and documents as values. Defaults to False.
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function. Defaults to None.
            **kwargs: Metadata filters; a document matches only if ``metadata.get(key) == value`` holds for every keyword given.

        Returns:
            Union[List[int], Dict[int, Document]]: List of ids or dictionary with the ids as keys and documents as values.
        """
        results = self.data.items()
        if filter_fn:
            results = filter(lambda item: filter_fn(item[1]), results)
        for key, value in kwargs.items():
            results = [item for item in results if item[1].metadata.get(key) == value]
        if ids_only:
            return [doc_id for doc_id, _ in results]
        return dict(results)

    def delete_by_metadata(self, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> None:
        """Remove records by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.

        Args:
            filter_fn (Optional[Callable[[Document], bool]], optional): The filter function. Defaults to None.
            **kwargs: Metadata filters, as in search_by_metadata.

        Raises:
            ValueError: If neither keyword arguments nor a filter_fn are given.
        """
        if (not kwargs) and (not filter_fn):
            raise ValueError('No keyword arguments or filter_fn are passed. Use the "clear" method to clear the entire database.')
        ids = self.search_by_metadata(ids_only=True, filter_fn=filter_fn, **kwargs)
        self._delete(ids)
        self.save()

    def clear(self) -> None:
        """Clear the entire vector database. Use it with caution.

        The index and the documents are replaced by empty ones and the result is saved.
        """
        import gc
        # Drop the old index and documents first so they can be collected before the new ones are created.
        del self._index
        del self._data
        gc.collect()
        self._index = self._get_empty_index()
        self._data = dict()
        self.save()
# Ancestors
- abc.ABC
Subclasses
Static methods
def from_documents(embeddings: Type[BaseEmbeddingsToolkit], docs: List[Document], name: Optional[str] = None, vectordb_dir: Optional[str] = None, split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) ‑> BaseVectorDatabase-
Load the vector database from existing documents.
Args
embeddings:Type[BaseEmbeddingsToolkit]- Embeddings toolkit to use.
docs:List[Document]- List of documents to use.
name:Optional[str], optional- Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
vectordb_dir:Optional[str], optional- Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
split_text:bool, optional- Whether to split the documents with the embeddings toolkit text splitter. Defaults to True.
text_splitter:Optional[Type[BaseTextSplitter]], optional- Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
Returns
BaseVectorDatabase- The initialised vector database.
Expand source code
@classmethod def from_documents(cls, embeddings: Type[BaseEmbeddingsToolkit], docs: List[Document], name: Optional[str] = None, vectordb_dir: Optional[str] = None, split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase: """Load the vector database from existing documents. Args: embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use. docs (List[Document]): List of documents to use. name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None. vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None. split_text (bool, optional): Whether to split the docuements with the embeddings toolkit text splitter. Defaults to True. text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None. Returns: BaseVectorDatabase: The initialised vector database. """ vdb = cls(embeddings, name, vectordb_dir, text_splitter) vdb.add_documents(docs=docs, split_text=split_text, text_splitter=text_splitter) vdb.save() # In case an empty list of docs is given. return vdb def from_exist(embeddings: Type[BaseEmbeddingsToolkit], name: str, vectordb_dir: Optional[str] = None, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) ‑> BaseVectorDatabase-
Load the vector database from an existing vector database.
Args
embeddings:Type[BaseEmbeddingsToolkit]- Embeddings toolkit to use.
name:str- Name of the existing database.
vectordb_dir:Optional[str], optional- Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
text_splitter:Optional[Type[BaseTextSplitter]], optional- Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
Returns
BaseVectorDatabase- The initialised vector database.
Expand source code
@classmethod def from_exist(cls, embeddings: Type[BaseEmbeddingsToolkit], name: str, vectordb_dir: Optional[str] = None, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase: """Load the vector database from an existing vector database. Args: embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use. name (str): Name of the existing database. vectordbs_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None. text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None. Returns: BaseVectorDatabase: The initialised vector database. """ vectordbs_dir = default_vectordb_dir() if vectordb_dir is None else vectordb_dir name = name_checker(name) existing_dbs = list_vectordbs(vectordb_dir=vectordbs_dir) if name not in existing_dbs: raise ValueError(f'The vector database "{name}" does not exist.') from ..utils import read_json import pickle db_info_dir = os.path.join(vectordbs_dir, name, 'info.json') db_info = read_json(db_info_dir) db_embeddings_name = db_info.get('embeddings', None) vdb = cls(embeddings, name, vectordbs_dir, text_splitter) data_dir =os.path.join(vdb.db_dir, 'data.pkl') if os.path.exists(data_dir): with open(data_dir, 'rb') as f: vdb._data = pickle.load(f) else: # recovering from old format print('Trying to recover from old format...') old_data_dir = os.path.join(vdb.db_dir, 'index.pkl') if os.path.exists(old_data_dir): with open(old_data_dir, 'rb') as f: data = pickle.load(f) data = list(map(lambda x: Document(index=x.page_content, metadata=x.metadata), data[0]._dict.values())) vdb.add_documents(data, split_text=False) else: raise FileExistsError(f'No raw data has been saved. 
Vector database cannot be recovered.') if db_embeddings_name == embeddings.name: vdb._index = vdb._load_index(os.path.join(vdb.db_dir, vdb._index_filename)) else: print(f'You are using a different embeddings model. Switching from embedding model {db_embeddings_name} to {embeddings.name}.') vdb.add_documents(list(vdb.data.values()), split_text=False) vdb.info['embeddings'] = embeddings.name vdb.save() return vdb def from_texts(embeddings: Type[BaseEmbeddingsToolkit], texts: List[str], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, name: Optional[str] = None, vectordb_dir: Optional[str] = None, split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) ‑> BaseVectorDatabase-
Load the vector database from existing texts.
Args
embeddings:Type[BaseEmbeddingsToolkit]- Embeddings toolkit to use.
texts:List[str]- List of texts to add.
metadata:Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional- Metadata to add along with the texts. Defaults to None.
name:Optional[str], optional- Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
vectordb_dir:Optional[str], optional- Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
split_text:bool, optional- Whether to split the documents with the embeddings toolkit text splitter. Defaults to True.
text_splitter:Optional[Type[BaseTextSplitter]], optional- Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
Returns
BaseVectorDatabase- The initialised vector database.
Expand source code
@classmethod
def from_texts(cls, embeddings: Type[BaseEmbeddingsToolkit], texts: List[str],
               metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, name: Optional[str] = None,
               vectordb_dir: Optional[str] = None, split_text: bool = True,
               text_splitter: Optional[Type[BaseTextSplitter]] = None, **kwargs) -> BaseVectorDatabase:
    """Create a vector database and fill it with the given texts.

    Args:
        embeddings (Type[BaseEmbeddingsToolkit]): Embeddings toolkit to use.
        texts (List[str]): List of texts to add.
        metadata (Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional): Metadata to add along with the texts. Defaults to None.
        name (Optional[str], optional): Name of the vector database. Will be used as the directory base name of the vector database in vectordb_dir. If None is given, the vector database will not be saved. Defaults to None.
        vectordb_dir (Optional[str], optional): Directory where the vector databases live. If None is given, the default_vectordb_dir will be used. Defaults to None.
        split_text (bool, optional): Whether to split the documents with the embeddings toolkit text splitter. Defaults to True.
        text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
        **kwargs: Not used.

    Returns:
        BaseVectorDatabase: The initialised vector database.
    """
    database = cls(embeddings, name, vectordb_dir, text_splitter)
    database.add_texts(
        texts=texts,
        metadata=metadata,
        split_text=split_text,
        text_splitter=text_splitter,
    )
    # Saved explicitly as well, in case an empty list of texts is given.
    database.save()
    return database
Instance variables
var data : Dict[int, Document]-
Dictionary of all the documents in the vector database.
Returns
Dict[int, Document]- Dictionary of all the documents in the vector database.
Expand source code
@property def data(self) -> Dict[int, Document]: """Dictionary of all the documents in the vector database. Returns: Dict[int, Document]: Dictionary of all the documents in the vector database. """ return self._data var db_dir : Optional[str]-
Directory of the vector database.
Returns
Optional[str]- Directory of the vector database.
Expand source code
@property def db_dir(self) -> Optional[str]: """Directory of the vector database. Returns: Optional[str]: Directory of the vector database. """ return self._db_dir var embeddings : BaseEmbeddingsToolkit-
Embeddings toolkit used in the vector database.
Returns
BaseEmbeddingsToolkit- Embeddings toolkit used in the vector database.
Expand source code
@property def embeddings(self) -> BaseEmbeddingsToolkit: """Embeddings toolkit used in the vector database. Returns: BaseEmbeddingsToolkit: Embeddings toolkit used in the vector database. """ return self._embeddings var index : Any-
Index of the vector database.
Returns
Any- Index of the vector database.
Expand source code
@property def index(self) -> Any: """Index of the vector database. Returns: Any: Index of the vector database. """ return self._index var info : Dict[str, Any]-
Information of the vector database.
Returns
Dict[str, Any]- Information of the vector database.
Expand source code
@property def info(self) -> Dict[str, Any]: """Information of the vector database. Returns: Dict[str, Any]: Information of the vector database. """ if not hasattr(self, '_info'): from ..utils import current_time, read_json if self.db_dir is not None: info_dir = os.path.join(self.db_dir, 'info.json') if os.path.exists(info_dir): self._info = read_json(info_dir) else: self._info = dict(embeddings=self.embeddings.name, last_update=current_time()) else: self._info = dict(embeddings=self.embeddings.name, last_update=current_time()) return self._info var name : Optional[str]-
Name of the vector database.
Returns
Optional[str]- Name of the vector database.
Expand source code
@property def name(self) -> Optional[str]: """Name of the vector database. Returns: Optional[str]: Name of the vector database. """ return self._name var size : int-
Number of documents in the vector database.
Returns
int- Number of documents in the vector database.
Expand source code
@property def size(self) -> int: """Number of documents in the vector database. Returns: int: Number of documents in the vector database. """ return len(self.data) var text_splitter : BaseTextSplitter-
Default text splitter for the vector database.
Returns
BaseTextSplitter- Default text splitter for the vector database.
Expand source code
@property
def text_splitter(self) -> BaseTextSplitter:
    """The text splitter this vector database uses by default.

    Returns:
        BaseTextSplitter: Default text splitter for the vector database.
    """
    default_splitter = self._text_splitter
    return default_splitter
Methods
def add_docs_with_vectors(self, vectors: Sequence[Sequence[float]], docs: List[Document]) ‑> None-
Add documents with pre-embedded vectors into the vector database.
Args
vectors:Sequence[Sequence[float]]- Pre-embedded vectors.
docs:List[Document]- List of documents.
Expand source code
def add_docs_with_vectors(self, vectors: Sequence[Sequence[float]], docs: List[Document]) -> None: """Add documents with pre-embedded vectors into the vector database. Args: vectors (Sequence[Sequence[float]]): Pre-embedded vectors. docs (List[Document]): List of documents. """ len_vec = len(vectors) len_doc = len(docs) if len_vec != 0: if len_vec != len_doc: raise ValueError(f'{len_vec} vectors are given but the number of documents given is {len_doc}. Make sure the number of documents match with the number of vectors given.') vectors = np.array(vectors, dtype=np.float32) self._add(vectors, docs) self.save() def add_documents(self, docs: List[Document], split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None) ‑> None-
Add documents into the vector database.
Args
docs:List[Document]- List of documents to split.
split_text:bool, optional- Whether to split the documents with the embeddings toolkit text splitter. Defaults to True.
text_splitter:Optional[Type[BaseTextSplitter]], optional- Text splitter to split the documents. If none given, the embeddings toolkit text splitter will be used. Defaults to None.
Expand source code
def add_documents(self, docs: List[Document], split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None) -> None:
    """Add documents into the vector database.

    The `index` of each (optionally split) document is embedded with `self.embeddings`
    and stored together with the document. Nothing is added or saved when there are no
    documents to add, either on input or after splitting.

    Args:
        docs (List[Document]): List of documents to add.
        split_text (bool, optional): Whether to split the documents with the text splitter before adding them. Defaults to True.
        text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the vector database's default text splitter will be used. Defaults to None.
    """
    if len(docs) == 0:
        return
    text_splitter = self.text_splitter if text_splitter is None else text_splitter
    if split_text:
        docs = text_splitter.split_documents(docs)
        # A splitter may return no chunks at all; embedding an empty batch would hand
        # `_add` a 1-D empty array instead of a (n, dim) matrix, so stop here.
        if len(docs) == 0:
            return
    texts = [doc.index for doc in docs]
    vectors = np.array(self.embeddings.batch_embed(texts), dtype=np.float32)
    self._add(vectors=vectors, docs=docs)
    self.save()
Add texts into the vector database.
Args
texts:List[str]- List of texts to add.
metadata:Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional- Metadata to add along with the texts. Defaults to None.
split_text:bool, optional- Whether to split the documents with the text splitter before adding them. Defaults to True.
text_splitter:Optional[Type[BaseTextSplitter]], optional- Text splitter to split the documents. If none given, the vector database's default text splitter will be used. Defaults to None.
Expand source code
def add_texts(self, texts: List[str], metadata: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, split_text: bool = True, text_splitter: Optional[Type[BaseTextSplitter]] = None) -> None:
    """Add texts into the vector database.

    Each text is wrapped in a `Document` and passed on to `add_documents`. Nothing
    happens when `texts` is empty.

    Args:
        texts (List[str]): List of texts to add.
        metadata (Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], optional): Metadata to add along with the texts. A single dictionary is applied to every text; a list must contain one dictionary per text. Defaults to None.
        split_text (bool, optional): Whether to split the documents with the text splitter before adding them. Defaults to True.
        text_splitter (Optional[Type[BaseTextSplitter]], optional): Text splitter to split the documents. If none given, the vector database's default text splitter will be used. Defaults to None.

    Raises:
        ValueError: If `metadata` is a list whose length differs from the number of texts.
    """
    if len(texts) == 0:
        return
    if metadata is None:
        # One fresh dict per text: `[dict()] * n` would make every document share
        # (and mutate) the same dictionary object.
        metadata = [dict() for _ in texts]
    elif isinstance(metadata, list):
        if len(metadata) != len(texts):
            raise ValueError('Number of texts does not match with number of metadata.')
    else:
        # Shallow-copy the shared metadata so that documents do not alias one dict.
        metadata = [dict(metadata) for _ in texts]
    docs = [Document(index=text, metadata=meta) for text, meta in zip(texts, metadata)]
    self.add_documents(docs=docs, split_text=split_text, text_splitter=text_splitter)
Batch similarity search on multiple queries.
Args
queries:List[str]- List of queries.
top_k:int, optional- Maximum number of results for each query. Defaults to 5.
index_only:bool, optional- Whether to return the list of indexes only. Defaults to True.
batch_size:int, optional- Batch size to perform similarity search. Defaults to 100.
filter_fn:Optional[Callable[[Document], bool]], optional- The filter function to limit the scope of similarity search. Defaults to None.
Returns
List[List[Union[str, Dict[str, Any]]]]- List of list of search results.
Expand source code
def batch_search(self, queries: List[str], top_k: int = 5, index_only: bool = True, batch_size: int = 100, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> List[List[Union[str, Dict[str, Any]]]]:
    """Batch similarity search on multiple queries.

    Keyword arguments are treated as metadata filters: a document is in scope only if
    `filter_fn` (when given) accepts it and every keyword matches its metadata.

    Args:
        queries (List[str]): List of queries.
        top_k (int, optional): Maximum number of results for each query. Defaults to 5.
        index_only (bool, optional): Whether to return the list of indexes only. Defaults to True.
        batch_size (int, optional): Number of queries embedded and searched per batch. Defaults to 100.
        filter_fn (Optional[Callable[[Document], bool]], optional): The filter function to limit the scope of similarity search. Defaults to None.

    Returns:
        List[List[Union[str, Dict[str, Any]]]]: One list of search results per query. Each result is the document index, or a dictionary with the keys "index", "score", "id" and "metadata" when `index_only` is False.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    import gc

    # Filtering the scope of search. The scope is built eagerly in one pass: chaining lazy
    # `filter` objects inside a loop would bind every lambda to the last keyword only.
    ids_scope = None
    if filter_fn or kwargs:
        ids_scope = [
            doc_id for doc_id, doc in self.data.items()
            if (not filter_fn or filter_fn(doc))
            and all(doc.metadata.get(key) == value for key, value in kwargs.items())
        ]

    top_k = min(self.size, top_k) if ids_scope is None else min(self.size, top_k, len(ids_scope))
    if top_k == 0:
        # Independent lists, so that a caller mutating one result does not affect the others.
        return [[] for _ in queries]

    q_num = len(queries)
    if q_num == 0:
        # No batch would be produced and np.concatenate cannot handle an empty list.
        return []
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer.')

    scores = []
    ids = []
    for start in range(0, q_num, batch_size):
        qvecs = self.embeddings.batch_embed(queries[start:start + batch_size])
        batch_scores, batch_ids = self._batch_search_with_scores(vectors=qvecs, k=top_k, ids_scope=ids_scope)
        scores.append(batch_scores)
        ids.append(batch_ids)
        # Release the query embeddings before the next batch is embedded.
        del qvecs
        gc.collect()
    scores = np.concatenate(scores, axis=0)
    ids = np.concatenate(ids, axis=0)

    # np.vectorize maps element-wise and keeps the array layout of `ids`;
    # otypes=[object] keeps strings and dictionaries as plain Python objects.
    if index_only:
        get_indexes = np.vectorize(lambda doc_id: self.data[doc_id].index, otypes=[object])
        return get_indexes(ids).tolist()
    get_results = np.vectorize(
        lambda doc_id, score: dict(
            index=self.data[doc_id].index,
            score=score,
            id=doc_id,
            metadata=self.data[doc_id].metadata,
        ),
        otypes=[object],
    )
    return get_results(ids, scores).tolist()
Clear the entire vector database. Use it with caution.
Expand source code
def clear(self) -> None:
    """Clear the entire vector database. Use it with caution.

    The current index and data are dropped, replaced with an empty index and an
    empty dictionary, and the database is saved.
    """
    import gc

    # Drop the old objects first so they can be collected before the new index is created.
    del self._index, self._data
    gc.collect()

    self._index = self._get_empty_index()
    self._data = {}
    self.save()
Remove records by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.
Args
filter_fn:Optional[Callable[[Document], bool]], optional- The filter function. Defaults to None.
Expand source code
def delete_by_metadata(self, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> None:
    """Remove records by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.

    The matching ids are looked up with `search_by_metadata`, deleted, and the
    database is saved.

    Args:
        filter_fn (Optional[Callable[[Document], bool]], optional): The filter function. Defaults to None.

    Raises:
        ValueError: If neither keyword arguments nor a filter_fn are given.
    """
    if not (kwargs or filter_fn):
        raise ValueError('No keyword arguments or filter_fn are passed. Use the "clear" method to clear the entire database.')
    matched_ids = self.search_by_metadata(ids_only=True, filter_fn=filter_fn, **kwargs)
    self._delete(matched_ids)
    self.save()
Save the vector database.
Expand source code
def save(self) -> None:
    """Save the vector database.

    Does nothing when `db_dir` is None; otherwise the data, the index and the
    info are saved, in that order.
    """
    if self.db_dir is None:
        return
    self._save_data()
    self._save_index()
    self._save_info()
Similarity search on the given query.
Args
query:str- Query for similarity search.
top_k:int, optional- Maximum number of results. Defaults to 5.
index_only:bool, optional- Whether to return the list of indexes only. Defaults to True.
filter_fn:Optional[Callable[[Document], bool]], optional- The filter function to limit the scope of similarity search. Defaults to None.
Returns
List[Union[str, Dict[str, Any]]]- List of search results.
Expand source code
def search(self, query: str, top_k: int = 5, index_only: bool = True, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> List[Union[str, Dict[str, Any]]]:
    """Similarity search on the given query.

    Delegates to `batch_search` with a single-query batch and returns its first
    (only) list of results.

    Args:
        query (str): Query for similarity search.
        top_k (int, optional): Maximum number of results. Defaults to 5.
        index_only (bool, optional): Whether to return the list of indexes only. Defaults to True.
        filter_fn (Optional[Callable[[Document], bool]], optional): The filter function to limit the scope of similarity search. Defaults to None.

    Returns:
        List[Union[str, Dict[str, Any]]]: List of search results.
    """
    per_query_results = self.batch_search(
        queries=[query],
        top_k=top_k,
        index_only=index_only,
        filter_fn=filter_fn,
        **kwargs,
    )
    return per_query_results[0]
Search documents or ids by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.
Args
ids_only:bool, optional- Whether to return a list of ids or a dictionary with the ids as keys and documents as values. Defaults to False.
filter_fn:Optional[Callable[[Document], bool]], optional- The filter function. Defaults to None.
Returns
Union[List[int], Dict[int, Document]]- List of ids or dictionary with the ids as keys and documents as values.
Expand source code
def search_by_metadata(self, ids_only: bool = False, filter_fn: Optional[Callable[[Document], bool]] = None, **kwargs) -> Union[List[int], Dict[int, Document]]:
    """Search documents or ids by metadata. Pass the filters on metadata as keyword arguments or pass a filter_fn.

    A document matches when `filter_fn` (if given) accepts it and every keyword
    argument equals the value stored under the same key in its metadata. With no
    filters at all, every document matches.

    Args:
        ids_only (bool, optional): Whether to return a list of ids or a dictionary with the ids as keys and documents as values. Defaults to False.
        filter_fn (Optional[Callable[[Document], bool]], optional): The filter function. Defaults to None.

    Returns:
        Union[List[int], Dict[int, Document]]: List of ids or dictionary with the ids as keys and documents as values.
    """
    matches = []
    for doc_id, doc in self.data.items():
        if filter_fn and not filter_fn(doc):
            continue
        if not all(doc.metadata.get(key) == value for key, value in kwargs.items()):
            continue
        matches.append((doc_id, doc))
    if ids_only:
        return [doc_id for doc_id, _ in matches]
    return dict(matches)