from __future__ import annotations
from typing import Any, Generator, List, Optional, Sequence
from langchain_core.callbacks.manager import Callbacks
from langchain_core.documents import Document
from langchain_core.documents.compressor import BaseDocumentCompressor
from langchain_core.pydantic_v1 import BaseModel, Field, PrivateAttr
from ._common import _MODE_TYPE, _NVIDIAClient
from ._statics import Model
class Ranking(BaseModel):
    """A single entry of a reranking response."""

    # Position of the ranked passage within the list of passages that was
    # submitted in the request.
    index: int
    # Score reported for the passage; stored by compress_documents as the
    # document's "relevance_score".
    logit: float
class NVIDIARerank(BaseDocumentCompressor):
    """
    LangChain Document Compressor that uses the NVIDIA NeMo Retriever Reranking API.

    Documents are scored against a query in batches of at most
    ``max_batch_size`` and at most ``top_n`` documents are returned, each with
    a ``relevance_score`` entry written into its metadata.
    """

    class Config:
        # Re-run field validation whenever an attribute (e.g. ``model``) is
        # reassigned after construction.
        validate_assignment = True

    # Replaced with a real client instance in __init__.
    _client: _NVIDIAClient = PrivateAttr(_NVIDIAClient)

    _default_batch_size: int = 32
    # Model id used as the default for the ``model`` field.
    _default_model: str = "ai-rerank-qa-mistral-4b"
    # Model name sent in the request payload (see _rank).
    _default_model_name: str = "nv-rerank-qa-mistral-4b:1"

    top_n: int = Field(5, ge=0, description="The number of documents to return.")
    model: str = Field(_default_model, description="The model to use for reranking.")
    max_batch_size: int = Field(
        _default_batch_size, ge=1, description="The maximum batch size."
    )

    def __init__(self, **kwargs: Any):
        """
        Create a new NVIDIARerank document compressor.

        Unless you plan to use the "nim" mode, you need to provide an API key. Your
        options are -
         0. Pass the key as the nvidia_api_key parameter.
         1. Pass the key as the api_key parameter.
         2. Set the NVIDIA_API_KEY environment variable, recommended.
        Precedence is in the order listed above.
        """
        super().__init__(**kwargs)
        # An explicit ``nvidia_api_key=None`` must not mask a supplied
        # ``api_key``; fall through to the next option in the precedence list.
        api_key = kwargs.get("nvidia_api_key")
        if api_key is None:
            api_key = kwargs.get("api_key")
        self._client = _NVIDIAClient(model=self.model, api_key=api_key)

    @property
    def available_models(self) -> List[Model]:
        """
        Get a list of available models that work with NVIDIARerank.
        """
        if self._client.curr_mode in ["nim", "open"]:
            # local NIM supports a single model and no /models endpoint
            return [
                Model(
                    id=NVIDIARerank._default_model,
                    model_name=NVIDIARerank._default_model_name,
                    model_type="ranking",
                    client="NVIDIARerank",
                    path="magic",
                )
            ]
        return self._client.get_available_models(
            client=self._client,
            filter=self.__class__.__name__,
        )

    @classmethod
    def get_available_models(
        cls,
        mode: Optional[_MODE_TYPE] = None,
        list_all: bool = False,
        **kwargs: Any,
    ) -> List[Model]:
        """
        Get a list of available models. These models will work with the NVIDIARerank
        interface.

        Use the mode parameter to specify the mode to use. See the docs for mode()
        to understand additional keyword arguments required when setting mode.

        It is possible to get a list of all models, including those that are not
        ranking models, by setting the list_all parameter to True. The list_all
        parameter is ignored for the "nim" and "open" modes, which report a
        single model.
        """
        # A throwaway instance is built so the client is configured for ``mode``.
        reranker = cls(**kwargs).mode(mode=mode, **kwargs)
        if mode in ["nim", "open"]:
            # ignoring list_all because there is one
            return reranker.available_models
        return reranker._client.get_available_models(
            mode=mode,
            list_all=list_all,
            client=reranker._client,
            filter=cls.__name__,
            **kwargs,
        )

    def mode(
        self,
        mode: Optional[_MODE_TYPE] = "nvidia",
        base_url: Optional[str] = None,
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs: Any,
    ) -> NVIDIARerank:
        """
        Change the mode.

        There are two modes, "nvidia" and "nim". The "nvidia" mode is the default mode
        and is used to interact with hosted NVIDIA AI endpoints. The "nim" mode is
        used to interact with NVIDIA NIM endpoints, which are typically hosted
        on-premises.

        For the "nvidia" mode, the "api_key" parameter is available to specify your
        API key. If not specified, the NVIDIA_API_KEY environment variable will be used.

        For the "nim" mode, set base_url to the url of your NVIDIA NIM endpoint,
        for instance "https://localhost:9999/v1". If base_url is not given,
        "http://localhost:1976/v1" is used. Additionally, the "model" parameter
        must be set to the name of the model inside the NIM.

        Returns:
            This instance, with its client and ``model`` field updated, so calls
            can be chained.
        """
        # set a default base_url for nim mode
        if not base_url and mode == "nim":
            base_url = "http://localhost:1976/v1"
        self._client = self._client.mode(
            mode=mode,
            base_url=base_url,
            model=model,
            api_key=api_key,
            infer_path="{base_url}/ranking",
            **kwargs,
        )
        # Keep the public field in sync with whatever model the client settled on.
        self.model = self._client.model
        return self

    # todo: batching when len(documents) > endpoint's max batch size
    def _rank(self, documents: List[str], query: str) -> List[Ranking]:
        """
        Score ``documents`` against ``query`` with a single request.

        Args:
            documents: Passage texts to rank.
            query: The query text to rank the passages against.

        Returns:
            The first ``top_n`` rankings of the response, in response order.
        """
        response = self._client.client.get_req(
            model_name=self.model,
            payload={
                # NOTE(review): the payload always names the default model,
                # regardless of self.model -- confirm this is intended.
                "model": NVIDIARerank._default_model_name,
                "query": {"text": query},
                "passages": [{"text": passage} for passage in documents],
            },
            endpoint="infer",
        )
        if response.status_code != 200:
            response.raise_for_status()
        # todo: handle errors
        rankings = response.json()["rankings"]
        # todo: callback support
        # presumably the service returns rankings best-first, so truncating
        # keeps the top_n most relevant entries -- TODO confirm
        return [Ranking(**ranking) for ranking in rankings[: self.top_n]]

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[Callbacks] = None,
    ) -> Sequence[Document]:
        """
        Compress documents using the NVIDIA NeMo Retriever Reranking microservice API.

        The metadata of every ranked input document is updated in place with a
        "relevance_score" entry.

        Args:
            documents: A sequence of documents to compress.
            query: The query to use for compressing the documents.
            callbacks: Callbacks to run during the compression process.
                Currently unused.

        Returns:
            A sequence of compressed documents.

        Raises:
            IndexError: If the service returns a ranking whose index does not
                refer to a document in the submitted batch.
        """
        if not documents or self.top_n < 1:
            return []

        def batch(ls: list, size: int) -> Generator[List[Document], None, None]:
            for i in range(0, len(ls), size):
                yield ls[i : i + size]

        doc_list = list(documents)
        results = []
        for doc_batch in batch(doc_list, self.max_batch_size):
            rankings = self._rank(
                query=query, documents=[d.page_content for d in doc_batch]
            )
            for ranking in rankings:
                # A negative index would silently wrap around and select the
                # wrong document, so reject anything outside the batch.
                if not 0 <= ranking.index < len(doc_batch):
                    raise IndexError(
                        "invalid response from server: ranking index "
                        f"{ranking.index} out of range for batch of "
                        f"{len(doc_batch)} documents"
                    )
                doc = doc_batch[ranking.index]
                doc.metadata["relevance_score"] = ranking.logit
                results.append(doc)

        # if we batched, we need to sort the results
        if len(doc_list) > self.max_batch_size:
            results.sort(key=lambda x: x.metadata["relevance_score"], reverse=True)

        return results[: self.top_n]