diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/README.md b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/README.md
index 4e16b020eea44bd8dcaefd8fc501907e1d81ebef..2150a9c4d853f4c94467e6784e64fa37f95ca251 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/README.md
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/README.md
@@ -103,3 +103,31 @@ nodes = parser.get_nodes_from_documents(documents)
 # rerank
 rerank.postprocess_nodes(nodes, query_str=query)
 ```
+
+### Custom HTTP Client
+
+If you need more control over HTTP settings (e.g., timeouts, proxies, retries), you can pass your own `httpx.Client` instance to the `NVIDIARerank` initializer:
+
+```python
+import httpx
+from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
+
+# Create a custom httpx client with a 10-second timeout
+custom_client = httpx.Client(timeout=10.0)
+
+# Pass the custom client to the reranker
+rerank = NVIDIARerank(
+    base_url="http://localhost:1976/v1", http_client=custom_client
+)
+```
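+
+The same pattern covers other `httpx` options. For example (an illustrative sketch using httpx's standard transport-level retries, which re-attempt failed connections rather than failed responses):
+
+```python
+# Retry up to 3 times on connection errors before giving up
+transport = httpx.HTTPTransport(retries=3)
+rerank = NVIDIARerank(
+    base_url="http://localhost:1976/v1",
+    http_client=httpx.Client(transport=transport, timeout=10.0),
+)
+```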
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py
index db4e051b2d983c95b1d102bda8b73377d6c0470b..a1938d49339345e9ab51fd39544fa364b111d305 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py
@@ -1,6 +1,7 @@
 from typing import Any, List, Optional, Generator, Literal
 import os
 from urllib.parse import urlparse, urlunparse
+import httpx
 
 from llama_index.core.bridge.pydantic import Field, PrivateAttr, ConfigDict
 from llama_index.core.callbacks import CBEventType, EventPayload
@@ -11,10 +12,9 @@ from llama_index.core.instrumentation.events.rerank import (
 )
 from llama_index.core.postprocessor.types import BaseNodePostprocessor
 from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle
-import requests
 import warnings
 from llama_index.core.base.llms.generic_utils import get_from_param_or_env
 
 from .utils import (
     RANKING_MODEL_TABLE,
     BASE_URL,
@@ -56,6 +56,7 @@ class NVIDIARerank(BaseNodePostprocessor):
     _mode: str = PrivateAttr("nvidia")
     _is_hosted: bool = PrivateAttr(True)
     base_url: Optional[str] = None
+    _http_client: Optional[httpx.Client] = PrivateAttr(None)
 
     def __init__(
         self,
@@ -63,6 +64,7 @@
         nvidia_api_key: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = os.getenv("NVIDIA_BASE_URL", BASE_URL),
+        http_client: Optional[httpx.Client] = None,
         **kwargs: Any,
     ):
         """
@@ -75,5 +77,6 @@
             nvidia_api_key (str, optional): The NVIDIA API key. Defaults to None.
             api_key (str, optional): The API key. Defaults to None.
             base_url (str, optional): The base URL of the on-premises NIM. Defaults to None.
+            http_client (httpx.Client, optional): Custom HTTP client for making requests.
             truncate (str): "NONE", "END", truncate input text if it exceeds the model's context length.
                 Default is model dependent and is likely to raise an error if an input is too long.
@@ -95,12 +98,11 @@
             "NVIDIA_API_KEY",
             "NO_API_KEY_PROVIDED",
         )
-
         if self._is_hosted:  # hosted on API Catalog (build.nvidia.com)
             if (not self._api_key) or (self._api_key == "NO_API_KEY_PROVIDED"):
                 raise ValueError("An API key is required for hosted NIM.")
         else:  # not hosted
-            self.base_url = self._validate_url(base_url)
+            self.base_url = self._validate_url(self.base_url)
 
         self.model = model
         if not self.model:
@@ -110,10 +112,9 @@
             self.__get_default_model()
 
         if not self.model.startswith("nvdev/"):
-            # allow internal models
-            # TODO: add test case for this
            self._validate_model(self.model)  ## validate model
-        self.base_url = base_url
+
+        self._http_client = http_client
 
     def __get_default_model(self):
        """Set default model."""
@@ -136,24 +137,31 @@
         else:
             self.model = DEFAULT_MODEL
 
+    @property
+    def normalized_base_url(self) -> str:
+        """Return the normalized base URL (without trailing slashes)."""
+        return self.base_url.rstrip("/")
+
+    def _get_headers(self, auth_required: bool = False) -> dict:
+        """Return default headers for HTTP requests.
+
+        If auth_required is True or the client is hosted, includes an Authorization header.
+        """
+        headers = {"Accept": "application/json"}
+        if auth_required or self._is_hosted:
+            headers["Authorization"] = f"Bearer {self._api_key}"
+        return headers
+
     def _get_models(self) -> List[Model]:
-        session = requests.Session()
-        self.base_url = self.base_url.rstrip("/") + "/"
-        if self._is_hosted:
-            _headers = {
-                "Authorization": f"Bearer {self._api_key}",
-                "Accept": "application/json",
-            }
-        else:
-            _headers = {
-                "Accept": "application/json",
-            }
+        client = self.client
+        _headers = self._get_headers(auth_required=self._is_hosted)
         url = (
             "https://integrate.api.nvidia.com/v1/models"
             if self._is_hosted
-            else self.base_url.rstrip("/") + "/models"
+            else self.normalized_base_url + "/models"
         )
-        response = session.get(url, headers=_headers)
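+        # Use the shared httpx client so custom timeout/proxy settings apply to model listing too.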
""" if base_url is not None: parsed = urlparse(base_url) + # Ensure scheme and netloc (domain name) are present + if not (parsed.scheme and parsed.netloc): + expected_format = "Expected format is: http://host:port" + raise ValueError( + f"Invalid base_url format. {expected_format} Got: {base_url}" + ) + + normalized_path = parsed.path.rstrip("/") + if not normalized_path.endswith("/v1"): + warnings.warn( + f"{base_url} does not end in /v1, you may " + "have inference and listing issues" + ) + normalized_path += "/v1" + + base_url = urlunparse( + (parsed.scheme, parsed.netloc, normalized_path, None, None, None) + ) + if base_url is not None: + parsed = urlparse(base_url) + # Ensure scheme and netloc (domain name) are present if not (parsed.scheme and parsed.netloc): expected_format = "Expected format is: http://host:port" @@ -228,6 +283,15 @@ class NVIDIARerank(BaseNodePostprocessor): model = determine_model(model_name) available_model_ids = [model.id for model in self.available_models] + if not model: + if self._is_hosted: + warnings.warn(f"Unable to determine validity of {model_name}") + else: + if model_name not in available_model_ids: + raise ValueError(f"No locally hosted {model_name} was found.") + model = determine_model(model_name) + available_model_ids = [model.id for model in self.available_models] + if not model: if self._is_hosted: warnings.warn(f"Unable to determine validity of {model_name}") @@ -238,16 +302,29 @@ class NVIDIARerank(BaseNodePostprocessor): if model and model.endpoint: self.base_url = model.endpoint + if model and model.endpoint: + self.base_url = model.endpoint + @property def available_models(self) -> List[Model]: """Get available models.""" # all available models are in the map ids = RANKING_MODEL_TABLE.keys() + ids = RANKING_MODEL_TABLE.keys() if not self._is_hosted: return self._get_models() else: return [Model(id=id) for id in ids] + @property + def client(self) -> httpx.Client: + """ + Lazy initialization of the HTTP client. 
+ """ + if self._http_client is None: + self._http_client = httpx.Client() + return self._http_client + @classmethod def class_name(cls) -> str: return "NVIDIARerank" @@ -273,12 +350,8 @@ class NVIDIARerank(BaseNodePostprocessor): if len(nodes) == 0: return [] - session = requests.Session() - - _headers = { - "Authorization": f"Bearer {self._api_key}", - "Accept": "application/json", - } + client = self.client + _headers = self._get_headers(auth_required=True) # TODO: replace with itertools.batched in python 3.12 def batched(ls: list, size: int) -> Generator[List[NodeWithScore], None, None]: @@ -305,7 +378,7 @@ class NVIDIARerank(BaseNodePostprocessor): for n in batch ], } - response = session.post(self.base_url, headers=_headers, json=payloads) + response = client.post(self.base_url, headers=_headers, json=payloads) response.raise_for_status() # expected response format: # { diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml index 4958cb1ac0d7cc4c8e74df3853803d38582e4999..a59369273279524b3dd1cc079ea89d1cb620aab6 100644 --- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml +++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml @@ -30,7 +30,7 @@ license = "MIT" name = "llama-index-postprocessor-nvidia-rerank" packages = [{include = "llama_index/"}] readme = "README.md" -version = "0.4.1" +version = "0.4.2" [tool.poetry.dependencies] python = ">=3.9,<4.0" @@ -56,6 +56,10 @@ types-redis = "4.5.5.0" types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 types-setuptools = "67.1.0.0" +[tool.poetry.group.test-integration.dependencies] +responses = "^0.25.6" +respx = {extras = ["pytest"], version = "^0.22.0"} + [tool.poetry.group.test_integration.dependencies] pytest-httpx = "*" requests-mock = "^1.12.1" diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py index 09ca0a2b0bd0b819e1125a9e72e2317e1ac6ba11..6487cd5bf8895f32e63dcfb7f8365ebbe32e73a3 100644 --- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py +++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py @@ -1,23 +1,17 @@ import os import pytest - +import respx from llama_index.postprocessor.nvidia_rerank import NVIDIARerank as Interface from llama_index.core.schema import NodeWithScore, Document from typing import Any -from requests_mock import Mocker @pytest.fixture() -def mock_local_models(requests_mock: Mocker) -> None: - requests_mock.get( - "https://test_url/v1/models", - json={ - "data": [ - {"id": "model1"}, - ] - }, +def mock_local_models(respx_mock: respx.MockRouter) -> None: + respx_mock.get("https://test_url/v1/models").respond( + json={"data": [{"id": "model1"}]} ) diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py index fa2e9acd1e92c43c2d9675a514f265db887941ff..541202d7d8a4b4a2991ddcc05eb35f6bd58f17d7 100644 --- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py +++ 
+    respx_mock.get("https://test_url/v1/models").respond(
+        json={"data": [{"id": "model1"}]}
     )
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py
index fa2e9acd1e92c43c2d9675a514f265db887941ff..541202d7d8a4b4a2991ddcc05eb35f6bd58f17d7 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py
@@ -1,12 +1,11 @@
 import pytest
 from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
-from requests_mock import Mocker
+import respx
 
 
 @pytest.fixture(autouse=True)
-def mock_local_models(requests_mock: Mocker) -> None:
-    requests_mock.get(
-        "https://test_url/v1/models",
+def mock_local_models(respx_mock: respx.MockRouter) -> None:
+    respx_mock.get("https://test_url/v1/models").respond(
         json={
             "data": [
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_base_url.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_base_url.py
index 5af6922c83f9bb897e4ebcf0f4797531b30629b6..f2108f10bf082033239316346f152499281ac9bd 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_base_url.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_base_url.py
@@ -1,12 +1,13 @@
 from urllib.parse import urlparse, urlunparse
 
 import pytest
-from requests_mock import Mocker
 from llama_index.postprocessor.nvidia_rerank import NVIDIARerank as Interface
+from llama_index.postprocessor.nvidia_rerank.utils import BASE_URL
+import respx
 
 
 @pytest.fixture()
-def mock_v1_local_models2(requests_mock: Mocker, base_url: str) -> None:
+def mock_v1_local_models2(respx_mock: respx.MockRouter, base_url: str) -> None:
     parsed = urlparse(base_url)
     normalized_path = parsed.path.rstrip("/")
     if not normalized_path.endswith("/v1"):
@@ -14,8 +15,8 @@ def mock_v1_local_models2(requests_mock: Mocker, base_url: str) -> None:
     base_url = urlunparse(
         (parsed.scheme, parsed.netloc, normalized_path, None, None, None)
     )
-    requests_mock.get(
-        f"{base_url}/models",
+    # Intercept GET call for retrieving models using httpx.
+    respx_mock.get(f"{base_url}/models").respond(
         json={
             "data": [
                 {
@@ -26,11 +27,11 @@ def mock_v1_local_models2(requests_mock: Mocker, base_url: str) -> None:
                     "root": "model1",
                 },
             ]
-        },
+        }
     )
 
 
-# test case for invalid base_url
+# Updated test for non-hosted URLs that may need normalization.
 @pytest.mark.parametrize(
     "base_url",
     [
@@ -43,9 +44,19 @@
     ],
 )
 def test_base_url_invalid_not_hosted(
     base_url: str, mock_v1_local_models2: None
 ) -> None:
-    Interface(base_url=base_url)
+    parsed = urlparse(base_url)
+    normalized_path = parsed.path.rstrip("/")
+    # Expect a warning if the URL does NOT already end with "/v1"
+    if not normalized_path.endswith("/v1"):
+        with pytest.warns(UserWarning, match="does not end in /v1"):
+            client = Interface(base_url=base_url)
+    else:
+        client = Interface(base_url=base_url)
+    # Assert that the client's base_url is normalized to end with '/v1'
+    assert client.base_url.endswith("/v1")
 
 
+# Updated test for valid non-hosted URL.
 @pytest.mark.parametrize(
@@ -53,17 +64,38 @@
     ],
 )
 def test_base_url_valid_not_hosted(base_url: str, mock_v1_local_models2: None) -> None:
-    with pytest.warns(UserWarning) as record:
-        Interface(base_url=base_url)
-    assert "Default model is set" in str(record[0].message)
+    # The default model warning is expected in non-hosted mode
+    with pytest.warns(UserWarning, match="Default model is set"):
+        client = Interface(base_url=base_url)
+    # Also verify the base_url remains normalized (unchanged in this case)
+    assert client.base_url.endswith("/v1")
 
 
+# Updated test for hosted base URL.
 @pytest.mark.parametrize(
     "base_url",
-    ["https://ai.api.nvidia.com/v1"],
+    [BASE_URL],
 )
 def test_base_url_valid_hosted(base_url: str, mock_v1_local_models2: None) -> None:
-    Interface(base_url=base_url, api_key="BOGUS")
+    client = Interface(base_url=base_url, api_key="BOGUS")
+    assert client._is_hosted
+    # Hosted client should use the provided base_url exactly.
+    assert client.base_url == base_url
+
+
+# Updated test for proxy base URLs.
+@pytest.mark.parametrize(
+    "base_url",
+    [
+        "http://host/path0/path1/path2/v1",
+        "http://host:123/path0/path1/path2/v1",
+    ],
+)
+def test_proxy_base_url(base_url: str, mock_v1_local_models2: None) -> None:
+    client = Interface(api_key="NO_API_KEY_PROVIDED", base_url=base_url)
+    assert not client._is_hosted
+    # Since the URL is already normalized, verify it remains unchanged.
+    assert client.base_url == base_url
 
 
 @pytest.mark.parametrize(
@@ -80,16 +112,3 @@ def test_param_base_url_negative(base_url: str, monkeypatch) -> None:
     with pytest.raises(ValueError) as e:
         Interface(model="model1", base_url=base_url)
     assert "Invalid base_url" in str(e.value)
-
-
-@pytest.mark.parametrize(
-    "base_url",
-    [
-        "http://host/path0/path1/path2/v1",
-        "http://host:123/path0/path1/path2/v1",
-    ],
-)
-def test_proxy_base_url(base_url: str, mock_v1_local_models2: None) -> None:
-    client = Interface(api_key="NO_API_KEY_PROVIDED", base_url=base_url)
-    assert not client._is_hosted
-    assert base_url.startswith(client.base_url)
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py
index 2b6f9521284484163af8929cb33cb72f76bd53a1..15999b6cc2eab9d55f4942d197aad4025d8ffd54 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py
@@ -6,8 +6,7 @@
 from llama_index.core.schema import NodeWithScore, Document
 from llama_index.core.node_parser import SentenceSplitter
 
 import faker
-
-from requests_mock import Mocker
+import respx
 
 @pytest.fixture()
@@ -16,16 +15,9 @@ def known_unknown() -> str:
 
 
 @pytest.fixture()
-def mock_local_models(requests_mock: Mocker, known_unknown) -> None:
-    requests_mock.get(
-        "http://localhost:8000/v1/models",
-        json={
-            "data": [
-                {
-                    "id": known_unknown,
-                },
-            ]
-        },
+def mock_local_models(respx_mock: respx.MockRouter, known_unknown: str) -> None:
+    respx_mock.get("http://localhost:8000/v1/models").respond(
+        json={"data": [{"id": known_unknown}]}
     )
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_truncate.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_truncate.py
index 351e7c7fda013ccd6af1c94f04c048811a6128a7..c5961e102481963ae18b1f7578ff85b3adbb4f93 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_truncate.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_truncate.py
@@ -2,15 +2,15 @@
 from typing import Any, Literal, Optional
 
 import pytest
 import re
-from requests_mock import Mocker
+import respx
+import json
 
 from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
 from llama_index.core.schema import NodeWithScore, Document
 
 
 @pytest.fixture()
-def mock_v1_models(requests_mock: Mocker) -> None:
-    requests_mock.get(
-        "https://integrate.api.nvidia.com/v1/models",
+def mock_v1_models(respx_mock: respx.MockRouter) -> None:
+    respx_mock.get("https://integrate.api.nvidia.com/v1/models").respond(
         json={
             "data": [
                 {
@@ -20,19 +20,20 @@ def mock_v1_models(requests_mock: Mocker) -> None:
                     "owned_by": "OWNER",
                 }
             ]
-        },
+        }
     )
 
 
 @pytest.fixture()
-def mock_v1_ranking(requests_mock: Mocker) -> None:
-    requests_mock.post(
-        re.compile(r"https://ai\.api\.nvidia\.com/v1/.*/reranking"),
+def mock_v1_ranking(respx_mock: respx.MockRouter) -> None:
+    respx_mock.post(
+        re.compile(r"https://ai\.api\.nvidia\.com/v1/.*/reranking")
+    ).respond(
         json={
             "rankings": [
                 {"index": 0, "logit": 4.2},
             ]
-        },
+        }
     )
 
 
@@ -51,7 +52,7 @@
 )
 def test_truncate_passed(
     mock: None,
-    requests_mock: Mocker,
+    respx_mock: respx.MockRouter,
     truncate: Optional[Literal["END", "NONE"]],
 ) -> None:
     client = NVIDIARerank(
@@ -65,8 +66,9 @@
 
     assert len(response) == 1
 
-    assert requests_mock.last_request is not None
-    request_payload = requests_mock.last_request.json()
+    assert len(respx.calls) > 0
+    last_call = list(respx.calls)[-1]
+    request_payload = json.loads(last_call.request.content.decode("utf-8"))
 
     if truncate is None:
         assert "truncate" not in request_payload
     else: