From 347f122df39ad0fefb830c476c31a721b48ff996 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee <matt@cs.wisc.edu>
Date: Sat, 18 May 2024 12:06:36 -0400
Subject: [PATCH] Deprecate mode() in favor of __init__(base_url=...) for
 NVIDIA, NVIDIAEmbedding, NVIDIARerank (#13572)

* deprecate NVIDIA().mode(...), use NVIDIA(...) instead

* bump version to 0.1.3

* deprecate NVIDIAEmbedding().mode(...), use NVIDIAEmbedding(...) instead

* bump version to 0.1.3

* deprecate NVIDIARerank().mode(...), use NVIDIARerank(...) instead

* bump version to 0.1.3
---
 .../llama_index/embeddings/nvidia/base.py     | 82 +++++++++++----
 .../pyproject.toml                            |  2 +-
 .../tests/conftest.py                         | 34 +++----
 .../tests/test_api_key.py                     | 72 +++++++-------
 .../tests/test_available_models.py            |  4 +-
 .../tests/test_integration.py                 |  2 +-
 .../tests/test_mode_switch.py                 | 99 ++++++++++++-------
 .../llama_index/llms/nvidia/base.py           | 79 +++++++++------
 .../llama-index-llms-nvidia/pyproject.toml    |  2 +-
 .../llama-index-llms-nvidia/tests/conftest.py |  4 +-
 .../tests/test_additional_kwargs.py           |  6 +-
 .../tests/test_api_key.py                     | 16 ++-
 .../tests/test_available_models.py            |  2 +-
 .../tests/test_integration.py                 | 20 ++--
 .../tests/test_mode_switch.py                 | 67 ++++++++++---
 .../postprocessor/nvidia_rerank/base.py       | 88 ++++++++++++-----
 .../pyproject.toml                            |  2 +-
 .../tests/conftest.py                         | 32 +++---
 .../tests/test_api_key.py                     | 88 +++++++++--------
 .../tests/test_available_models.py            |  2 +-
 .../tests/test_mode_switch.py                 | 87 +++++++++++-----
 .../tests/test_postprocessor_nvidia_rerank.py | 38 +++----
 22 files changed, 522 insertions(+), 306 deletions(-)

diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/llama_index/embeddings/nvidia/base.py b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/llama_index/embeddings/nvidia/base.py
index cdf2cc7dc9..7d0c99072b 100644
--- a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/llama_index/embeddings/nvidia/base.py
+++ b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/llama_index/embeddings/nvidia/base.py
@@ -1,6 +1,8 @@
 """NVIDIA embeddings file."""
 
 from typing import Any, List, Literal, Optional
+import warnings
+from deprecated import deprecated
 
 from llama_index.core.base.embeddings.base import (
     DEFAULT_EMBED_BATCH_SIZE,
@@ -12,14 +14,16 @@ from llama_index.core.base.llms.generic_utils import get_from_param_or_env
 
 from openai import OpenAI, AsyncOpenAI
 
-BASE_RETRIEVAL_URL = "https://ai.api.nvidia.com/v1/retrieval/nvidia"
+BASE_URL = "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
 DEFAULT_MODEL = "NV-Embed-QA"
 
 MODEL_ENDPOINT_MAP = {
-    DEFAULT_MODEL: BASE_RETRIEVAL_URL,
+    DEFAULT_MODEL: BASE_URL,
     "snowflake/arctic-embed-l": "https://ai.api.nvidia.com/v1/retrieval/snowflake/arctic-embed-l",
 }
 
+KNOWN_URLS = list(MODEL_ENDPOINT_MAP.values())
+
 
 class Model(BaseModel):
     id: str
@@ -55,6 +59,7 @@ class NVIDIAEmbedding(BaseEmbedding):
     _client: Any = PrivateAttr()
     _aclient: Any = PrivateAttr()
     _mode: str = PrivateAttr("nvidia")
+    _is_hosted: bool = PrivateAttr(True)
 
     def __init__(
         self,
@@ -63,22 +68,62 @@ class NVIDIAEmbedding(BaseEmbedding):
         max_retries: Optional[int] = 5,
         nvidia_api_key: Optional[str] = None,
         api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
         embed_batch_size: int = DEFAULT_EMBED_BATCH_SIZE,  # This could default to 50
         callback_manager: Optional[CallbackManager] = None,
         **kwargs: Any,
     ):
+        """
+        Construct an Embedding interface for NVIDIA NIM.
+
+        This constructor initializes an instance of the NVIDIAEmbedding class, which provides
+        an interface for embedding text using NVIDIA's NIM service.
+
+        Parameters:
+        - model (str, optional): The name of the model to use for embeddings.
+        - timeout (float, optional): The timeout for requests to the NIM service, in seconds. Defaults to 120.
+        - max_retries (int, optional): The maximum number of retries for requests to the NIM service. Defaults to 5.
+        - nvidia_api_key (str, optional): The API key for the NIM service. This is required if using a hosted NIM.
+        - api_key (str, optional): An alternative parameter for providing the API key.
+        - base_url (str, optional): The base URL for the NIM service. If not provided, the service will default to a hosted NIM.
+        - **kwargs: Additional keyword arguments.
+
+        API Keys:
+        - The recommended way to provide the API key is through the `NVIDIA_API_KEY` environment variable.
+
+        Note:
+        - Switch from a hosted NIM (default) to an on-premises NIM using the `base_url` parameter. An API key is required for hosted NIM.
+        """
+        super().__init__(
+            model=model,
+            embed_batch_size=embed_batch_size,
+            callback_manager=callback_manager,
+            **kwargs,
+        )
+
         if embed_batch_size > 259:
             raise ValueError("The batch size should not be larger than 259.")
 
+        if base_url is None:
+            # TODO: we should not assume unknown models are at the base url, but
+            #       we cannot error out here because
+            #          NVIDIAEmbedding(model="special").mode("nim", base_url=...)
+            #       is valid usage
+            base_url = MODEL_ENDPOINT_MAP.get(model, BASE_URL)
+
         api_key = get_from_param_or_env(
-            "api_key", nvidia_api_key or api_key, "NVIDIA_API_KEY", "none"
+            "api_key",
+            nvidia_api_key or api_key,
+            "NVIDIA_API_KEY",
+            "NO_API_KEY_PROVIDED",
         )
 
-        # TODO: we should not assume unknown models are at the base url, but
-        #       we cannot error out here because
-        #          NVIDIAEmbedding(model="special").mode("nim", base_url=...)
-        #       is valid usage
-        base_url = MODEL_ENDPOINT_MAP.get(model, BASE_RETRIEVAL_URL)
+        self._is_hosted = base_url in KNOWN_URLS
+
+        if self._is_hosted and api_key == "NO_API_KEY_PROVIDED":
+            warnings.warn(
+                "An API key is required for hosted NIM. This will become an error in 0.2.0."
+            )
 
         self._client = OpenAI(
             api_key=api_key,
@@ -96,18 +141,11 @@ class NVIDIAEmbedding(BaseEmbedding):
         )
         self._aclient._custom_headers = {"User-Agent": "llama-index-embeddings-nvidia"}
 
-        super().__init__(
-            model=model,
-            embed_batch_size=embed_batch_size,
-            callback_manager=callback_manager,
-            **kwargs,
-        )
-
     @property
     def available_models(self) -> List[Model]:
         """Get available models."""
         ids = MODEL_ENDPOINT_MAP.keys()
-        if self._mode == "nim":
+        if not self._is_hosted:
             ids = [model.id for model in self._client.models.list()]
         return [Model(id=id) for id in ids]
 
@@ -115,6 +153,10 @@ class NVIDIAEmbedding(BaseEmbedding):
     def class_name(cls) -> str:
         return "NVIDIAEmbedding"
 
+    @deprecated(
+        version="0.1.2",
+        reason="Will be removed in 0.2. Construct with `base_url` instead.",
+    )
     def mode(
         self,
         mode: Optional[Literal["nvidia", "nim"]] = "nvidia",
@@ -123,14 +165,20 @@ class NVIDIAEmbedding(BaseEmbedding):
         model: Optional[str] = None,
         api_key: Optional[str] = None,
     ) -> "NVIDIAEmbedding":
+        """
+        Deprecated: use NVIDIAEmbedding(base_url="...") instead.
+        """
         if mode == "nim":
             if not base_url:
                 raise ValueError("base_url is required for nim mode")
+        if mode == "nvidia":
+            api_key = get_from_param_or_env("api_key", api_key, "NVIDIA_API_KEY")
         if not base_url:
             # TODO: we should not assume unknown models are at the base url
-            base_url = MODEL_ENDPOINT_MAP.get(model or self.model, BASE_RETRIEVAL_URL)
+            base_url = MODEL_ENDPOINT_MAP.get(model or self.model, BASE_URL)
 
         self._mode = mode
+        self._is_hosted = base_url in KNOWN_URLS
         if base_url:
             self._client.base_url = base_url
             self._aclient.base_url = base_url
diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/pyproject.toml b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/pyproject.toml
index 7256f262c5..18eb13a9eb 100644
--- a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/pyproject.toml
+++ b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/pyproject.toml
@@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-embeddings-nvidia"
 readme = "README.md"
-version = "0.1.2"
+version = "0.1.3"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/conftest.py b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/conftest.py
index 6a5fb78d6a..fdbc0ab0aa 100644
--- a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/conftest.py
+++ b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/conftest.py
@@ -1,12 +1,25 @@
 import pytest
 import os
 
-from llama_index.embeddings.nvidia import NVIDIAEmbedding
+from llama_index.embeddings.nvidia import NVIDIAEmbedding as Interface
 from llama_index.embeddings.nvidia.base import DEFAULT_MODEL
 
 from typing import Generator
 
-from contextlib import contextmanager
+
+# This fixture masks the NVIDIA_API_KEY environment variable for the duration of a
+# test and restores it afterwards. It yields the value the variable held before it
+# was masked (None if it was unset) so that the test can use it.
+@pytest.fixture()
+def masked_env_var() -> Generator[str, None, None]:
+    var = "NVIDIA_API_KEY"
+    try:
+        if val := os.environ.get(var, None):
+            del os.environ[var]
+        yield val
+    finally:
+        if val:
+            os.environ[var] = val
 
 
 def pytest_collection_modifyitems(config, items):
@@ -42,7 +55,7 @@ def pytest_addoption(parser: pytest.Parser) -> None:
 def get_mode(config: pytest.Config) -> dict:
     nim_endpoint = config.getoption("--nim-endpoint")
     if nim_endpoint:
-        return {"mode": "nim", "base_url": nim_endpoint}
+        return {"base_url": nim_endpoint}
     return {}
 
 
@@ -54,23 +67,10 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
         if model := metafunc.config.getoption("--model-id"):
             models = [model]
         elif metafunc.config.getoption("--all-models"):
-            models = [
-                model.id for model in NVIDIAEmbedding().mode(**mode).available_models
-            ]
+            models = [model.id for model in Interface(**mode).available_models]
         metafunc.parametrize("model", models, ids=models)
 
 
 @pytest.fixture()
 def mode(request: pytest.FixtureRequest) -> dict:
     return get_mode(request.config)
-
-
-@contextmanager
-def no_env_var(var: str) -> Generator[None, None, None]:
-    try:
-        if val := os.environ.get(var, None):
-            del os.environ[var]
-        yield
-    finally:
-        if val:
-            os.environ[var] = val
diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_api_key.py b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_api_key.py
index 86278a7368..3b5b8ae3db 100644
--- a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_api_key.py
+++ b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_api_key.py
@@ -2,63 +2,63 @@ import os
 
 import pytest
 
-from llama_index.embeddings.nvidia import NVIDIAEmbedding
+from llama_index.embeddings.nvidia import NVIDIAEmbedding as Interface
 
 from typing import Any
-from .conftest import no_env_var
 
 
 def get_api_key(instance: Any) -> str:
     return instance._client.api_key
 
 
-def test_create_without_api_key() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        NVIDIAEmbedding()
+def test_create_default_url_without_api_key(masked_env_var: str) -> None:
+    with pytest.warns(UserWarning):
+        Interface()
+
+
+def test_create_unknown_url_without_api_key(masked_env_var: str) -> None:
+    Interface(base_url="https://test_url/v1")
 
 
 @pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
-def test_create_with_api_key(param: str) -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        instance = NVIDIAEmbedding(**{param: "just testing no failure"})
-        assert get_api_key(instance) == "just testing no failure"
+def test_create_with_api_key(param: str, masked_env_var: str) -> None:
+    instance = Interface(**{param: "just testing no failure"})
+    assert get_api_key(instance) == "just testing no failure"
 
 
-def test_api_key_priority() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
+def test_api_key_priority(masked_env_var: str) -> None:
+    try:
         os.environ["NVIDIA_API_KEY"] = "ENV"
-        assert get_api_key(NVIDIAEmbedding()) == "ENV"
-        assert get_api_key(NVIDIAEmbedding(nvidia_api_key="PARAM")) == "PARAM"
-        assert get_api_key(NVIDIAEmbedding(api_key="PARAM")) == "PARAM"
-        assert (
-            get_api_key(NVIDIAEmbedding(api_key="LOW", nvidia_api_key="HIGH")) == "HIGH"
-        )
+        assert get_api_key(Interface()) == "ENV"
+        assert get_api_key(Interface(nvidia_api_key="PARAM")) == "PARAM"
+        assert get_api_key(Interface(api_key="PARAM")) == "PARAM"
+        assert get_api_key(Interface(api_key="LOW", nvidia_api_key="HIGH")) == "HIGH"
+    finally:
+        # we must clean up environ or it may impact other tests
+        del os.environ["NVIDIA_API_KEY"]
 
 
 @pytest.mark.integration()
-def test_missing_api_key_error() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        client = NVIDIAEmbedding()
-        with pytest.raises(Exception) as exc_info:
-            client.get_query_embedding("Hello, world!")
-        message = str(exc_info.value)
-        assert "401" in message
+def test_missing_api_key_error(masked_env_var: str) -> None:
+    with pytest.warns(UserWarning):
+        client = Interface()
+    with pytest.raises(Exception) as exc_info:
+        client.get_query_embedding("Hello, world!")
+    message = str(exc_info.value)
+    assert "401" in message
 
 
 @pytest.mark.integration()
-def test_bogus_api_key_error() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        client = NVIDIAEmbedding(nvidia_api_key="BOGUS")
-        with pytest.raises(Exception) as exc_info:
-            client.get_query_embedding("Hello, world!")
-        message = str(exc_info.value)
-        assert "401" in message
+def test_bogus_api_key_error(masked_env_var: str) -> None:
+    client = Interface(nvidia_api_key="BOGUS")
+    with pytest.raises(Exception) as exc_info:
+        client.get_query_embedding("Hello, world!")
+    message = str(exc_info.value)
+    assert "401" in message
 
 
 @pytest.mark.integration()
 @pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
-def test_api_key(param: str, model: str, mode: dict) -> None:
-    api_key = os.environ.get("NVIDIA_API_KEY")
-    with no_env_var("NVIDIA_API_KEY"):
-        client = NVIDIAEmbedding(**{"model": model, param: api_key}).mode(**mode)
-        assert client.get_query_embedding("Hello, world!")
+def test_api_key(model: str, mode: dict, param: str, masked_env_var: str) -> None:
+    client = Interface(model=model, **{**mode, **{param: masked_env_var}})
+    client.get_query_embedding("Hello, world!")
diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_available_models.py b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_available_models.py
index 3b5412b9e6..bdd95fe041 100644
--- a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_available_models.py
+++ b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_available_models.py
@@ -1,11 +1,11 @@
 import pytest
 
-from llama_index.embeddings.nvidia import NVIDIAEmbedding
+from llama_index.embeddings.nvidia import NVIDIAEmbedding as Interface
 
 
 @pytest.mark.integration()
 def test_available_models(mode: dict) -> None:
-    models = NVIDIAEmbedding().mode(**mode).available_models
+    models = Interface(**mode).available_models
     assert models
     assert isinstance(models, list)
     assert all(isinstance(model.id, str) for model in models)
diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_integration.py b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_integration.py
index 146a017bf8..30873c1bae 100644
--- a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_integration.py
+++ b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_integration.py
@@ -5,7 +5,7 @@ from llama_index.embeddings.nvidia import NVIDIAEmbedding
 
 @pytest.mark.integration()
 def test_basic(model: str, mode: dict) -> None:
-    client = NVIDIAEmbedding(model=model).mode(**mode)
+    client = NVIDIAEmbedding(model=model, **mode)
     response = client.get_query_embedding("Hello, world!")
     assert isinstance(response, list)
     assert len(response) > 0
diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_mode_switch.py b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_mode_switch.py
index 14d2b0ed09..128f8e7a4e 100644
--- a/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_mode_switch.py
+++ b/llama-index-integrations/embeddings/llama-index-embeddings-nvidia/tests/test_mode_switch.py
@@ -1,49 +1,80 @@
 import pytest
 
-from llama_index.embeddings.nvidia import NVIDIAEmbedding
+from llama_index.embeddings.nvidia import NVIDIAEmbedding as Interface
+from llama_index.embeddings.nvidia.base import BASE_URL, KNOWN_URLS
 
-from .conftest import no_env_var
 
-# we don't test this because we do not want to force users to have an API key
-#  NVIDIAEmbedding().mode("nim", base_url=...) must work without an API key
-# def test_mode_switch_nvidia_throws_without_key():
-#     emb = NVIDIAEmbedding()
-#     with pytest.raises(ValueError):
-#         emb.mode("nvidia")
+def test_mode_switch_throws_without_key_deprecated(masked_env_var: str):
+    x = Interface()
+    with pytest.raises(ValueError):
+        with pytest.warns(DeprecationWarning):
+            x.mode("nvidia")
 
 
-def test_mode_switch_nvidia_with_key():
-    with no_env_var("NVIDIA_API_KEY"):
-        NVIDIAEmbedding().mode("nvidia", api_key="test")
+def test_mode_switch_with_key_deprecated(masked_env_var: str):
+    with pytest.warns(DeprecationWarning):
+        Interface().mode("nvidia", api_key="test")
 
 
-def test_mode_switch_nim_throws_without_url():
-    emb = NVIDIAEmbedding()
+def test_mode_switch_nim_throws_without_url_deprecated():
+    instance = Interface()
     with pytest.raises(ValueError):
-        emb.mode("nim")
+        with pytest.warns(DeprecationWarning):
+            instance.mode("nim")
+
+
+def test_mode_switch_nim_with_url_deprecated():
+    with pytest.warns(DeprecationWarning):
+        Interface().mode("nim", base_url="test")
+
+
+def test_mode_switch_param_setting_deprecated():
+    instance = Interface(model="dummy")
+
+    with pytest.warns(DeprecationWarning):
+        instance1 = instance.mode("nim", base_url="https://test_url/v1/")
+    assert instance1.model == "dummy"
+    assert str(instance1._client.base_url) == "https://test_url/v1/"
+
+    with pytest.warns(DeprecationWarning):
+        instance2 = instance1.mode("nvidia", api_key="test", model="dummy-2")
+    assert instance2.model == "dummy-2"
+    assert str(instance2._client.base_url) == BASE_URL
+    assert instance2._client.api_key == "test"
+
+
+UNKNOWN_URLS = [
+    "https://test_url/v1",
+    "https://test_url/v1/",
+    "https://test_url/.../v1",
+    "http://test_url/v1",
+    "http://test_url/v1/",
+    "http://test_url/.../v1/",
+]
+
+
+@pytest.mark.parametrize("base_url", UNKNOWN_URLS)
+def test_mode_switch_unknown_base_url_without_key(masked_env_var: str, base_url: str):
+    Interface(base_url=base_url)
 
 
-def test_mode_switch_nim_with_url():
-    NVIDIAEmbedding().mode("nim", base_url="test")
+@pytest.mark.parametrize("base_url", UNKNOWN_URLS)
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_mode_switch_unknown_base_url_with_key(
+    masked_env_var: str, param: str, base_url: str
+):
+    Interface(base_url=base_url, **{param: "test"})
 
 
-def test_mode_switch_param_setting():
-    emb = NVIDIAEmbedding(model="dummy")
+@pytest.mark.parametrize("base_url", KNOWN_URLS)
+def test_mode_switch_known_base_url_without_key(masked_env_var: str, base_url: str):
+    with pytest.warns(UserWarning):
+        Interface(base_url=base_url)
 
-    nim_emb = emb.mode("nim", base_url="https://test_url/v1/")
-    assert nim_emb.model == "dummy"
-    assert str(nim_emb._client.base_url) == "https://test_url/v1/"
-    assert str(nim_emb._aclient.base_url) == "https://test_url/v1/"
 
-    cat_emb = nim_emb.mode("nvidia", api_key="test", model="dummy-2")
-    assert cat_emb.model == "dummy-2"
-    assert (
-        str(cat_emb._client.base_url)
-        == "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
-    )
-    assert (
-        str(cat_emb._aclient.base_url)
-        == "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
-    )
-    assert cat_emb._client.api_key == "test"
-    assert cat_emb._aclient.api_key == "test"
+@pytest.mark.parametrize("base_url", KNOWN_URLS)
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_mode_switch_known_base_url_with_key(
+    masked_env_var: str, base_url: str, param: str
+):
+    Interface(base_url=base_url, **{param: "test"})
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py
index 80b32393c1..2d761043ea 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py
@@ -4,6 +4,8 @@ from typing import (
     List,
     Literal,
 )
+from deprecated import deprecated
+import warnings
 
 from llama_index.core.bridge.pydantic import PrivateAttr, BaseModel
 from llama_index.core.base.llms.generic_utils import (
@@ -16,6 +18,11 @@ from llama_index.llms.openai_like import OpenAILike
 DEFAULT_MODEL = "meta/llama3-8b-instruct"
 BASE_URL = "https://integrate.api.nvidia.com/v1/"
 
+KNOWN_URLS = [
+    BASE_URL,
+    "https://integrate.api.nvidia.com/v1",
+]
+
 
 class Model(BaseModel):
     id: str
@@ -24,6 +31,7 @@ class Model(BaseModel):
 class NVIDIA(OpenAILike):
     """NVIDIA's API Catalog Connector."""
 
+    _is_hosted: bool = PrivateAttr(True)
     _mode: str = PrivateAttr("nvidia")
 
     def __init__(
@@ -31,9 +39,30 @@ class NVIDIA(OpenAILike):
         model: str = DEFAULT_MODEL,
         nvidia_api_key: Optional[str] = None,
         api_key: Optional[str] = None,
+        base_url: Optional[str] = BASE_URL,
         max_tokens: Optional[int] = 1024,
         **kwargs: Any,
     ) -> None:
+        """
+        Initialize an instance of the NVIDIA class.
+
+        This class provides an interface to the NVIDIA NIM. By default, it connects to a hosted NIM,
+        but you can switch to an on-premises NIM by providing a `base_url`.
+
+        Args:
+            model (str, optional): The model to use for the NIM.
+            nvidia_api_key (str, optional): The API key for the NVIDIA NIM. Defaults to None.
+            api_key (str, optional): An alternative parameter for providing the API key. Defaults to None.
+            base_url (str, optional): The base URL for the NIM. Use this to switch to an on-premises NIM.
+            max_tokens (int, optional): The maximum number of tokens to generate. Defaults to 1024.
+            **kwargs: Additional keyword arguments.
+
+        API Keys:
+        - The recommended way to provide the API key is through the `NVIDIA_API_KEY` environment variable.
+
+        Warns:
+            UserWarning: If an API key is not provided for a hosted NIM, a warning is issued. This will become an error in version 0.2.0.
+        """
         api_key = get_from_param_or_env(
             "api_key",
             nvidia_api_key or api_key,
@@ -41,10 +70,17 @@ class NVIDIA(OpenAILike):
             "NO_API_KEY_PROVIDED",
         )
 
+        self._is_hosted = base_url in KNOWN_URLS
+
+        if self._is_hosted and api_key == "NO_API_KEY_PROVIDED":
+            warnings.warn(
+                "An API key is required for the hosted NIM. This will become an error in 0.2.0.",
+            )
+
         super().__init__(
             model=model,
             api_key=api_key,
-            api_base=BASE_URL,
+            api_base=base_url,
             max_tokens=max_tokens,
             is_chat_model=True,
             default_headers={"User-Agent": "llama-index-llms-nvidia"},
@@ -53,24 +89,24 @@ class NVIDIA(OpenAILike):
 
     @property
     def available_models(self) -> List[Model]:
-        exclude = {
-            "mistralai/mixtral-8x22b-v0.1",  # not a /chat/completion endpoint
-        }
-        # do not exclude models in nim mode. the nim administrator has control
-        # over the model name and may deploy an excluded name on the nim's
-        # /chat/completion endpoint.
-        if self._mode == "nim":
-            exclude = set()
-        return [
-            model
-            for model in self._get_client().models.list().data
-            if model.id not in exclude
-        ]
+        models = self._get_client().models.list().data
+        # only exclude models in hosted mode. in non-hosted mode, the administrator has control
+        # over the model name and may deploy an excluded name that will work.
+        if self._is_hosted:
+            exclude = {
+                "mistralai/mixtral-8x22b-v0.1",  # not a /chat/completion endpoint
+            }
+            models = [model for model in models if model.id not in exclude]
+        return models
 
     @classmethod
     def class_name(cls) -> str:
         return "NVIDIA"
 
+    @deprecated(
+        version="0.1.3",
+        reason="Will be removed in 0.2. Construct with `base_url` instead.",
+    )
     def mode(
         self,
         mode: Optional[Literal["nvidia", "nim"]] = "nvidia",
@@ -80,20 +116,7 @@ class NVIDIA(OpenAILike):
         api_key: Optional[str] = None,
     ) -> "NVIDIA":
         """
-        Change the mode.
-
-        There are two modes, "nvidia" and "nim". The "nvidia" mode is the default
-        mode and is used to interact with hosted NIMs. The "nim" mode is used to
-        interact with NVIDIA NIM endpoints, which are typically hosted on-premises.
-
-        For the "nvidia" mode, the "api_key" parameter is available to specify
-        your API key. If not specified, the NVIDIA_API_KEY environment variable
-        will be used.
-
-        For the "nim" mode, the "base_url" parameter is required and the "model"
-        parameter may be necessary. Set base_url to the url of your local NIM
-        endpoint. For instance, "https://localhost:9999/v1". Additionally, the
-        "model" parameter must be set to the name of the model inside the NIM.
+        Deprecated: use NVIDIA(base_url="...") instead.
         """
         if mode == "nim":
             if not base_url:
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml
index 9321b6536d..edee1b459c 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml
@@ -30,7 +30,7 @@ license = "MIT"
 name = "llama-index-llms-nvidia"
 packages = [{include = "llama_index/"}]
 readme = "README.md"
-version = "0.1.2"
+version = "0.1.3"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/conftest.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/conftest.py
index 2378be82a5..20f315ff33 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/conftest.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/conftest.py
@@ -55,7 +55,7 @@ def pytest_addoption(parser: pytest.Parser) -> None:
 def get_mode(config: pytest.Config) -> dict:
     nim_endpoint = config.getoption("--nim-endpoint")
     if nim_endpoint:
-        return {"mode": "nim", "base_url": nim_endpoint}
+        return {"base_url": nim_endpoint}
     return {}
 
 
@@ -67,7 +67,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
         if model := metafunc.config.getoption("--model-id"):
             models = [model]
         elif metafunc.config.getoption("--all-models"):
-            models = [model.id for model in NVIDIA().mode(**mode).available_models]
+            models = [model.id for model in NVIDIA(**mode).available_models]
         metafunc.parametrize("chat_model", models, ids=models)
 
 
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_additional_kwargs.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_additional_kwargs.py
index d6b82deefd..2536f7cf73 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_additional_kwargs.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_additional_kwargs.py
@@ -5,7 +5,7 @@ from llama_index.llms.nvidia import NVIDIA
 
 @pytest.mark.integration()
 def test_additional_kwargs_success(chat_model: str, mode: dict) -> None:
-    client = NVIDIA(chat_model).mode(**mode)
+    client = NVIDIA(chat_model, **mode)
     assert client.complete(
         "Hello, world!",
         stop=["cat", "Cats"],
@@ -17,7 +17,7 @@ def test_additional_kwargs_success(chat_model: str, mode: dict) -> None:
 
 @pytest.mark.integration()
 def test_additional_kwargs_wrong_dtype(chat_model: str, mode: dict) -> None:
-    client = NVIDIA(chat_model).mode(**mode)
+    client = NVIDIA(chat_model, **mode)
     with pytest.raises(Exception) as exc_info:
         client.complete(
             "Hello, world!",
@@ -29,7 +29,7 @@ def test_additional_kwargs_wrong_dtype(chat_model: str, mode: dict) -> None:
 
 @pytest.mark.integration()
 def test_additional_kwargs_wrong_dtype(chat_model: str, mode: dict) -> None:
-    client = NVIDIA(chat_model).mode(**mode)
+    client = NVIDIA(chat_model, **mode)
     with pytest.raises(Exception) as exc_info:
         client.complete(
             "Hello, world!",
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_api_key.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_api_key.py
index b96a7f66c3..e040b2c497 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_api_key.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_api_key.py
@@ -11,8 +11,13 @@ def get_api_key(instance: Any) -> str:
     return instance.api_key
 
 
-def test_create_without_api_key(masked_env_var: str) -> None:
-    NVIDIA()
+def test_create_default_url_without_api_key(masked_env_var: str) -> None:
+    with pytest.warns(UserWarning):
+        NVIDIA()
+
+
+def test_create_unknown_url_without_api_key(masked_env_var: str) -> None:
+    NVIDIA(base_url="https://test_url/v1")
 
 
 @pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
@@ -35,7 +40,8 @@ def test_api_key_priority(masked_env_var: str) -> None:
 
 @pytest.mark.integration()
 def test_missing_api_key_error(masked_env_var: str) -> None:
-    client = NVIDIA()
+    with pytest.warns(UserWarning):
+        client = NVIDIA()
     with pytest.raises(Exception) as exc_info:
         client.complete("Hello, world!").text
     message = str(exc_info.value)
@@ -53,6 +59,6 @@ def test_bogus_api_key_error(masked_env_var: str) -> None:
 
 @pytest.mark.integration()
 @pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
-def test_api_key(param: str, masked_env_var: str) -> None:
-    client = NVIDIA(**{param: masked_env_var})
+def test_api_key(chat_model: str, mode: dict, param: str, masked_env_var: str) -> None:
+    client = NVIDIA(model=chat_model, **{**mode, **{param: masked_env_var}})
     assert client.complete("Hello, world!").text
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_available_models.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_available_models.py
index 01cdd7f745..dd32989604 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_available_models.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_available_models.py
@@ -5,7 +5,7 @@ from llama_index.llms.nvidia import NVIDIA
 
 @pytest.mark.integration()
 def test_available_models(mode: dict) -> None:
-    models = NVIDIA().mode(**mode).available_models
+    models = NVIDIA(**mode).available_models
     assert models
     assert isinstance(models, list)
     assert all(isinstance(model.id, str) for model in models)
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_integration.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_integration.py
index f26fb1e569..2fb7d88c26 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_integration.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_integration.py
@@ -10,7 +10,7 @@ from llama_index.llms.nvidia import NVIDIA
 @pytest.mark.integration()
 def test_chat(chat_model: str, mode: dict) -> None:
     message = ChatMessage(content="Hello")
-    response = NVIDIA(model=chat_model).mode(**mode).chat([message])
+    response = NVIDIA(model=chat_model, **mode).chat([message])
     assert isinstance(response, ChatResponse)
     assert isinstance(response.message, ChatMessage)
     assert isinstance(response.message.content, str)
@@ -18,7 +18,7 @@ def test_chat(chat_model: str, mode: dict) -> None:
 
 @pytest.mark.integration()
 def test_complete(chat_model: str, mode: dict) -> None:
-    response = NVIDIA(model=chat_model).mode(**mode).complete("Hello")
+    response = NVIDIA(model=chat_model, **mode).complete("Hello")
     assert isinstance(response, CompletionResponse)
     assert isinstance(response.text, str)
 
@@ -26,14 +26,14 @@ def test_complete(chat_model: str, mode: dict) -> None:
 @pytest.mark.integration()
 def test_stream_chat(chat_model: str, mode: dict) -> None:
     message = ChatMessage(content="Hello")
-    gen = NVIDIA(model=chat_model).mode(**mode).stream_chat([message])
+    gen = NVIDIA(model=chat_model, **mode).stream_chat([message])
     assert all(isinstance(response, ChatResponse) for response in gen)
     assert all(isinstance(response.delta, str) for response in gen)
 
 
 @pytest.mark.integration()
 def test_stream_complete(chat_model: str, mode: dict) -> None:
-    gen = NVIDIA(model=chat_model).mode(**mode).stream_complete("Hello")
+    gen = NVIDIA(model=chat_model, **mode).stream_complete("Hello")
     assert all(isinstance(response, CompletionResponse) for response in gen)
     assert all(isinstance(response.delta, str) for response in gen)
 
@@ -42,7 +42,7 @@ def test_stream_complete(chat_model: str, mode: dict) -> None:
 @pytest.mark.asyncio()
 async def test_achat(chat_model: str, mode: dict) -> None:
     message = ChatMessage(content="Hello")
-    response = await NVIDIA(model=chat_model).mode(**mode).achat([message])
+    response = await NVIDIA(model=chat_model, **mode).achat([message])
     assert isinstance(response, ChatResponse)
     assert isinstance(response.message, ChatMessage)
     assert isinstance(response.message.content, str)
@@ -51,7 +51,7 @@ async def test_achat(chat_model: str, mode: dict) -> None:
 @pytest.mark.integration()
 @pytest.mark.asyncio()
 async def test_acomplete(chat_model: str, mode: dict) -> None:
-    response = await NVIDIA(model=chat_model).mode(**mode).acomplete("Hello")
+    response = await NVIDIA(model=chat_model, **mode).acomplete("Hello")
     assert isinstance(response, CompletionResponse)
     assert isinstance(response.text, str)
 
@@ -60,7 +60,7 @@ async def test_acomplete(chat_model: str, mode: dict) -> None:
 @pytest.mark.asyncio()
 async def test_astream_chat(chat_model: str, mode: dict) -> None:
     message = ChatMessage(content="Hello")
-    gen = await NVIDIA(model=chat_model).mode(**mode).astream_chat([message])
+    gen = await NVIDIA(model=chat_model, **mode).astream_chat([message])
     responses = [response async for response in gen]
     assert all(isinstance(response, ChatResponse) for response in responses)
     assert all(isinstance(response.delta, str) for response in responses)
@@ -69,7 +69,7 @@ async def test_astream_chat(chat_model: str, mode: dict) -> None:
 @pytest.mark.integration()
 @pytest.mark.asyncio()
 async def test_astream_complete(chat_model: str, mode: dict) -> None:
-    gen = await NVIDIA(model=chat_model).mode(**mode).astream_complete("Hello")
+    gen = await NVIDIA(model=chat_model, **mode).astream_complete("Hello")
     responses = [response async for response in gen]
     assert all(isinstance(response, CompletionResponse) for response in responses)
     assert all(isinstance(response.delta, str) for response in responses)
@@ -83,6 +83,4 @@ async def test_astream_complete(chat_model: str, mode: dict) -> None:
     ],
 )
 def test_exclude_models(mode: dict, excluded: str) -> None:
-    assert excluded not in [
-        model.id for model in NVIDIA().mode(**mode).available_models
-    ]
+    assert excluded not in [model.id for model in NVIDIA(**mode).available_models]
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_mode_switch.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_mode_switch.py
index 887a129942..96bda87eed 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_mode_switch.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_mode_switch.py
@@ -1,37 +1,80 @@
 import pytest
 
 from llama_index.llms.nvidia import NVIDIA as Interface
-from llama_index.llms.nvidia.base import BASE_URL
+from llama_index.llms.nvidia.base import BASE_URL, KNOWN_URLS
 
 
-def test_mode_switch_nvidia_throws_without_key(masked_env_var: str):
+def test_mode_switch_nvidia_throws_without_key_deprecated(masked_env_var: str):
     x = Interface()
     with pytest.raises(ValueError):
-        x.mode("nvidia")
+        with pytest.warns(DeprecationWarning):
+            x.mode("nvidia")
 
 
-def test_mode_switch_nvidia_with_key(masked_env_var: str):
-    Interface().mode("nvidia", api_key="test")
+def test_mode_switch_nvidia_with_key_deprecated(masked_env_var: str):
+    with pytest.warns(DeprecationWarning):
+        Interface().mode("nvidia", api_key="test")
 
 
-def test_mode_switch_nim_throws_without_url():
+def test_mode_switch_nim_throws_without_url_deprecated():
     instance = Interface()
     with pytest.raises(ValueError):
-        instance.mode("nim")
+        with pytest.warns(DeprecationWarning):
+            instance.mode("nim")
 
 
-def test_mode_switch_nim_with_url():
-    Interface().mode("nim", base_url="test")
+def test_mode_switch_nim_with_url_deprecated():
+    with pytest.warns(DeprecationWarning):
+        Interface().mode("nim", base_url="test")
 
 
-def test_mode_switch_param_setting():
+def test_mode_switch_param_setting_deprecated():
     instance = Interface(model="dummy")
 
-    instance1 = instance.mode("nim", base_url="https://test_url/v1/")
+    with pytest.warns(DeprecationWarning):
+        instance1 = instance.mode("nim", base_url="https://test_url/v1/")
     assert instance1.model == "dummy"
     assert str(instance1.api_base) == "https://test_url/v1/"
 
-    instance2 = instance1.mode("nvidia", api_key="test", model="dummy-2")
+    with pytest.warns(DeprecationWarning):
+        instance2 = instance1.mode("nvidia", api_key="test", model="dummy-2")
     assert instance2.model == "dummy-2"
     assert str(instance2.api_base) == BASE_URL
     assert instance2.api_key == "test"
+
+
+UNKNOWN_URLS = [
+    "https://test_url/v1",
+    "https://test_url/v1/",
+    "https://test_url/.../v1",
+    "http://test_url/v1",
+    "http://test_url/v1/",
+    "http://test_url/.../v1/",
+]
+
+
+@pytest.mark.parametrize("base_url", UNKNOWN_URLS)
+def test_mode_switch_unknown_base_url_without_key(masked_env_var: str, base_url: str):
+    Interface(base_url=base_url)
+
+
+@pytest.mark.parametrize("base_url", UNKNOWN_URLS)
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_mode_switch_unknown_base_url_with_key(
+    masked_env_var: str, param: str, base_url: str
+):
+    Interface(base_url=base_url, **{param: "test"})
+
+
+@pytest.mark.parametrize("base_url", KNOWN_URLS)
+def test_mode_switch_known_base_url_without_key(masked_env_var: str, base_url: str):
+    with pytest.warns(UserWarning):
+        Interface(base_url=base_url)
+
+
+@pytest.mark.parametrize("base_url", KNOWN_URLS)
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_mode_switch_known_base_url_with_key(
+    masked_env_var: str, base_url: str, param: str
+):
+    Interface(base_url=base_url, **{param: "test"})
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py
index 91420d68cf..bed8c6c071 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py
@@ -11,12 +11,19 @@ from llama_index.core.instrumentation.events.rerank import (
 from llama_index.core.postprocessor.types import BaseNodePostprocessor
 from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle
 import requests
-
+import warnings
+from deprecated import deprecated
 from llama_index.core.base.llms.generic_utils import get_from_param_or_env
 
 
 DEFAULT_MODEL = "nv-rerank-qa-mistral-4b:1"
-DEFAULT_BASE_URL = "https://ai.api.nvidia.com/v1"
+BASE_URL = "https://ai.api.nvidia.com/v1"
+
+MODEL_ENDPOINT_MAP = {
+    DEFAULT_MODEL: BASE_URL,
+}
+
+KNOWN_URLS = list(MODEL_ENDPOINT_MAP.values())
 
 dispatcher = get_dispatcher(__name__)
 
@@ -45,35 +52,71 @@ class NVIDIARerank(BaseNodePostprocessor):
         ge=1,
         description="The maximum batch size supported by the inference server.",
     )
-    _api_key: str = PrivateAttr("API_KEY_NOT_PROVIDED")  # TODO: should be SecretStr
+    _api_key: str = PrivateAttr("NO_API_KEY_PROVIDED")  # TODO: should be SecretStr
     _mode: str = PrivateAttr("nvidia")
-    _base_url: str = PrivateAttr(DEFAULT_BASE_URL)
+    _is_hosted: bool = PrivateAttr(True)
+    _base_url: str = PrivateAttr(BASE_URL)
 
     def _set_api_key(self, nvidia_api_key: str = None, api_key: str = None) -> None:
         self._api_key = get_from_param_or_env(
             "api_key",
             nvidia_api_key or api_key,
             "NVIDIA_API_KEY",
-            "API_KEY_NOT_PROVIDED",
+            "NO_API_KEY_PROVIDED",
         )
 
     def __init__(
         self,
+        model: str = DEFAULT_MODEL,
         nvidia_api_key: Optional[str] = None,
         api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
         **kwargs: Any,
     ):
-        super().__init__(**kwargs)
+        """
+        Initialize an NVIDIARerank instance.
+
+        This class provides access to an NVIDIA NIM for reranking. By default, it connects to a hosted NIM, but it can be configured to connect to an on-premises NIM using the `base_url` parameter. An API key is required for the hosted NIM.
 
-        self._set_api_key(nvidia_api_key, api_key)
+        Args:
+            model (str): The model to use for reranking.
+            nvidia_api_key (str, optional): The NVIDIA API key. Defaults to None.
+            api_key (str, optional): Alias for `nvidia_api_key`, used only when `nvidia_api_key` is not given. Defaults to None.
+            base_url (str, optional): The base URL of the on-premises NIM. Defaults to None, in which case the hosted endpoint mapped to `model` is used.
+            **kwargs: Additional keyword arguments.
+
+        API Key:
+        - The recommended way to provide the API key is through the `NVIDIA_API_KEY` environment variable.
+        """
+        super().__init__(model=model, **kwargs)
+
+        self._base_url = base_url or MODEL_ENDPOINT_MAP.get(model, BASE_URL)
+
+        self._api_key = get_from_param_or_env(
+            "api_key",
+            nvidia_api_key or api_key,
+            "NVIDIA_API_KEY",
+            "NO_API_KEY_PROVIDED",
+        )
+
+        self._is_hosted = self._base_url in KNOWN_URLS
+
+        if self._is_hosted and self._api_key == "NO_API_KEY_PROVIDED":
+            warnings.warn(
+                "An API key is required for hosted NIM. This will become an error in 0.2.0."
+            )
 
     @property
     def available_models(self) -> List[Model]:
         """Get available models."""
-        # there is one model on ai.nvidia.com and available as a local NIM
-        ids = [DEFAULT_MODEL]
+        # all available models are in the map
+        ids = MODEL_ENDPOINT_MAP.keys()
         return [Model(id=id) for id in ids]
 
+    @deprecated(
+        version="0.1.2",
+        reason="Will be removed in 0.2. Construct with `base_url` instead.",
+    )
     def mode(
         self,
         mode: Literal["nvidia", "nim"] = "nvidia",
@@ -83,29 +126,20 @@ class NVIDIARerank(BaseNodePostprocessor):
         api_key: Optional[str] = None,
     ) -> "NVIDIARerank":
         """
-        Change the mode.
-
-        There are two modes, "nvidia" and "nim". The "nvidia" mode is the default mode
-        and is used to interact with hosted NVIDIA NIMs. The "nim" mode is
-        used to interact with local NVIDIA NIM endpoints, which are typically hosted
-        on-premises.
-
-        For the "nvidia" mode, the "api_key" parameter is available to specify your
-        API key. If not specified, the NVIDIA_API_KEY environment variable will be used.
-
-        For the "nim" mode, the "base_url" is required and "model" is recommended. Set
-        base_url to the url of your NVIDIA NIM endpoint. For instance,
-        "https://localhost:1976/v1", it should end in "/v1". Additionally, the "model"
-        parameter must be set to the name of the model inside the NIM.
+        Deprecated: use NVIDIARerank(base_url=...) instead.
         """
         if isinstance(self, str):
             raise ValueError("Please construct the model before calling mode()")
 
-        if mode == "nim":
+        self._is_hosted = mode == "nvidia"
+
+        if not self._is_hosted:
             if not base_url:
                 raise ValueError("base_url is required for nim mode")
+        else:
+            api_key = get_from_param_or_env("api_key", api_key, "NVIDIA_API_KEY")
         if not base_url:
-            base_url = DEFAULT_BASE_URL
+            base_url = BASE_URL
 
         self._mode = mode
         if base_url:
@@ -128,7 +162,7 @@ class NVIDIARerank(BaseNodePostprocessor):
         if model:
             self.model = model
         if api_key:
-            self._set_api_key(api_key)
+            self._api_key = api_key
 
         return self
 
@@ -191,7 +225,7 @@ class NVIDIARerank(BaseNodePostprocessor):
                 }
                 # the hosted NIM path is different from the local NIM path
                 url = self._base_url
-                if self._mode == "nvidia":
+                if self._is_hosted:
                     url += "/retrieval/nvidia/reranking"
                 else:
                     url += "/ranking"
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml
index 3e836c160c..348f035351 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml
@@ -30,7 +30,7 @@ license = "MIT"
 name = "llama-index-postprocessor-nvidia-rerank"
 packages = [{include = "llama_index/"}]
 readme = "README.md"
-version = "0.1.1"
+version = "0.1.2"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/conftest.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/conftest.py
index 1521e3f97a..6f0b5959ba 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/conftest.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/conftest.py
@@ -1,12 +1,25 @@
 import pytest
 import os
 
-from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
+from llama_index.postprocessor.nvidia_rerank import NVIDIARerank as Interface
 from llama_index.postprocessor.nvidia_rerank.base import DEFAULT_MODEL
 
 from typing import Generator
 
-from contextlib import contextmanager
+
+# This fixture masks the NVIDIA_API_KEY environment variable for the duration of a
+# test and restores it afterwards. It yields the value the variable had before it
+# was masked (None if it was not set) so that the test can use it.
+@pytest.fixture()
+def masked_env_var() -> Generator[str, None, None]:
+    var = "NVIDIA_API_KEY"
+    try:
+        if val := os.environ.get(var, None):
+            del os.environ[var]
+        yield val
+    finally:
+        if val:
+            os.environ[var] = val
 
 
 def pytest_collection_modifyitems(config, items):
@@ -54,23 +67,10 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
         if model := metafunc.config.getoption("--model-id"):
             models = [model]
         elif metafunc.config.getoption("--all-models"):
-            models = [
-                model.id for model in NVIDIARerank().mode(**mode).available_models
-            ]
+            models = [model.id for model in Interface(**mode).available_models]
         metafunc.parametrize("model", models, ids=models)
 
 
 @pytest.fixture()
 def mode(request: pytest.FixtureRequest) -> dict:
     return get_mode(request.config)
-
-
-@contextmanager
-def no_env_var(var: str) -> Generator[None, None, None]:
-    try:
-        if val := os.environ.get(var, None):
-            del os.environ[var]
-        yield
-    finally:
-        if val:
-            os.environ[var] = val
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py
index 21ce8398c4..86c0a55db2 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py
@@ -2,71 +2,73 @@ import os
 
 import pytest
 
-from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
+from llama_index.postprocessor.nvidia_rerank import NVIDIARerank as Interface
 from llama_index.core.schema import NodeWithScore, Document
 
 from typing import Any
-from .conftest import no_env_var
 
 
 def get_api_key(instance: Any) -> str:
     return instance._api_key
 
 
-def test_create_without_api_key() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        NVIDIARerank()
+def test_create_default_url_without_api_key(masked_env_var: str) -> None:
+    with pytest.warns(UserWarning):
+        Interface()
+
+
+def test_create_unknown_url_without_api_key(masked_env_var: str) -> None:
+    Interface(base_url="https://test_url/v1")
 
 
 @pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
-def test_create_with_api_key(param: str) -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        instance = NVIDIARerank(**{param: "just testing no failure"})
-        assert get_api_key(instance) == "just testing no failure"
+def test_create_with_api_key(param: str, masked_env_var: str) -> None:
+    instance = Interface(**{param: "just testing no failure"})
+    assert get_api_key(instance) == "just testing no failure"
 
 
-def test_api_key_priority() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
+def test_api_key_priority(masked_env_var: str) -> None:
+    try:
         os.environ["NVIDIA_API_KEY"] = "ENV"
-        assert get_api_key(NVIDIARerank()) == "ENV"
-        assert get_api_key(NVIDIARerank(nvidia_api_key="PARAM")) == "PARAM"
-        assert get_api_key(NVIDIARerank(api_key="PARAM")) == "PARAM"
-        assert get_api_key(NVIDIARerank(api_key="LOW", nvidia_api_key="HIGH")) == "HIGH"
+        assert get_api_key(Interface()) == "ENV"
+        assert get_api_key(Interface(nvidia_api_key="PARAM")) == "PARAM"
+        assert get_api_key(Interface(api_key="PARAM")) == "PARAM"
+        assert get_api_key(Interface(api_key="LOW", nvidia_api_key="HIGH")) == "HIGH"
+    finally:
+        # we must clean up environ or it may impact other tests
+        del os.environ["NVIDIA_API_KEY"]
 
 
 @pytest.mark.integration()
-def test_missing_api_key_error() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        client = NVIDIARerank()
-        with pytest.raises(Exception) as exc_info:
-            client.postprocess_nodes(
-                [NodeWithScore(node=Document(text="Hello, world!"))],
-                query_str="Hello, world!",
-            )
-        message = str(exc_info.value)
-        assert "401" in message
+def test_missing_api_key_error(masked_env_var: str) -> None:
+    with pytest.warns(UserWarning):
+        client = Interface()
+    with pytest.raises(Exception) as exc_info:
+        client.postprocess_nodes(
+            [NodeWithScore(node=Document(text="Hello, world!"))],
+            query_str="Hello, world!",
+        )
+    message = str(exc_info.value)
+    assert "401" in message
 
 
 @pytest.mark.integration()
-def test_bogus_api_key_error() -> None:
-    with no_env_var("NVIDIA_API_KEY"):
-        client = NVIDIARerank(nvidia_api_key="BOGUS")
-        with pytest.raises(Exception) as exc_info:
-            client.postprocess_nodes(
-                [NodeWithScore(node=Document(text="Hello, world!"))],
-                query_str="Hello, world!",
-            )
-        message = str(exc_info.value)
-        assert "401" in message
+def test_bogus_api_key_error(masked_env_var: str) -> None:
+    client = Interface(nvidia_api_key="BOGUS")
+    with pytest.raises(Exception) as exc_info:
+        client.postprocess_nodes(
+            [NodeWithScore(node=Document(text="Hello, world!"))],
+            query_str="Hello, world!",
+        )
+    message = str(exc_info.value)
+    assert "401" in message
 
 
 @pytest.mark.integration()
 @pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
-def test_api_key(param: str, model: str, mode: dict) -> None:
-    api_key = os.environ.get("NVIDIA_API_KEY")
-    with no_env_var("NVIDIA_API_KEY"):
-        client = NVIDIARerank(**{"model": model, param: api_key}).mode(**mode)
-        assert client.postprocess_nodes(
-            [NodeWithScore(node=Document(text="Hello, world!"))],
-            query_str="Hello, world!",
-        )
+def test_api_key(model: str, mode: dict, param: str, masked_env_var: str) -> None:
+    client = Interface(model=model, **{**mode, **{param: masked_env_var}})
+    assert client.postprocess_nodes(
+        [NodeWithScore(node=Document(text="Hello, world!"))],
+        query_str="Hello, world!",
+    )
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py
index 69752c566c..698118baa6 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py
@@ -5,7 +5,7 @@ from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
 
 @pytest.mark.integration()
 def test_available_models(mode: dict) -> None:
-    models = NVIDIARerank().mode(**mode).available_models
+    models = NVIDIARerank(**mode).available_models
     assert models
     assert isinstance(models, list)
     assert all(isinstance(model.id, str) for model in models)
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_mode_switch.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_mode_switch.py
index 7801878e00..eb2ef2334b 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_mode_switch.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_mode_switch.py
@@ -1,41 +1,80 @@
 import pytest
 
-from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
-from llama_index.postprocessor.nvidia_rerank.base import DEFAULT_BASE_URL
+from llama_index.postprocessor.nvidia_rerank import NVIDIARerank as Interface
+from llama_index.postprocessor.nvidia_rerank.base import KNOWN_URLS, BASE_URL
 
-from .conftest import no_env_var
 
-# we don't test this because we do not want to force users to have an API key
-#  NVIDIARerank().mode("nim", base_url=...) must work without an API key
-# def test_mode_switch_nvidia_throws_without_key():
-#     emb = NVIDIARerank()
-#     with pytest.raises(ValueError):
-#         emb.mode("nvidia")
+def test_mode_switch_throws_without_key_deprecated(masked_env_var: str):
+    x = Interface()
+    with pytest.raises(ValueError):
+        with pytest.warns(DeprecationWarning):
+            x.mode("nvidia")
 
 
-def test_mode_switch_nvidia_with_key():
-    with no_env_var("NVIDIA_API_KEY"):
-        NVIDIARerank().mode("nvidia", api_key="test")
+def test_mode_switch_with_key_deprecated(masked_env_var: str):
+    with pytest.warns(DeprecationWarning):
+        Interface().mode("nvidia", api_key="test")
 
 
-def test_mode_switch_nim_throws_without_url():
-    instance = NVIDIARerank()
+def test_mode_switch_nim_throws_without_url_deprecated():
+    instance = Interface()
     with pytest.raises(ValueError):
-        instance.mode("nim")
+        with pytest.warns(DeprecationWarning):
+            instance.mode("nim")
 
 
-def test_mode_switch_nim_with_url():
-    NVIDIARerank().mode("nim", base_url="http://host/test/v1")
+def test_mode_switch_nim_with_url_deprecated():
+    with pytest.warns(DeprecationWarning):
+        Interface().mode("nim", base_url="http://test/v1")
 
 
-def test_mode_switch_param_setting():
-    instance0 = NVIDIARerank(model="dummy")
+def test_mode_switch_param_setting_deprecated():
+    instance = Interface(model="dummy")
 
-    isntance1 = instance0.mode("nim", base_url="https://test_url/v1/")
-    assert isntance1.model == "dummy"
-    assert str(isntance1._base_url) == "https://test_url/v1/"
+    with pytest.warns(DeprecationWarning):
+        instance1 = instance.mode("nim", base_url="https://test_url/v1/")
+    assert instance1.model == "dummy"
+    assert str(instance1._base_url) == "https://test_url/v1/"
 
-    instance2 = isntance1.mode("nvidia", api_key="test", model="dummy-2")
+    with pytest.warns(DeprecationWarning):
+        instance2 = instance1.mode("nvidia", api_key="test", model="dummy-2")
     assert instance2.model == "dummy-2"
-    assert str(instance2._base_url) == DEFAULT_BASE_URL
+    assert str(instance2._base_url) == BASE_URL
     assert instance2._api_key == "test"
+
+
+UNKNOWN_URLS = [
+    "https://test_url/v1",
+    "https://test_url/v1/",
+    "https://test_url/.../v1",
+    "http://test_url/v1",
+    "http://test_url/v1/",
+    "http://test_url/.../v1/",
+]
+
+
+@pytest.mark.parametrize("base_url", UNKNOWN_URLS)
+def test_mode_switch_unknown_base_url_without_key(masked_env_var: str, base_url: str):
+    Interface(base_url=base_url)
+
+
+@pytest.mark.parametrize("base_url", UNKNOWN_URLS)
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_mode_switch_unknown_base_url_with_key(
+    masked_env_var: str, param: str, base_url: str
+):
+    Interface(base_url=base_url, **{param: "test"})
+
+
+@pytest.mark.parametrize("base_url", KNOWN_URLS)
+def test_mode_switch_known_base_url_without_key(masked_env_var: str, base_url: str):
+    with pytest.warns(UserWarning):
+        Interface(base_url=base_url)
+
+
+@pytest.mark.parametrize("base_url", KNOWN_URLS)
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_mode_switch_known_base_url_with_key(
+    masked_env_var: str, base_url: str, param: str
+):
+    Interface(base_url=base_url, **{param: "test"})
diff --git a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py
index 3cdd6a33e2..6ef77b6487 100644
--- a/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py
+++ b/llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_postprocessor_nvidia_rerank.py
@@ -34,13 +34,9 @@ def nodes(documents: List[Document]) -> List[NodeWithScore]:
 @pytest.mark.integration()
 def test_basic(model: str, mode: dict) -> None:
     text = "Testing leads to failure, and failure leads to understanding."
-    result = (
-        NVIDIARerank(model=model)
-        .mode(**mode)
-        .postprocess_nodes(
-            [NodeWithScore(node=Document(text=text))],
-            query_str=text,
-        )
+    result = NVIDIARerank(model=model, **mode).postprocess_nodes(
+        [NodeWithScore(node=Document(text=text))],
+        query_str=text,
     )
     assert result
     assert isinstance(result, list)
@@ -55,13 +51,9 @@ def test_basic(model: str, mode: dict) -> None:
 def test_accuracy(model: str, mode: dict) -> None:
     texts = ["first", "last"]
     query = "last"
-    result = (
-        NVIDIARerank(model=model)
-        .mode(**mode)
-        .postprocess_nodes(
-            [NodeWithScore(node=Document(text=text)) for text in texts],
-            query_str=query,
-        )
+    result = NVIDIARerank(model=model, **mode).postprocess_nodes(
+        [NodeWithScore(node=Document(text=text)) for text in texts],
+        query_str=query,
     )
     assert result
     assert isinstance(result, list)
@@ -74,7 +66,7 @@ def test_accuracy(model: str, mode: dict) -> None:
 
 @pytest.mark.integration()
 def test_direct_empty_docs(query: str, model: str, mode: dict) -> None:
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     result_docs = ranker.postprocess_nodes(nodes=[], query_str=query)
     assert len(result_docs) == 0
 
@@ -85,7 +77,7 @@ def test_direct_top_n_negative(
 ) -> None:
     orig = NVIDIARerank.Config.validate_assignment
     NVIDIARerank.Config.validate_assignment = False
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     ranker.top_n = -100
     NVIDIARerank.Config.validate_assignment = orig
     result = ranker.postprocess_nodes(nodes=nodes, query_str=query)
@@ -96,7 +88,7 @@ def test_direct_top_n_negative(
 def test_direct_top_n_zero(
     query: str, nodes: List[NodeWithScore], model: str, mode: dict
 ) -> None:
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     ranker.top_n = 0
     result = ranker.postprocess_nodes(nodes=nodes, query_str=query)
     assert len(result) == 0
@@ -106,7 +98,7 @@ def test_direct_top_n_zero(
 def test_direct_top_n_one(
     query: str, nodes: List[NodeWithScore], model: str, mode: dict
 ) -> None:
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     ranker.top_n = 1
     result = ranker.postprocess_nodes(nodes=nodes, query_str=query)
     assert len(result) == 1
@@ -116,7 +108,7 @@ def test_direct_top_n_one(
 def test_direct_top_n_equal_len_docs(
     query: str, nodes: List[NodeWithScore], model: str, mode: dict
 ) -> None:
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     ranker.top_n = len(nodes)
     result = ranker.postprocess_nodes(nodes=nodes, query_str=query)
     assert len(result) == len(nodes)
@@ -126,7 +118,7 @@ def test_direct_top_n_equal_len_docs(
 def test_direct_top_n_greater_len_docs(
     query: str, nodes: List[NodeWithScore], model: str, mode: dict
 ) -> None:
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     ranker.top_n = len(nodes) * 2
     result = ranker.postprocess_nodes(nodes=nodes, query_str=query)
     assert len(result) == len(nodes)
@@ -134,13 +126,13 @@ def test_direct_top_n_greater_len_docs(
 
 @pytest.mark.parametrize("batch_size", [-10, 0])
 def test_invalid_max_batch_size(model: str, mode: dict, batch_size: int) -> None:
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     with pytest.raises(ValueError):
         ranker.max_batch_size = batch_size
 
 
 def test_invalid_top_n(model: str, mode: dict) -> None:
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     with pytest.raises(ValueError):
         ranker.top_n = -10
 
@@ -167,7 +159,7 @@ def test_rerank_batching(
 ) -> None:
     assert len(nodes) > batch_size, "test requires more nodes"
 
-    ranker = NVIDIARerank(model=model).mode(**mode)
+    ranker = NVIDIARerank(model=model, **mode)
     ranker.top_n = top_n
     ranker.max_batch_size = batch_size
     result = ranker.postprocess_nodes(nodes=nodes, query_str=query)
-- 
GitLab