diff --git a/docs/docs/examples/llm/nvidia.ipynb b/docs/docs/examples/llm/nvidia.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..566240104fc9b5a3a4d17092680a0822cc22cfef
--- /dev/null
+++ b/docs/docs/examples/llm/nvidia.ipynb
@@ -0,0 +1,862 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using NVIDIA's LLM API Catalog Connector\n",
+    "\n",
+    "This notebook will guide you through understanding the basic usage of the `NVIDIA` connector.\n",
+    "\n",
+    "With this connector, you'll be able to connect to and generate from compatible models available at the NVIDIA [API Catalog](https://build.nvidia.com/explore/discover), such as:\n",
+    "\n",
+    "- Google's [gemma-7b](https://build.nvidia.com/google/gemma-7b)\n",
+    "- Mistal AI's [mistral-7b-instruct-v0.2](https://build.nvidia.com/mistralai/mistral-7b-instruct-v2)\n",
+    "- And more!\n",
+    "\n",
+    "We'll begin by ensuring `llama-index` and associated packages are installed.\n",
+    "\n",
+    "> NOTE: Only models that have a base URL of `https://integrate.api.nvidia.com/v1` are compatible with this connector at this time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting llama-index-embeddings-openai\n",
+      "  Using cached llama_index_embeddings_openai-0.1.7-py3-none-any.whl.metadata (603 bytes)\n",
+      "Requirement already satisfied: llama-index-core<0.11.0,>=0.10.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-embeddings-openai) (0.10.30)\n",
+      "Requirement already satisfied: PyYAML>=6.0.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (6.0.1)\n",
+      "Requirement already satisfied: SQLAlchemy>=1.4.49 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from SQLAlchemy[asyncio]>=1.4.49->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.0.29)\n",
+      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.6 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.9.5)\n",
+      "Requirement already satisfied: dataclasses-json in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.6.4)\n",
+      "Requirement already satisfied: deprecated>=1.2.9.3 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.2.14)\n",
+      "Requirement already satisfied: dirtyjson<2.0.0,>=1.0.8 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.0.8)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.3.1)\n",
+      "Requirement already satisfied: httpx in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.27.0)\n",
+      "Requirement already satisfied: llamaindex-py-client<0.2.0,>=0.1.18 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.1.18)\n",
+      "Requirement already satisfied: nest-asyncio<2.0.0,>=1.5.8 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.6.0)\n",
+      "Requirement already satisfied: networkx>=3.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.1)\n",
+      "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.8.1)\n",
+      "Requirement already satisfied: numpy in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.24.4)\n",
+      "Requirement already satisfied: openai>=1.1.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.22.0)\n",
+      "Requirement already satisfied: pandas in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.0.3)\n",
+      "Requirement already satisfied: pillow>=9.0.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (10.3.0)\n",
+      "Requirement already satisfied: requests>=2.31.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.31.0)\n",
+      "Requirement already satisfied: tenacity<9.0.0,>=8.2.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (8.2.3)\n",
+      "Requirement already satisfied: tiktoken>=0.3.3 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.6.0)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.66.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (4.66.2)\n",
+      "Requirement already satisfied: typing-extensions>=4.5.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (4.11.0)\n",
+      "Requirement already satisfied: typing-inspect>=0.8.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.9.0)\n",
+      "Requirement already satisfied: wrapt in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.16.0)\n",
+      "Requirement already satisfied: aiosignal>=1.1.2 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.3.1)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (23.2.0)\n",
+      "Requirement already satisfied: frozenlist>=1.1.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.4.1)\n",
+      "Requirement already satisfied: multidict<7.0,>=4.5 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (6.0.5)\n",
+      "Requirement already satisfied: yarl<2.0,>=1.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.9.4)\n",
+      "Requirement already satisfied: pydantic>=1.10 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.7.0)\n",
+      "Requirement already satisfied: anyio in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (4.3.0)\n",
+      "Requirement already satisfied: certifi in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.2.2)\n",
+      "Requirement already satisfied: httpcore==1.* in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.0.5)\n",
+      "Requirement already satisfied: idna in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.7)\n",
+      "Requirement already satisfied: sniffio in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.3.1)\n",
+      "Requirement already satisfied: h11<0.15,>=0.13 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpcore==1.*->httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.14.0)\n",
+      "Requirement already satisfied: click in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (8.1.7)\n",
+      "Requirement already satisfied: joblib in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.4.0)\n",
+      "Requirement already satisfied: regex>=2021.8.3 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.4.16)\n",
+      "Requirement already satisfied: distro<2,>=1.7.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from openai>=1.1.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.9.0)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from requests>=2.31.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.3.2)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from requests>=2.31.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.2.1)\n",
+      "Requirement already satisfied: greenlet!=0.4.17 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from SQLAlchemy>=1.4.49->SQLAlchemy[asyncio]>=1.4.49->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.0.3)\n",
+      "Requirement already satisfied: mypy-extensions>=0.3.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from typing-inspect>=0.8.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.0.0)\n",
+      "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from dataclasses-json->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.21.1)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.9.0.post0)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.1)\n",
+      "Requirement already satisfied: tzdata>=2022.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.1)\n",
+      "Requirement already satisfied: packaging>=17.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (24.0)\n",
+      "Requirement already satisfied: annotated-types>=0.4.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.6.0)\n",
+      "Requirement already satisfied: pydantic-core==2.18.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.18.1)\n",
+      "Requirement already satisfied: six>=1.5 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.16.0)\n",
+      "Using cached llama_index_embeddings_openai-0.1.7-py3-none-any.whl (6.0 kB)\n",
+      "Installing collected packages: llama-index-embeddings-openai\n",
+      "Successfully installed llama-index-embeddings-openai-0.1.7\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install llama-index-embeddings-openai llama-index-readers-file"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## API Keys and Boilerplate\n",
+    "\n",
+    "During the next cell we'll run some boilerplate to allow the examples to be executed smoothly in a notebook environment. \n",
+    "\n",
+    "We'll also provide our API keys. \n",
+    "\n",
+    "> NOTE: You can create your NVIDIA API key using the `Get API Key` button in the code example window."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "# Using OpenAI API for embeddings\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-\"\n",
+    "\n",
+    "# Using NVIDIA API Playground API Key for LLM\n",
+    "os.environ[\"NVIDIA_API_KEY\"] = \"nvapi-\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading the NVIDIA LLM\n",
+    "\n",
+    "Now we can load our `NVIDIA` LLM by passing in the model name, as found in the docs - located [here](https://docs.api.nvidia.com/nim/reference/)\n",
+    "\n",
+    "> NOTE: The default model is `mistralai/mistral-7b-instruct-v0.2`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.nvidia import NVIDIA\n",
+    "from llama_index.core import VectorStoreIndex\n",
+    "from llama_index.core import Settings\n",
+    "\n",
+    "llm = NVIDIA(model=\"mistralai/mistral-7b-instruct-v0.2\")\n",
+    "\n",
+    "Settings.llm = llm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can observe which model our `llm` object is currently associated with the `.model` attribute."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'mistralai/mistral-7b-instruct-v0.2'"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm.model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading API Catalogue LLM\n",
+    "\n",
+    "We can also load models using their API Catalogue address.\n",
+    "\n",
+    "Let's use `gemma-7b` as an example!\n",
+    "\n",
+    "1. Navigate to the [model page](https://build.nvidia.com/google/gemma-7b)\n",
+    "2. Find the address in the `model` parameter (e.g. `\"google/gemma-7b\"`)\n",
+    "3. Verify it has the `base_url` of `\"https://integrate.api.nvidia.com/v1\"`\n",
+    "4. Use `NVIDIA(model=\"model_name_here\")` to point the connector at that model (e.g. `NVIDIA(model=\"google/gemma-7b\"`)\n",
+    "\n",
+    "Let's see this in the code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = NVIDIA(model=\"google/gemma-7b\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's confirm we've associated our `NvidiaAIPlayground` LLM with the correct model!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'google/gemma-7b'"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm.model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Basic Functionality\n",
+    "\n",
+    "Now we can explore the different ways you can use the connector within the LlamaIndex ecosystem!\n",
+    "\n",
+    "Before we begin, lets set up a list of `ChatMessage` objects - which is the expected input for some of the methods."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.llms import ChatMessage, MessageRole\n",
+    "\n",
+    "chat_messages = [\n",
+    "    ChatMessage(\n",
+    "        role=MessageRole.SYSTEM, content=(\"You are a helpful assistant.\")\n",
+    "    ),\n",
+    "    ChatMessage(\n",
+    "        role=MessageRole.USER,\n",
+    "        content=(\"What are the most popular house pets in North America?\"),\n",
+    "    ),\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll follow the same basic pattern for each example: \n",
+    "\n",
+    "1. We'll point our `NVIDIA` LLM to our desired model\n",
+    "2. We'll examine how to use the endpoint to achieve the desired task!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Complete: `.complete()`\n",
+    "\n",
+    "We can use `.complete()`/`.acomplete()` (which takes a string) to prompt a response from the selected model.\n",
+    "\n",
+    "Let's use our default model for this task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "completion_llm = NVIDIA()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can verify this is the expected default by checking the `.model` attribute."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'mistralai/mistral-7b-instruct-v0.2'"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completion_llm.model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's call `.complete()` on our model with a string, in this case `\"Hello!\"`, and observe the response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CompletionResponse(text=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So, feel free to ask me anything!\\n\\nIf you're looking for some general information, I can help you with that too. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some\", additional_kwargs={}, raw={'id': 'chatcmpl-f6906079-51e7-44bf-aaea-a9478397dfbf', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So, feel free to ask me anything!\\n\\nIf you're looking for some general information, I can help you with that too. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? 
Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some\", role='assistant', function_call=None, tool_calls=None))], 'created': 1713474670, 'model': 'mistralai/mistral-7b-instruct-v0.2', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=512, prompt_tokens=11, total_tokens=523)}, logprobs=None, delta=None)"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completion_llm.complete(\"Hello!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As is expected by LlamaIndex - we get a `CompletionResponse` in response."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Async Complete: `.acomplete()`\n",
+    "\n",
+    "There is also an async implementation which can be leveraged in the same way!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CompletionResponse(text=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So feel free to ask me anything!\\n\\nIf you're looking for a specific topic, just let me know and I'll do my best to provide you with accurate and up-to-date information. And if you have any requests for fun facts or trivia, I'm happy to oblige!\\n\\nSo, what would you like to know today? Let me help make your day a little brighter! 😊\", additional_kwargs={}, raw={'id': 'chatcmpl-8ce881c1-a47b-43aa-afd8-9e9addf26ce9', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So feel free to ask me anything!\\n\\nIf you're looking for a specific topic, just let me know and I'll do my best to provide you with accurate and up-to-date information. And if you have any requests for fun facts or trivia, I'm happy to oblige!\\n\\nSo, what would you like to know today? Let me help make your day a little brighter! 😊\", role='assistant', function_call=None, tool_calls=None))], 'created': 1712175910, 'model': 'mistralai/mistral-7b-instruct-v0.2', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=123, prompt_tokens=11, total_tokens=134)}, logprobs=None, delta=None)"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "await completion_llm.acomplete(\"Hello!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Chat: `.chat()`\n",
+    "\n",
+    "Now we can try the same thing using the `.chat()` method. This method expects a list of chat messages - so we'll use the one we created above.\n",
+    "\n",
+    "We'll use the `mistralai/mixtral-8x7b-instruct-v0.1` model for the example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_llm = NVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "All we need to do now is call `.chat()` on our list of `ChatMessages` and observe our response.\n",
+    "\n",
+    "You'll also notice that we can pass in a few additional key-word arguments that can influence the generation - in this case, we've used the `seed` parameter to influence our generation and the `stop` parameter to indicate we want the model to stop generating once it reaches a certain token!\n",
+    "\n",
+    "> NOTE: You can find information about what additional kwargs are supported by the model's endpoint by referencing the API documentation for the selected model. Mixtral's is located [here](https://docs.api.nvidia.com/nim/reference/mistralai-mixtral-8x7b-instruct-infer) as an example!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content=\" In North America, the most popular types of house pets are:\\n\\n1. Dogs: Man's best friend is the most popular pet in North America. They are known for their loyalty, companionship, and the variety of breeds that cater to different lifestyles and preferences.\\n\\n2. Cats\", additional_kwargs={}), raw={'id': 'chatcmpl-b6ef95ca-e023-4dc8-8ee9-843f214169e9', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=\" In North America, the most popular types of house pets are:\\n\\n1. Dogs: Man's best friend is the most popular pet in North America. They are known for their loyalty, companionship, and the variety of breeds that cater to different lifestyles and preferences.\\n\\n2. Cats\", role='assistant', function_call=None, tool_calls=None))], 'created': 1713474655, 'model': 'mistralai/mixtral-8x7b-instruct-v0.1', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=66, prompt_tokens=26, total_tokens=92)}, delta=None, logprobs=None, additional_kwargs={})"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chat_llm.chat(chat_messages, seed=4, stop=[\"cat\", \"cats\", \"Cat\", \"Cats\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As expected, we receive a `ChatResponse` in response."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Async Chat: (`achat`)\n",
+    "\n",
+    "We also have an async implementation of the `.chat()` method which can be called in the following way."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content=' The most popular house pets in North America are dogs and cats. According to the American Pet Products Association (APPA), as of 2021, approximately 69 million homes in the United States own a pet, and 63.4 million of those households have a dog, while 42.7 million have a cat. Birds, small mammals, reptiles, and fish are also popular pets, but to a lesser extent.', additional_kwargs={}), raw={'id': 'chatcmpl-373a1d42-4dc1-4ef9-aaf3-5fea137e8e1e', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=' The most popular house pets in North America are dogs and cats. According to the American Pet Products Association (APPA), as of 2021, approximately 69 million homes in the United States own a pet, and 63.4 million of those households have a dog, while 42.7 million have a cat. Birds, small mammals, reptiles, and fish are also popular pets, but to a lesser extent.', role='assistant', function_call=None, tool_calls=None))], 'created': 1712177472, 'model': 'mistralai/mixtral-8x7b-instruct-v0.1', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=95, prompt_tokens=59, total_tokens=154)}, delta=None, logprobs=None, additional_kwargs={})"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "await chat_llm.achat(chat_messages)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Stream: `.stream_chat()`\n",
+    "\n",
+    "We can also use the models found on `build.nvidia.com` for streaming use-cases!\n",
+    "\n",
+    "Let's select another model and observe this behaviour. We'll use Google's `gemma-7b` model for this task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream_llm = NVIDIA(model=\"google/gemma-7b\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's call our model with `.stream_chat()`, which again expects a list of `ChatMessage` objects, and capture the response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streamed_response = stream_llm.stream_chat(chat_messages)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<generator object llm_chat_callback.<locals>.wrap.<locals>.wrapped_llm_chat.<locals>.wrapped_gen at 0x7dd89853e320>"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "streamed_response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As we can see, the response is a generator with the streamed response. \n",
+    "\n",
+    "Let's take a look at the final response once the generation is complete."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "assistant: **Top Popular House Pets in North America:**\n",
+      "\n",
+      "**1. Dogs:**\n",
+      "* Estimated 63.4 million pet dogs in households (2023)\n",
+      "* Known for their loyalty, companionship, and trainability\n",
+      "\n",
+      "**2. Cats:**\n",
+      "* Estimated 38.4 million pet cats in households (2023)\n",
+      "* Known for their independence, affection, and low-maintenance nature\n",
+      "\n",
+      "**3. Fish:**\n",
+      "* Estimated 14.5 million pet fish in households (2023)\n",
+      "* Popular for their tranquility, beauty, and variety of species\n",
+      "\n",
+      "**4. Small mammals (guinea pigs, hamsters, rabbits):**\n",
+      "* Estimated 14.4 million pet small mammals in households (2023)\n",
+      "* Known for their playful and affectionate nature\n",
+      "\n",
+      "**5. Birds:**\n",
+      "* Estimated 13.3 million pet birds in households (2023)\n",
+      "* Known for their beauty, song, and intelligence\n",
+      "\n",
+      "**Other popular pets:**\n",
+      "\n",
+      "* Tortoises and reptiles\n",
+      "* Hamsters and rodents\n",
+      "* Invertebrates (such as spiders and hermit crabs)\n",
+      "\n",
+      "**Factors influencing pet popularity:**\n",
+      "\n",
+      "* **Lifestyle and living situation:** Urban dwellers are more likely to have cats, while suburban and rural residents are more likely to have dogs.\n",
+      "* **Cost:** Dogs tend to be more expensive to own than cats.\n",
+      "* **Personality and preferences:** Some people prefer the companionship of dogs, while others prefer the independence of cats.\n",
+      "* **Availability:** Certain pets are easier to find or adopt than others.\n",
+      "* **Trend and cultural influences:** Some pets become more popular than others due to trends or cultural preferences.\n"
+     ]
+    }
+   ],
+   "source": [
+    "last_element = None\n",
+    "for last_element in streamed_response:\n",
+    "    pass\n",
+    "\n",
+    "print(last_element)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Async Stream: `.astream_chat()`\n",
+    "\n",
+    "We have the equivalent async method for streaming as well, which can be used in a similar way to the sync implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streamed_response = await stream_llm.astream_chat(chat_messages)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<async_generator object llm_chat_callback.<locals>.wrap.<locals>.wrapped_async_llm_chat.<locals>.wrapped_gen at 0x787709eea460>"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "streamed_response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "assistant: Sure, here are the most popular house pets in North America:\n",
+      "\n",
+      "1. Dogs\n",
+      "2. Cats\n",
+      "3. Fish\n",
+      "4. Small Mammals\n",
+      "5. Birds\n"
+     ]
+    }
+   ],
+   "source": [
+    "last_element = None\n",
+    "async for last_element in streamed_response:\n",
+    "    pass\n",
+    "\n",
+    "print(last_element)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Streaming Query Engine Responses\n",
+    "\n",
+    "Let's look at a slightly more involved example using a query engine!\n",
+    "\n",
+    "We'll start by loading some data (we'll be using the [Hitchhiker's Guide to the Galaxy](https://web.eecs.utk.edu/~hqi/deeplearning/project/hhgttg.txt))."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Loading Data\n",
+    "\n",
+    "Let's first create a directory where our data can live."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p 'data/hhgttg'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll download our data from the above source."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2024-04-01 14:39:38--  https://web.eecs.utk.edu/~hqi/deeplearning/project/hhgttg.txt\n",
+      "Resolving web.eecs.utk.edu (web.eecs.utk.edu)... 160.36.127.165\n",
+      "Connecting to web.eecs.utk.edu (web.eecs.utk.edu)|160.36.127.165|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 1534289 (1.5M) [text/plain]\n",
+      "Saving to: ‘data/hhgttg/hhgttg.txt’\n",
+      "\n",
+      "data/hhgttg/hhgttg. 100%[===================>]   1.46M  6.75MB/s    in 0.2s    \n",
+      "\n",
+      "2024-04-01 14:39:39 (6.75 MB/s) - ‘data/hhgttg/hhgttg.txt’ saved [1534289/1534289]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wget 'https://web.eecs.utk.edu/~hqi/deeplearning/project/hhgttg.txt' -O 'data/hhgttg/hhgttg.txt'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll need to have an embedding model for this step! We'll use OpenAI's `text-embedding-03-small` model to achieve this, and save it in our `Settings`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.embeddings.openai import OpenAIEmbedding\n",
+    "\n",
+    "openai_embedding = OpenAIEmbedding(model=\"text-embedding-3-small\")\n",
+    "\n",
+    "Settings.embed_model = openai_embedding"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can load our document and create an index leveraging the above created `OpenAIEmbedding()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
+    "\n",
+    "documents = SimpleDirectoryReader(\"data/hhgttg\").load_data()\n",
+    "index = VectorStoreIndex.from_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can create a simple query engine and set our `streaming` parameter to `True`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streaming_qe = index.as_query_engine(streaming=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's send a query to our query engine, and then stream the response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streaming_response = streaming_qe.query(\n",
+    "    \"What is the significance of the number 42?\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The significance of the number 42 is a central theme in \"The Hitchhiker's Guide to the Galaxy\" by Douglas Adams. The book is a comedic science fiction satire that follows the adventures of two intergalactic travelers, Arthur Dent and Ford Prefect, as they try to escape the destruction of Earth and uncover the true meaning of the number 42.\n",
+      "\n",
+      "Throughout the book, the number 42 is presented as the ultimate answer to the ultimate question of life, the universe, and everything. The question itself is never explicitly stated, but it is implied to be a deeply profound and existential one that has been sought after by philosophers, scientists, and thinkers throughout history.\n",
+      "\n",
+      "The idea of the number 42 as the ultimate answer is a playful jab at the idea of seeking ultimate knowledge and understanding, which is often seen as an impossible task. The number 42 is also a reference to the famous \"42\" answer in the \"The Hitchhiker's Guide to the Galaxy\" by Douglas Adams, which is a comedic science fiction satire that follows the adventures of two intergalactic travelers, Arthur Dent and Ford Prefect, as they try to escape the destruction of Earth and uncover the true meaning of the number 42.\n",
+      "\n",
+      "In the book, the supercomputer Deep Thought is asked to find the answer to the ultimate question, and after billions of years of computation, it determines that the answer is 42. The answer is so profound that it causes Deep Thought to become obsolete, as it is no longer needed to answer questions.\n",
+      "\n",
+      "The significance of the number 42 in \"The Hitchhiker's Guide to the Galaxy\" is a commentary on the nature of knowledge and the quest for ultimate understanding. It is a reminder that there are limits to what can be known and that the pursuit of knowledge should be done with a sense of humor and a willingness to accept the unknown."
+     ]
+    }
+   ],
+   "source": [
+    "streaming_response.print_response_stream()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Connecting to local NIMs\n",
+    "\n",
+    "In addition to connecting to hosted [NVIDIA NIMs](https://ai.nvidia.com), this connector can be used to connect to local microservice instances. This helps you take your applications local when necessary.\n",
+    "\n",
+    "For instructions on how to setup local microservice instances, see https://developer.nvidia.com/blog/nvidia-nim-offers-optimized-inference-microservices-for-deploying-ai-models-at-scale/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.nvidia import NVIDIA\n",
+    "\n",
+    "llm = NVIDIA(model=\"...\").mode(\"nim\", base_url=\"https://localhost.../v1\")\n",
+    "llm.available_models"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "nvidia-llama-index-playground-connector",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/.gitignore b/llama-index-integrations/llms/llama-index-llms-nvidia/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..990c18de229088f55c6c514fd0f2d49981d1b0e7
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/.gitignore
@@ -0,0 +1,153 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/BUILD b/llama-index-integrations/llms/llama-index-llms-nvidia/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0896ca890d8bffd60a44fa824f8d57fecd73ee53
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/BUILD
@@ -0,0 +1,3 @@
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/Makefile b/llama-index-integrations/llms/llama-index-llms-nvidia/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..b9eab05aa370629a4a3de75df3ff64cd53887b68
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/README.md b/llama-index-integrations/llms/llama-index-llms-nvidia/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5da782f53cb31009ae608b21b66ea00439437c32
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/README.md
@@ -0,0 +1,37 @@
+# NVIDIA's LLM connector
+
+Install the connector,
+
+```shell
+pip install llama-index-llms-nvidia
+```
+
+With this connector, you'll be able to connect to and generate from compatible models available as hosted [NVIDIA NIMs](https://ai.nvidia.com), such as:
+
+- Google's [gemma-7b](https://build.nvidia.com/google/gemma-7b)
+- Mistral AI's [mistral-7b-instruct-v0.2](https://build.nvidia.com/mistralai/mistral-7b-instruct-v2)
+- And more!
+
+_First_, get a free API key. Go to https://build.nvidia.com, select a model, click "Get API Key".
+Store this key in your environment as `NVIDIA_API_KEY`.
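+
+For example, in a POSIX shell (the key value below is a placeholder):
+
+```shell
+export NVIDIA_API_KEY="nvapi-..."
+```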
+
+_Then_, try it out.
+
+```python
+from llama_index.llms.nvidia import NVIDIA
+from llama_index.core.llms import ChatMessage, MessageRole
+
+llm = NVIDIA()
+
+messages = [
+    ChatMessage(
+        role=MessageRole.SYSTEM, content=("You are a helpful assistant.")
+    ),
+    ChatMessage(
+        role=MessageRole.USER,
+        content=("What are the most popular house pets in North America?"),
+    ),
+]
+
+llm.chat(messages)
+```
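+
+You can also stream the response. A minimal sketch, reusing the `messages` list from above:
+
+```python
+for chunk in llm.stream_chat(messages):
+    print(chunk.delta, end="")
+```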
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/BUILD b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/__init__.py b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac86372297a12a65ad0f3c056d8bcf078910a31
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/__init__.py
@@ -0,0 +1,3 @@
+from llama_index.llms.nvidia.base import NVIDIA
+
+__all__ = ["NVIDIA"]
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..173e92d921d65cee663960b9c6bb5ddac4e8fbc8
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py
@@ -0,0 +1,107 @@
+from typing import (
+    Any,
+    Optional,
+    List,
+    Literal,
+)
+
+from llama_index.core.bridge.pydantic import PrivateAttr, BaseModel
+from llama_index.core.base.llms.generic_utils import (
+    get_from_param_or_env,
+)
+
+from llama_index.llms.nvidia.utils import API_CATALOG_MODELS
+
+from llama_index.llms.openai_like import OpenAILike
+
+DEFAULT_MODEL = "meta/llama3-8b-instruct"
+BASE_URL = "https://integrate.api.nvidia.com/v1/"
+
+
+class Model(BaseModel):
+    id: str
+
+
+class NVIDIA(OpenAILike):
+    """NVIDIA's API Catalog Connector."""
+
+    _mode: str = PrivateAttr("nvidia")
+
+    def __init__(
+        self,
+        model: str = DEFAULT_MODEL,
+        nvidia_api_key: Optional[str] = None,
+        api_key: Optional[str] = None,
+        **kwargs: Any,
+    ) -> None:
+        api_key = get_from_param_or_env(
+            "api_key",
+            nvidia_api_key or api_key,
+            "NVIDIA_API_KEY",
+            "NO_API_KEY_PROVIDED",
+        )
+
+        super().__init__(
+            model=model,
+            api_key=api_key,
+            api_base=BASE_URL,
+            is_chat_model=True,
+            default_headers={"User-Agent": "llama-index-llms-nvidia"},
+            **kwargs,
+        )
+
+    @property
+    def available_models(self) -> List[Model]:
+        """Get the models available to this connector in its current mode."""
+        ids = API_CATALOG_MODELS.keys()
+        if self._mode == "nim":
+            ids = [model.id for model in self._get_client().models.list()]
+        return [Model(id=name) for name in ids]
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "NVIDIA"
+
+    def mode(
+        self,
+        mode: Optional[Literal["nvidia", "nim"]] = "nvidia",
+        *,
+        base_url: Optional[str] = None,
+        model: Optional[str] = None,
+        api_key: Optional[str] = None,
+    ) -> "NVIDIA":
+        """
+        Change the mode.
+
+        There are two modes, "nvidia" and "nim". The "nvidia" mode is the default
+        mode and is used to interact with hosted NIMs. The "nim" mode is used to
+        interact with NVIDIA NIM endpoints, which are typically hosted on-premises.
+
+        For the "nvidia" mode, the "api_key" parameter is available to specify
+        your API key. If not specified, the NVIDIA_API_KEY environment variable
+        will be used.
+
+        For the "nim" mode, the "base_url" parameter is required and the "model"
+        parameter may be necessary. Set base_url to the url of your local NIM
+        endpoint. For instance, "https://localhost:9999/v1". Additionally, the
+        "model" parameter must be set to the name of the model inside the NIM.
+        """
+        if mode == "nim":
+            if not base_url:
+                raise ValueError("base_url is required for nim mode")
+        if mode == "nvidia":
+            api_key = get_from_param_or_env(
+                "api_key",
+                api_key,
+                "NVIDIA_API_KEY",
+            )
+            base_url = base_url or BASE_URL
+
+        self._mode = mode
+        if base_url:
+            self.api_base = base_url
+        if model:
+            self.model = model
+        if api_key:
+            self.api_key = api_key
+
+        return self
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/utils.py b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3897347eb71e46c49199eafb5703dc2d1cac392a
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/utils.py
@@ -0,0 +1,19 @@
+from typing import Dict, Optional
+
+API_CATALOG_MODELS: Dict[str, int] = {
+    "mistralai/mistral-7b-instruct-v0.2": 16384,
+    "mistralai/mixtral-8x7b-instruct-v0.1": 16384,
+    "mistralai/mixtral-8x22b-instruct-v0.1": 32768,
+    "mistralai/mistral-large": 16384,
+    "google/gemma-7b": 4096,
+    "google/gemma-2b": 4096,
+    "google/codegemma-7b": 4096,
+    "meta/llama2-70b": 1024,
+    "meta/codellama-70b": 1024,
+    "meta/llama3-8b-instruct": 6000,
+    "meta/llama3-70b-instruct": 6000,
+}
+
+
+def catalog_modelname_to_contextsize(modelname: str) -> Optional[int]:
+    """Return the context window size (in tokens) for a known API Catalog model, or None."""
+    return API_CATALOG_MODELS.get(modelname, None)
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..a90fb7fcf040ac414f02e69b895e0abc9e4eebb1
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml
@@ -0,0 +1,63 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+# Feel free to un-skip examples, and experimental, you will just need to
+# work through many typos (--write-changes and --interactive will help)
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.llamahub]
+contains_example = false
+import_path = "llama_index.llms.nvidia"
+
+[tool.llamahub.class_authors]
+NVIDIA = "llama-index"
+
+[tool.mypy]
+disallow_untyped_defs = true
+# Remove venv skip when integrated with pre-commit
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = "3.8"
+
+[tool.poetry]
+authors = ["Chris Alexiuk <calexiuk@nvidia.com>"]
+description = "llama-index llms nvidia api catalog integration"
+license = "MIT"
+name = "llama-index-llms-nvidia"
+packages = [{include = "llama_index/"}]
+readme = "README.md"
+version = "0.1.0"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+llama-index-core = "^0.10.0"
+llama-index-llms-openai = "^0.1.17"
+llama-index-llms-openai-like = "^0.1.3"
+
+[tool.poetry.group.dev.dependencies]
+black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
+codespell = {extras = ["toml"], version = ">=v2.2.6"}
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "7.2.1"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"  # TODO: unpin when mypy>0.991
+types-setuptools = "67.1.0.0"
+
+[tool.pytest.ini_options]
+markers = [
+    "integration: mark test as an integration test",
+]
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/BUILD b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6801b41d5f2ede52534a7befe17ee2d9eee18ae9
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/BUILD
@@ -0,0 +1,5 @@
+python_test_utils(
+    name="test_utils",
+)
+
+python_tests()
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/conftest.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..2378be82a59c88c5fbe8afe8a4b77b197172f2b1
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/conftest.py
@@ -0,0 +1,76 @@
+import pytest
+import os
+
+from llama_index.llms.nvidia import NVIDIA
+from llama_index.llms.nvidia.base import DEFAULT_MODEL
+
+from typing import Generator
+
+
+# This fixture masks the NVIDIA_API_KEY environment variable and restores it
+# after the test. It also yields the value NVIDIA_API_KEY held before it was
+# masked, so that the test can still use it.
+@pytest.fixture()
+def masked_env_var() -> Generator[str, None, None]:
+    var = "NVIDIA_API_KEY"
+    try:
+        if val := os.environ.get(var, None):
+            del os.environ[var]
+        yield val
+    finally:
+        if val:
+            os.environ[var] = val
+
+
+def pytest_collection_modifyitems(config, items):
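+    # Skip integration tests unless NVIDIA_API_KEY is set or --nim-endpoint is supplied.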
+    if "NVIDIA_API_KEY" not in os.environ:
+        skip_marker = pytest.mark.skip(
+            reason="requires NVIDIA_API_KEY environment variable or --nim-endpoint option"
+        )
+        for item in items:
+            if "integration" in item.keywords and not config.getoption(
+                "--nim-endpoint"
+            ):
+                item.add_marker(skip_marker)
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    parser.addoption(
+        "--all-models",
+        action="store_true",
+        help="Run tests across all models",
+    )
+    parser.addoption(
+        "--model-id",
+        action="store",
+        help="Run tests for a specific chat model",
+    )
+    parser.addoption(
+        "--nim-endpoint",
+        type=str,
+        help="Run tests using NIM mode",
+    )
+
+
+def get_mode(config: pytest.Config) -> dict:
+    nim_endpoint = config.getoption("--nim-endpoint")
+    if nim_endpoint:
+        return {"mode": "nim", "base_url": nim_endpoint}
+    return {}
+
+
+def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
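+    # Parametrize the `chat_model` fixture: default to DEFAULT_MODEL, use
+    # --model-id for a single model, or --all-models for every available model.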
+    mode = get_mode(metafunc.config)
+
+    if "chat_model" in metafunc.fixturenames:
+        models = [DEFAULT_MODEL]
+        if model := metafunc.config.getoption("--model-id"):
+            models = [model]
+        elif metafunc.config.getoption("--all-models"):
+            models = [model.id for model in NVIDIA().mode(**mode).available_models]
+        metafunc.parametrize("chat_model", models, ids=models)
+
+
+@pytest.fixture()
+def mode(request: pytest.FixtureRequest) -> dict:
+    return get_mode(request.config)
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_additional_kwargs.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_additional_kwargs.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6b82deefdae7c3174a20365595e060316a177de
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_additional_kwargs.py
@@ -0,0 +1,39 @@
+import pytest
+
+from llama_index.llms.nvidia import NVIDIA
+
+
+@pytest.mark.integration()
+def test_additional_kwargs_success(chat_model: str, mode: dict) -> None:
+    client = NVIDIA(chat_model).mode(**mode)
+    assert client.complete(
+        "Hello, world!",
+        stop=["cat", "Cats"],
+        seed=42,
+        frequency_penalty=0.5,
+        presence_penalty=0.5,
+    ).text
+
+
+@pytest.mark.integration()
+def test_additional_kwargs_wrong_dtype(chat_model: str, mode: dict) -> None:
+    client = NVIDIA(chat_model).mode(**mode)
+    with pytest.raises(Exception) as exc_info:
+        client.complete(
+            "Hello, world!",
+            frequency_penalty="fish",
+        ).text
+    message = str(exc_info.value)
+    assert "400" in message
+
+
+@pytest.mark.integration()
+def test_additional_kwargs_unexpected_keyword(chat_model: str, mode: dict) -> None:
+    client = NVIDIA(chat_model).mode(**mode)
+    with pytest.raises(Exception) as exc_info:
+        client.complete(
+            "Hello, world!",
+            cats="cats",
+        ).text
+    message = str(exc_info.value)
+    assert "unexpected keyword" in message
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_api_key.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_api_key.py
new file mode 100644
index 0000000000000000000000000000000000000000..b96a7f66c39d4c6f4e81ffe928dcc71814400234
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_api_key.py
@@ -0,0 +1,58 @@
+import os
+
+import pytest
+
+from llama_index.llms.nvidia import NVIDIA
+
+from typing import Any
+
+
+def get_api_key(instance: Any) -> str:
+    return instance.api_key
+
+
+def test_create_without_api_key(masked_env_var: str) -> None:
+    NVIDIA()
+
+
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_create_with_api_key(param: str, masked_env_var: str) -> None:
+    instance = NVIDIA(**{param: "just testing no failure"})
+    assert get_api_key(instance) == "just testing no failure"
+
+
+def test_api_key_priority(masked_env_var: str) -> None:
+    try:
+        os.environ["NVIDIA_API_KEY"] = "ENV"
+        assert get_api_key(NVIDIA()) == "ENV"
+        assert get_api_key(NVIDIA(nvidia_api_key="PARAM")) == "PARAM"
+        assert get_api_key(NVIDIA(api_key="PARAM")) == "PARAM"
+        assert get_api_key(NVIDIA(api_key="LOW", nvidia_api_key="HIGH")) == "HIGH"
+    finally:
+        # we must clean up environ or it may impact other tests
+        del os.environ["NVIDIA_API_KEY"]
+
+
+@pytest.mark.integration()
+def test_missing_api_key_error(masked_env_var: str) -> None:
+    client = NVIDIA()
+    with pytest.raises(Exception) as exc_info:
+        client.complete("Hello, world!").text
+    message = str(exc_info.value)
+    assert "401" in message
+
+
+@pytest.mark.integration()
+def test_bogus_api_key_error(masked_env_var: str) -> None:
+    client = NVIDIA(nvidia_api_key="BOGUS")
+    with pytest.raises(Exception) as exc_info:
+        client.complete("Hello, world!").text
+    message = str(exc_info.value)
+    assert "401" in message
+
+
+@pytest.mark.integration()
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_api_key(param: str, masked_env_var: str) -> None:
+    client = NVIDIA(**{param: masked_env_var})
+    assert client.complete("Hello, world!").text
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_available_models.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_available_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..01cdd7f7458ce7126728170687858e2735aea6c2
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_available_models.py
@@ -0,0 +1,11 @@
+import pytest
+
+from llama_index.llms.nvidia import NVIDIA
+
+
+@pytest.mark.integration()
+def test_available_models(mode: dict) -> None:
+    models = NVIDIA().mode(**mode).available_models
+    assert models
+    assert isinstance(models, list)
+    assert all(isinstance(model.id, str) for model in models)
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_integration.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_integration.py
new file mode 100644
index 0000000000000000000000000000000000000000..6915a796816051dd6e6194cb0add74230b3020d7
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_integration.py
@@ -0,0 +1,75 @@
+import pytest
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    CompletionResponse,
+)
+from llama_index.llms.nvidia import NVIDIA
+
+
+@pytest.mark.integration()
+def test_chat(chat_model: str, mode: dict) -> None:
+    message = ChatMessage(content="Hello")
+    response = NVIDIA(model=chat_model).mode(**mode).chat([message])
+    assert isinstance(response, ChatResponse)
+    assert isinstance(response.message, ChatMessage)
+    assert isinstance(response.message.content, str)
+
+
+@pytest.mark.integration()
+def test_complete(chat_model: str, mode: dict) -> None:
+    response = NVIDIA(model=chat_model).mode(**mode).complete("Hello")
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.integration()
+def test_stream_chat(chat_model: str, mode: dict) -> None:
+    message = ChatMessage(content="Hello")
+    gen = NVIDIA(model=chat_model).mode(**mode).stream_chat([message])
+    responses = list(gen)
+    assert all(isinstance(response, ChatResponse) for response in responses)
+    assert all(isinstance(response.delta, str) for response in responses)
+
+
+@pytest.mark.integration()
+def test_stream_complete(chat_model: str, mode: dict) -> None:
+    gen = NVIDIA(model=chat_model).mode(**mode).stream_complete("Hello")
+    responses = list(gen)
+    assert all(isinstance(response, CompletionResponse) for response in responses)
+    assert all(isinstance(response.delta, str) for response in responses)
+
+
+@pytest.mark.integration()
+@pytest.mark.asyncio()
+async def test_achat(chat_model: str, mode: dict) -> None:
+    message = ChatMessage(content="Hello")
+    response = await NVIDIA(model=chat_model).mode(**mode).achat([message])
+    assert isinstance(response, ChatResponse)
+    assert isinstance(response.message, ChatMessage)
+    assert isinstance(response.message.content, str)
+
+
+@pytest.mark.integration()
+@pytest.mark.asyncio()
+async def test_acomplete(chat_model: str, mode: dict) -> None:
+    response = await NVIDIA(model=chat_model).mode(**mode).acomplete("Hello")
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.integration()
+@pytest.mark.asyncio()
+async def test_astream_chat(chat_model: str, mode: dict) -> None:
+    message = ChatMessage(content="Hello")
+    gen = await NVIDIA(model=chat_model).mode(**mode).astream_chat([message])
+    responses = [response async for response in gen]
+    assert all(isinstance(response, ChatResponse) for response in responses)
+    assert all(isinstance(response.delta, str) for response in responses)
+
+
+@pytest.mark.integration()
+@pytest.mark.asyncio()
+async def test_astream_complete(chat_model: str, mode: dict) -> None:
+    gen = await NVIDIA(model=chat_model).mode(**mode).astream_complete("Hello")
+    responses = [response async for response in gen]
+    assert all(isinstance(response, CompletionResponse) for response in responses)
+    assert all(isinstance(response.delta, str) for response in responses)
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_mode_switch.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_mode_switch.py
new file mode 100644
index 0000000000000000000000000000000000000000..887a12994293732cc9afebad7997a80d567a0d8e
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_mode_switch.py
@@ -0,0 +1,37 @@
+import pytest
+
+from llama_index.llms.nvidia import NVIDIA as Interface
+from llama_index.llms.nvidia.base import BASE_URL
+
+
+def test_mode_switch_nvidia_throws_without_key(masked_env_var: str):
+    x = Interface()
+    with pytest.raises(ValueError):
+        x.mode("nvidia")
+
+
+def test_mode_switch_nvidia_with_key(masked_env_var: str):
+    Interface().mode("nvidia", api_key="test")
+
+
+def test_mode_switch_nim_throws_without_url():
+    instance = Interface()
+    with pytest.raises(ValueError):
+        instance.mode("nim")
+
+
+def test_mode_switch_nim_with_url():
+    Interface().mode("nim", base_url="test")
+
+
+def test_mode_switch_param_setting():
+    instance = Interface(model="dummy")
+
+    instance1 = instance.mode("nim", base_url="https://test_url/v1/")
+    assert instance1.model == "dummy"
+    assert str(instance1.api_base) == "https://test_url/v1/"
+
+    instance2 = instance1.mode("nvidia", api_key="test", model="dummy-2")
+    assert instance2.model == "dummy-2"
+    assert str(instance2.api_base) == BASE_URL
+    assert instance2.api_key == "test"
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_nvidia.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_nvidia.py
new file mode 100644
index 0000000000000000000000000000000000000000..131407f98f798b3008b6af25b3c640efa7a18934
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_nvidia.py
@@ -0,0 +1,225 @@
+import os
+from typing import Any, AsyncGenerator, Generator, Optional
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from llama_index.core.base.llms.types import ChatMessage, LLMMetadata
+from llama_index.llms.nvidia import NVIDIA
+
+from openai.types.chat.chat_completion import (
+    ChatCompletion,
+    ChatCompletionMessage,
+    Choice,
+    ChoiceLogprobs,
+)
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk, ChoiceDelta
+from openai.types.chat.chat_completion_chunk import Choice as ChunkChoice
+from openai.types.completion import CompletionUsage
+
+
+class CachedNVIDIApiKeys:
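+    """Context manager that temporarily overrides NVIDIA_API_KEY and restores it on exit."""
+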
+    def __init__(self, set_env_key_to: Optional[str] = "", set_fake_key: bool = False):
+        self.set_env_key_to = set_env_key_to
+        self.set_fake_key = set_fake_key
+
+    def __enter__(self) -> None:
+        self.api_env_was = os.environ.get("NVIDIA_API_KEY", "")
+        os.environ["NVIDIA_API_KEY"] = self.set_env_key_to
+
+        if self.set_fake_key:
+            os.environ["NVIDIA_API_KEY"] = "nvai-" + "x" * 9 + "-" + "x" * 54
+
+    def __exit__(self, *exc: object) -> None:
+        if self.api_env_was == "":
+            del os.environ["NVIDIA_API_KEY"]
+        else:
+            os.environ["NVIDIA_API_KEY"] = self.api_env_was
+
+
+def mock_chat_completion_v1(*args: Any, **kwargs: Any) -> ChatCompletion:
+    return ChatCompletion(
+        id="chatcmpl-4162e407-e121-42b4-8590-1c173380be7d",
+        object="chat.completion",
+        created=1713474384,
+        model="mistralai/mistral-7b-instruct-v0.2",
+        usage=CompletionUsage(
+            completion_tokens=304, prompt_tokens=11, total_tokens=315
+        ),
+        choices=[
+            Choice(
+                finish_reason="stop",
+                index=0,
+                logprobs=ChoiceLogprobs(
+                    content=None,
+                    text_offset=[],
+                    token_logprobs=[0.0, 0.0],
+                    tokens=[],
+                    top_logprobs=[],
+                ),
+                message=ChatCompletionMessage(
+                    content="Cool Test Message",
+                    role="assistant",
+                    function_call=None,
+                    tool_calls=None,
+                ),
+            )
+        ],
+    )
+
+
+async def mock_async_chat_completion_v1(*args: Any, **kwargs: Any) -> ChatCompletion:
+    return mock_chat_completion_v1(*args, **kwargs)
+
+
+def mock_chat_completion_stream_v1(
+    *args: Any, **kwargs: Any
+) -> Generator[ChatCompletionChunk, None, None]:
+    responses = [
+        ChatCompletionChunk(
+            id="chatcmpl-998d9b96-0b71-41f5-b910-dd3bc00f38c6",
+            object="chat.completion.chunk",
+            created=1713474736,
+            model="google/gemma-7b",
+            choices=[
+                ChunkChoice(
+                    finish_reason="stop",
+                    index=0,
+                    delta=ChoiceDelta(
+                        content="Test",
+                        function_call=None,
+                        role="assistant",
+                        tool_calls=None,
+                    ),
+                )
+            ],
+        ),
+        ChatCompletionChunk(
+            id="chatcmpl-998d9b96-0b71-41f5-b910-dd3bc00f38c6",
+            object="chat.completion.chunk",
+            created=1713474736,
+            model="google/gemma-7b",
+            choices=[
+                ChunkChoice(
+                    finish_reason="stop",
+                    index=0,
+                    delta=ChoiceDelta(
+                        content="Second Test",
+                        function_call=None,
+                        role="assistant",
+                        tool_calls=None,
+                    ),
+                )
+            ],
+        ),
+    ]
+
+    yield from responses
+
+
+async def mock_async_chat_completion_stream_v1(
+    *args: Any, **kwargs: Any
+) -> AsyncGenerator[ChatCompletionChunk, None]:
+    async def gen() -> AsyncGenerator[ChatCompletionChunk, None]:
+        for response in mock_chat_completion_stream_v1(*args, **kwargs):
+            yield response
+
+    return gen()
+
+
+@patch("llama_index.llms.openai.base.SyncOpenAI")
+def test_chat_model_basic(MockSyncOpenAI: MagicMock) -> None:
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockSyncOpenAI.return_value
+        mock_instance.chat.completions.create.return_value = mock_chat_completion_v1()
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response = llm.complete(prompt)
+        assert response.text == "Cool Test Message"
+
+        chat_response = llm.chat([message])
+        assert chat_response.message.content == "Cool Test Message"
+
+
+@patch("llama_index.llms.openai.base.SyncOpenAI")
+def test_chat_model_streaming(MockSyncOpenAI: MagicMock) -> None:
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockSyncOpenAI.return_value
+        mock_instance.chat.completions.create.return_value = (
+            mock_chat_completion_stream_v1()
+        )
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response_gen = llm.stream_complete(prompt)
+        responses = list(response_gen)
+        assert responses[-1].text == "TestSecond Test"
+
+        mock_instance.chat.completions.create.return_value = (
+            mock_chat_completion_stream_v1()
+        )
+
+        chat_response_gen = llm.stream_chat([message])
+        chat_responses = list(chat_response_gen)
+        assert chat_responses[-1].message.content == "TestSecond Test"
+        assert chat_responses[-1].message.role == "assistant"
+
+
+@pytest.mark.asyncio()
+@patch("llama_index.llms.openai.base.AsyncOpenAI")
+async def test_async_chat_model_basic(MockAsyncOpenAI: MagicMock) -> None:
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockAsyncOpenAI.return_value
+        create_fn = AsyncMock()
+        create_fn.side_effect = mock_async_chat_completion_v1
+        mock_instance.chat.completions.create = create_fn
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response = await llm.acomplete(prompt)
+        assert response.text == "Cool Test Message"
+
+        chat_response = await llm.achat([message])
+        assert chat_response.message.content == "Cool Test Message"
+
+
+@pytest.mark.asyncio()
+@patch("llama_index.llms.openai.base.AsyncOpenAI")
+async def test_async_streaming_chat_model(MockAsyncOpenAI: MagicMock) -> None:
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockAsyncOpenAI.return_value
+        create_fn = AsyncMock()
+        create_fn.side_effect = mock_async_chat_completion_stream_v1
+        mock_instance.chat.completions.create = create_fn
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response_gen = await llm.astream_complete(prompt)
+        responses = [response async for response in response_gen]
+        assert responses[-1].text == "TestSecond Test"
+
+        chat_response_gen = await llm.astream_chat([message])
+        chat_responses = [response async for response in chat_response_gen]
+        assert chat_responses[-1].message.content == "TestSecond Test"
+
+
+def test_validates_api_key_is_present() -> None:
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        assert NVIDIA()
+
+        os.environ["NVIDIA_API_KEY"] = ""
+
+        assert NVIDIA(api_key="nvai-" + "x" * 9 + "-" + "x" * 54)
+
+
+def test_metadata() -> None:
+    assert isinstance(NVIDIA().metadata, LLMMetadata)
diff --git a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py
index dec4820bbd5278bcc6d11d13a9fb868a49a5afb5..8c26a92b0eb4362d6c538e14506a06e5dfd0e23a 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py
+++ b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py
@@ -269,14 +269,16 @@ def from_openai_token_logprob(
     openai_token_logprob: ChatCompletionTokenLogprob,
 ) -> List[LogProb]:
     """Convert a single openai token logprob to generic list of logprobs."""
-    try:
-        result = [
-            LogProb(token=el.token, logprob=el.logprob, bytes=el.bytes or [])
-            for el in openai_token_logprob.top_logprobs
-        ]
-    except Exception as e:
-        print(openai_token_logprob)
-        raise
+    result = []
+    if openai_token_logprob.top_logprobs:
+        try:
+            result = [
+                LogProb(token=el.token, logprob=el.logprob, bytes=el.bytes or [])
+                for el in openai_token_logprob.top_logprobs
+            ]
+        except Exception as e:
+            print(openai_token_logprob)
+            raise
     return result
 
 
@@ -284,10 +286,11 @@ def from_openai_token_logprobs(
     openai_token_logprobs: Sequence[ChatCompletionTokenLogprob],
 ) -> List[List[LogProb]]:
     """Convert openai token logprobs to generic list of LogProb."""
-    return [
-        from_openai_token_logprob(token_logprob)
-        for token_logprob in openai_token_logprobs
-    ]
+    result = []
+    for token_logprob in openai_token_logprobs:
+        if logprobs := from_openai_token_logprob(token_logprob):
+            result.append(logprobs)
+    return result
 
 
 def from_openai_completion_logprob(
@@ -304,10 +307,13 @@ def from_openai_completion_logprobs(
     openai_completion_logprobs: Logprobs,
 ) -> List[List[LogProb]]:
     """Convert openai completion logprobs to generic list of LogProb."""
-    return [
-        from_openai_completion_logprob(completion_logprob)
-        for completion_logprob in openai_completion_logprobs.top_logprobs
-    ]
+    result = []
+    if openai_completion_logprobs.top_logprobs:
+        result = [
+            from_openai_completion_logprob(completion_logprob)
+            for completion_logprob in openai_completion_logprobs.top_logprobs
+        ]
+    return result
 
 
 def from_openai_completion(openai_completion: Completion) -> CompletionResponse:
diff --git a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
index a641b3b9bd0d86f5990beeebc6ba56bc775362b5..bdbd0f82a9c28aa183db2fc8fc459fade69641c8 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
@@ -29,7 +29,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-openai"
 readme = "README.md"
-version = "0.1.16"
+version = "0.1.17"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
diff --git a/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai_utils.py b/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai_utils.py
index 89c82ea177f281e7f7feee759a9ec94238f0a3ed..71dbae36725dc5b159cee6e364e14d96c6e0162b 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai_utils.py
+++ b/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai_utils.py
@@ -1,7 +1,9 @@
 import pytest
 from typing import List
 
-from llama_index.core.base.llms.types import ChatMessage, MessageRole
+from llama_index.core.base.llms.types import ChatMessage, MessageRole, LogProb
+from openai.types.chat.chat_completion_token_logprob import ChatCompletionTokenLogprob
+from openai.types.completion_choice import Logprobs
 from llama_index.core.bridge.pydantic import BaseModel
 from llama_index.llms.openai.utils import (
     from_openai_message_dicts,
@@ -10,6 +12,12 @@ from llama_index.llms.openai.utils import (
     to_openai_tool,
 )
 
+from llama_index.llms.openai.utils import (
+    from_openai_completion_logprobs,
+    from_openai_token_logprob,
+    from_openai_token_logprobs,
+)
+
 
 from openai.types.chat.chat_completion_assistant_message_param import (
     FunctionCall as FunctionCallParam,
@@ -223,3 +231,23 @@ def test_to_openai_message_with_pydantic_description() -> None:
             "parameters": TestOutput.schema(),
         },
     }
+
+
+def test_from_openai_token_logprob_none_top_logprob() -> None:
+    logprob = ChatCompletionTokenLogprob(token="", logprob=1.0, top_logprobs=[])
+    logprob.top_logprobs = None
+    result: List[LogProb] = from_openai_token_logprob(logprob)
+    assert isinstance(result, list)
+
+
+def test_from_openai_token_logprobs_none_top_logprobs() -> None:
+    logprob = ChatCompletionTokenLogprob(token="", logprob=1.0, top_logprobs=[])
+    logprob.top_logprobs = None
+    result: List[LogProb] = from_openai_token_logprobs([logprob])
+    assert isinstance(result, list)
+
+
+def test_from_openai_completion_logprobs_none_top_logprobs() -> None:
+    logprobs = Logprobs(top_logprobs=None)
+    result = from_openai_completion_logprobs(logprobs)
+    assert isinstance(result, list)