diff --git a/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb b/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..5753e690097f05c78c3cb2c05d3263cc42779eec
--- /dev/null
+++ b/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb
@@ -0,0 +1,523 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
+    "\n",
+    "# Multi-Modal LLM using NVIDIA endpoints for image reasoning\n",
+    "\n",
+    "In this notebook, we show how to use NVIDIA MultiModal LLM class/abstraction for image understanding/reasoning.\n",
+    "\n",
+    "We also show several functions we are now supporting for NVIDIA LLM:\n",
+    "* `complete` (both sync and async): for a single prompt and list of images\n",
+    "* `stream complete` (both sync and async): for steaming output of complete"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install --upgrade --quiet llama-index-multi-modal-llms-nvidia llama-index-embeddings-nvidia llama-index-readers-file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "# del os.environ['NVIDIA_API_KEY']  ## delete key and reset\n",
+    "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n",
+    "    print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n",
+    "else:\n",
+    "    nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n",
+    "    assert nvapi_key.startswith(\n",
+    "        \"nvapi-\"\n",
+    "    ), f\"{nvapi_key[:5]}... is not a valid key\"\n",
+    "    os.environ[\"NVIDIA_API_KEY\"] = nvapi_key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal\n",
+    "import base64\n",
+    "from llama_index.core.schema import ImageDocument\n",
+    "from PIL import Image\n",
+    "import requests\n",
+    "from io import BytesIO\n",
+    "\n",
+    "# import matplotlib.pyplot as plt\n",
+    "from llama_index.core.multi_modal_llms.generic_utils import load_image_urls\n",
+    "\n",
+    "llm = NVIDIAMultiModal()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialize `OpenAIMultiModal` and Load Images from URLs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_urls = [\n",
+    "    \"https://res.cloudinary.com/hello-tickets/image/upload/c_limit,f_auto,q_auto,w_1920/v1640835927/o3pfl41q7m5bj8jardk0.jpg\",\n",
+    "    \"https://www.visualcapitalist.com/wp-content/uploads/2023/10/US_Mortgage_Rate_Surge-Sept-11-1.jpg\",\n",
+    "    \"https://www.sportsnet.ca/wp-content/uploads/2023/11/CP1688996471-1040x572.jpg\",\n",
+    "    # Add yours here!\n",
+    "]\n",
+    "\n",
+    "img_response = requests.get(image_urls[0])\n",
+    "img = Image.open(BytesIO(img_response.content))\n",
+    "# plt.imshow(img)\n",
+    "\n",
+    "image_url_documents = load_image_urls(image_urls)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Complete a prompt with a bunch of images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = llm.complete(\n",
+    "    prompt=f\"What is this image?\",\n",
+    "    image_documents=image_url_documents,\n",
+    ")\n",
+    "\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "await llm.acomplete(\n",
+    "    prompt=\"tell me about this image\",\n",
+    "    image_documents=image_url_documents,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Steam Complete a prompt with a bunch of images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream_complete_response = llm.stream_complete(\n",
+    "    prompt=f\"What is this image?\",\n",
+    "    image_documents=image_url_documents,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for r in stream_complete_response:\n",
+    "    print(r.text, end=\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream_complete_response = await llm.astream_complete(\n",
+    "    prompt=f\"What is this image?\",\n",
+    "    image_documents=image_url_documents,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "last_element = None\n",
+    "async for last_element in stream_complete_response:\n",
+    "    pass\n",
+    "\n",
+    "print(last_element)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Passing an image as a base64 encoded string"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imgr_content = base64.b64encode(\n",
+    "    requests.get(\n",
+    "        \"https://helloartsy.com/wp-content/uploads/kids/cats/how-to-draw-a-small-cat/how-to-draw-a-small-cat-step-6.jpg\"\n",
+    "    ).content\n",
+    ").decode(\"utf-8\")\n",
+    "\n",
+    "llm.complete(\n",
+    "    prompt=\"List models in image\",\n",
+    "    image_documents=[ImageDocument(image=imgr_content, mimetype=\"jpeg\")],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Passing an image as an NVCF asset\n",
+    "If your image is sufficiently large or you will pass it multiple times in a chat conversation, you may upload it once and reference it in your chat conversation\n",
+    "\n",
+    "See https://docs.nvidia.com/cloud-functions/user-guide/latest/cloud-function/assets.html for details about how upload the image."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "content_type = \"image/jpg\"\n",
+    "description = \"example-image-from-lc-nv-ai-e-notebook\"\n",
+    "\n",
+    "create_response = requests.post(\n",
+    "    \"https://api.nvcf.nvidia.com/v2/nvcf/assets\",\n",
+    "    headers={\n",
+    "        \"Authorization\": f\"Bearer {os.environ['NVIDIA_API_KEY']}\",\n",
+    "        \"accept\": \"application/json\",\n",
+    "        \"Content-Type\": \"application/json\",\n",
+    "    },\n",
+    "    json={\"contentType\": content_type, \"description\": description},\n",
+    ")\n",
+    "create_response.raise_for_status()\n",
+    "\n",
+    "upload_response = requests.put(\n",
+    "    create_response.json()[\"uploadUrl\"],\n",
+    "    headers={\n",
+    "        \"Content-Type\": content_type,\n",
+    "        \"x-amz-meta-nvcf-asset-description\": description,\n",
+    "    },\n",
+    "    data=img_response.content,\n",
+    ")\n",
+    "upload_response.raise_for_status()\n",
+    "\n",
+    "asset_id = create_response.json()[\"assetId\"]\n",
+    "asset_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = llm.stream_complete(\n",
+    "    prompt=f\"Describe the image\",\n",
+    "    image_documents=[\n",
+    "        ImageDocument(metadata={\"asset_id\": asset_id}, mimetype=\"png\")\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for r in response:\n",
+    "    print(r.text, end=\"\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Passing images from local files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SimpleDirectoryReader\n",
+    "\n",
+    "# put your local directore here\n",
+    "image_documents = SimpleDirectoryReader(\"./tests/data/\").load_data()\n",
+    "\n",
+    "llm.complete(\n",
+    "    prompt=\"Describe the images as an alternative text\",\n",
+    "    image_documents=image_documents,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Chat with of images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.llms import ChatMessage\n",
+    "\n",
+    "llm.chat(\n",
+    "    [\n",
+    "        ChatMessage(\n",
+    "            role=\"user\",\n",
+    "            content=[\n",
+    "                {\"type\": \"text\", \"text\": \"Describe this image:\"},\n",
+    "                {\"type\": \"image_url\", \"image_url\": image_urls[1]},\n",
+    "            ],\n",
+    "        )\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.llms import ChatMessage\n",
+    "\n",
+    "await llm.achat(\n",
+    "    [\n",
+    "        ChatMessage(\n",
+    "            role=\"user\",\n",
+    "            content=[\n",
+    "                {\"type\": \"text\", \"text\": \"Describe this image:\"},\n",
+    "                {\"type\": \"image_url\", \"image_url\": image_urls[1]},\n",
+    "            ],\n",
+    "        )\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.chat(\n",
+    "    [\n",
+    "        ChatMessage(\n",
+    "            role=\"user\",\n",
+    "            content=[\n",
+    "                {\"type\": \"text\", \"text\": \"Describe the image\"},\n",
+    "                {\n",
+    "                    \"type\": \"image_url\",\n",
+    "                    \"image_url\": f'<img src=\"data:{content_type};asset_id,{asset_id}\" />',\n",
+    "                },\n",
+    "            ],\n",
+    "        )\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "await llm.achat(\n",
+    "    [\n",
+    "        ChatMessage(\n",
+    "            role=\"user\",\n",
+    "            content=[\n",
+    "                {\"type\": \"text\", \"text\": \"Describe the image\"},\n",
+    "                {\n",
+    "                    \"type\": \"image_url\",\n",
+    "                    \"image_url\": f'<img src=\"data:{content_type};asset_id,{asset_id}\" />',\n",
+    "                },\n",
+    "            ],\n",
+    "        )\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Stream Chat a prompt with images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.llms import ChatMessage\n",
+    "\n",
+    "streaming_resp = llm.stream_chat(\n",
+    "    [\n",
+    "        ChatMessage(\n",
+    "            role=\"user\",\n",
+    "            content=[\n",
+    "                {\"type\": \"text\", \"text\": \"Describe this image:\"},\n",
+    "                {\"type\": \"image_url\", \"image_url\": image_urls[1]},\n",
+    "            ],\n",
+    "        )\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for r in streaming_resp:\n",
+    "    print(r.delta, end=\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.llms import ChatMessage\n",
+    "\n",
+    "resp = await llm.astream_chat(\n",
+    "    [\n",
+    "        ChatMessage(\n",
+    "            role=\"user\",\n",
+    "            content=[\n",
+    "                {\"type\": \"text\", \"text\": \"Describe this image:\"},\n",
+    "                {\"type\": \"image_url\", \"image_url\": image_urls[0]},\n",
+    "            ],\n",
+    "        )\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "last_element = None\n",
+    "async for last_element in resp:\n",
+    "    pass\n",
+    "\n",
+    "print(last_element)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = llm.stream_chat(\n",
+    "    [\n",
+    "        ChatMessage(\n",
+    "            role=\"user\",\n",
+    "            content=f\"\"\"<img src=\"data:image/jpg;\n",
+    "            ,{asset_id}\"/>\"\"\",\n",
+    "        )\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for r in response:\n",
+    "    print(r.delta, end=\"\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/.gitignore b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..990c18de229088f55c6c514fd0f2d49981d1b0e7
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/.gitignore
@@ -0,0 +1,153 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0896ca890d8bffd60a44fa824f8d57fecd73ee53
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/BUILD
@@ -0,0 +1,3 @@
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/Makefile b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..b9eab05aa370629a4a3de75df3ff64cd53887b68
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/README.md b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2ce08fc489f52fa26388f108917aca86ee4e0b79
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/README.md
@@ -0,0 +1,126 @@
+# LlamaIndex Multi_Modal Integration: Nvidia
+
+This project integrates NVIDIA's vision-language models (VLMs) into the LlamaIndex framework, enabling advanced multimodal capabilities for various AI applications.
+
+## Features
+
+- Seamless integration of NVIDIA VLMs with LlamaIndex
+- Support for multiple state-of-the-art vision-language models:
+  - [adept/fuyu-8b](https://build.nvidia.com/adept/fuyu-8b)
+  - [google/deplot](https://build.nvidia.com/google/google-deplot)
+  - [nvidia/neva-22b](https://build.nvidia.com/nvidia/neva-22b)
+  - [google/paligemma](https://build.nvidia.com/google/google-paligemma)
+  - [microsoft/phi-3-vision-128k-instruct](https://build.nvidia.com/microsoft/phi-3-vision-128k-instruct)
+  - [microsoft/phi-3.5-vision-instruct](https://build.nvidia.com/microsoft/phi-3_5-vision-instruct)
+  - [nvidia/vila](https://build.nvidia.com/nvidia)
+  - [meta/llama-3.2-11b-vision-instruct](https://build.nvidia.com/meta/llama-3.2-11b-vision-instruct)
+  - [meta/llama-3.2-90b-vision-instruct](https://build.nvidia.com/meta/llama-3.2-90b-vision-instruct)
+- Easy-to-use interface for multimodal tasks like image captioning and visual question answering
+- Configurable model parameters for fine-tuned performance
+
+---
+
+## Installation
+
+```bash
+pip install llama-index-multi-modal-llms-nvidia
+```
+
+Make sure to set your NVIDIA API key as an environment variable:
+
+```bash
+export NVIDIA_API_KEY=your_api_key_here
+```
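+
+If you prefer not to rely on the environment variable, the API key can also be passed directly to the constructor (a minimal sketch; replace the placeholder with your real `nvapi-` key):
+
+```python
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+
+# Pass the key explicitly instead of reading NVIDIA_API_KEY from the environment
+llm = NVIDIAMultiModal(api_key="nvapi-...")
+```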
+
+## Usage
+
+Here's a basic example of how to use the NVIDIA VLM integration:
+
+```python
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+from llama_index.core.schema import ImageDocument
+
+# Initialize the model
+model = NVIDIAMultiModal()
+
+# Prepare your image and prompt
+image_document = ImageDocument(image_path="path/to/your/image.jpg")
+prompt = "Describe this image in detail."
+
+# Generate a response
+response = model.complete(prompt, image_documents=[image_document])
+
+print(response.text)
+```
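+
+The constructor also exposes the model name and sampling parameters, so a configured instance might look like the following sketch (the values shown are only illustrative):
+
+```python
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+
+model = NVIDIAMultiModal(
+    model="nvidia/neva-22b",  # any model from the list above
+    temperature=0.4,
+    max_tokens=512,
+)
+```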
+
+### Streaming
+
+```python
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+from llama_index.core.schema import ImageDocument
+
+# Initialize the model
+model = NVIDIAMultiModal()
+
+# Prepare your image and prompt
+image_document = ImageDocument(image_path="downloaded_image.jpg")
+prompt = "Describe this image in detail."
+
+response = model.stream_complete(
+    prompt=prompt,
+    image_documents=[image_document],
+)
+
+for r in response:
+    print(r.text, end="")
+```
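+
+Chat-style calls are supported as well. The sketch below mirrors the accompanying notebook and passes an image URL alongside a text prompt (the URL is a placeholder):
+
+```python
+from llama_index.core.llms import ChatMessage
+
+chat_response = model.chat(
+    [
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Describe this image:"},
+                {"type": "image_url", "image_url": "https://example.com/image.jpg"},
+            ],
+        )
+    ]
+)
+print(chat_response.message.content)
+```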
+
+## Passing an image as an NVCF asset
+
+If your image is sufficiently large or you will pass it multiple times in a chat conversation, you can upload it once and reference it by asset ID in your chat conversation.
+
+See https://docs.nvidia.com/cloud-functions/user-guide/latest/cloud-function/assets.html for details about how to upload the image.
+
+```python
+import os
+import requests
+
+content_type = "image/jpg"
+description = "example-image-from-lc-nv-ai-e-notebook"
+
+create_response = requests.post(
+    "https://api.nvcf.nvidia.com/v2/nvcf/assets",
+    headers={
+        "Authorization": f"Bearer {os.environ['NVIDIA_API_KEY']}",
+        "accept": "application/json",
+        "Content-Type": "application/json",
+    },
+    json={"contentType": content_type, "description": description},
+)
+create_response.raise_for_status()
+
+upload_response = requests.put(
+    create_response.json()["uploadUrl"],
+    headers={
+        "Content-Type": content_type,
+        "x-amz-meta-nvcf-asset-description": description,
+    },
+    data=open("downloaded_image.jpg", "rb").read(),  # image bytes to upload
+)
+upload_response.raise_for_status()
+
+asset_id = create_response.json()["assetId"]
+
+response = model.complete(
+    prompt="Describe the image",
+    image_documents=[
+        ImageDocument(metadata={"asset_id": asset_id}, mimetype="png")
+    ],
+)
+```
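+
+Once uploaded, the asset can also be referenced inside a chat message using the `<img>` data tag shown in the accompanying notebook (a sketch; `content_type` and `asset_id` come from the upload step above):
+
+```python
+from llama_index.core.llms import ChatMessage
+
+chat_response = model.chat(
+    [
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Describe the image"},
+                {
+                    "type": "image_url",
+                    "image_url": f'<img src="data:{content_type};asset_id,{asset_id}" />',
+                },
+            ],
+        )
+    ]
+)
+```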
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/__init__.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9f41abaeed484d44ea77cafd4c07e4ea213cc6d
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/__init__.py
@@ -0,0 +1,4 @@
+from llama_index.multi_modal_llms.nvidia.base import NVIDIAMultiModal
+
+
+__all__ = ["NVIDIAMultiModal"]
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..02cea8ed17ed7c6378d1d04dc08e53e04932f23a
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/base.py
@@ -0,0 +1,536 @@
+from typing import Any, Dict, List, Optional, Sequence
+import requests
+
+from llama_index.core.base.llms.types import (
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    MessageRole,
+    ChatMessage,
+    ChatResponse,
+    ChatResponseGen,
+    ChatResponseAsyncGen,
+)
+from llama_index.core.bridge.pydantic import Field
+from llama_index.core.callbacks import CallbackManager
+from llama_index.core.constants import (
+    DEFAULT_NUM_OUTPUTS,
+    DEFAULT_TEMPERATURE,
+)
+from llama_index.core.multi_modal_llms import (
+    MultiModalLLM,
+    MultiModalLLMMetadata,
+)
+from llama_index.core.schema import ImageNode
+from llama_index.core.base.llms.generic_utils import (
+    get_from_param_or_env,
+)
+
+from llama_index.multi_modal_llms.nvidia.utils import (
+    BASE_URL,
+    KNOWN_URLS,
+    NVIDIA_MULTI_MODAL_MODELS,
+    generate_nvidia_multi_modal_chat_message,
+    aggregate_msgs,
+    process_response,
+)
+import aiohttp
+import json
+
+
+class NVIDIAClient:
+    def __init__(
+        self,
+        api_key: str,
+        timeout: Optional[float] = None,
+    ):
+        self.api_key = api_key
+        self.timeout = timeout
+
+    def _get_headers(self, stream: bool) -> Dict[str, str]:
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "content-type": "application/json",
+            "User-Agent": "langchain-nvidia-ai-endpoints",
+        }
+        headers["accept"] = "text/event-stream" if stream else "application/json"
+        return headers
+
+    def get_model_details(self) -> List[str]:
+        """
+        Get model details.
+
+        Returns:
+            List of models
+        """
+        return list(NVIDIA_MULTI_MODAL_MODELS.keys())
+
+    def request(
+        self,
+        endpoint: str,
+        stream: bool,
+        messages: Dict[str, Any],
+        extra_headers: Dict[str, Any],
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """
+        Perform a synchronous request to the NVIDIA API.
+
+        Args:
+            endpoint (str): The API endpoint to send the request to.
+            messages (Dict[str, Any]): The request payload.
+
+        Returns:
+            Dict[str, Any]: The API response.
+        """
+
+        def perform_request():
+            payload = {"messages": messages, "stream": stream, **kwargs}
+            headers = {
+                **self._get_headers(stream=stream),
+                **extra_headers,
+            }
+            response = requests.post(
+                endpoint, json=payload, headers=headers, stream=stream
+            )
+            response.raise_for_status()
+            return response
+
+        return perform_request()
+
+    async def request_async(
+        self,
+        endpoint: str,
+        stream: bool,
+        messages: Dict[str, Any],
+        extra_headers: Dict[str, Any],
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """
+        Perform an asynchronous request to the NVIDIA API.
+
+        Args:
+            endpoint (str): The API endpoint to send the request to.
+            messages (Dict[str, Any]): The request payload.
+
+        Returns:
+            Dict[str, Any]: The API response.
+        """
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                endpoint,
+                json={"messages": messages, "stream": stream, **kwargs},
+                headers={**self._get_headers(stream=stream), **extra_headers},
+            ) as response:
+                response.raise_for_status()
+                return await response.json()
+
+
+class NVIDIAMultiModal(MultiModalLLM):
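+    """
+    Multi-modal LLM interface for NVIDIA hosted vision-language models.
+
+    A minimal usage sketch (assumes NVIDIA_API_KEY is set in the environment):
+
+        from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+        from llama_index.core.schema import ImageDocument
+
+        llm = NVIDIAMultiModal()
+        llm.complete("Describe this image", image_documents=[ImageDocument(...)])
+    """
+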
+    model: str = Field(description="The Multi-Modal model to use from NVIDIA.")
+    temperature: float = Field(description="The temperature to use for sampling.")
+    max_tokens: Optional[int] = Field(
+        description=" The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.",
+        gt=0,
+    )
+    timeout: float = Field(
+        default=60.0,
+        description="The timeout, in seconds, for API requests.",
+        ge=0,
+    )
+    api_key: str = Field(default=None, description="The NVIDIA API key.", exclude=True)
+    base_url: str = Field(default=BASE_URL, description="The base URL for NVIDIA API.")
+    additional_kwargs: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional kwargs for the NVIDIA API."
+    )
+
+    def __init__(
+        self,
+        model: str = "microsoft/phi-3-vision-128k-instruct",
+        temperature: float = DEFAULT_TEMPERATURE,
+        max_tokens: Optional[int] = 300,
+        nvidia_api_key: Optional[str] = None,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = BASE_URL,
+        callback_manager: Optional[CallbackManager] = None,
+        **kwargs: Any,
+    ) -> None:
+        api_key = get_from_param_or_env(
+            "api_key",
+            nvidia_api_key or api_key,
+            "NVIDIA_API_KEY",
+            "NO_API_KEY_PROVIDED",
+        )
+
+        is_hosted = base_url in KNOWN_URLS
+
+        if is_hosted and api_key == "NO_API_KEY_PROVIDED":
+            raise ValueError(
+                "An API key is required for the hosted NIM. This will become an error in 0.2.0."
+            )
+
+        super().__init__(
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            api_key=api_key,
+            base_url=base_url,
+            callback_manager=callback_manager,
+            **kwargs,
+        )
+
+    @property
+    def _client(self) -> NVIDIAClient:
+        return NVIDIAClient(**self._get_credential_kwargs())
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "nvidia_multi_modal_llm"
+
+    @property
+    def metadata(self) -> MultiModalLLMMetadata:
+        """Multi Modal LLM metadata."""
+        return MultiModalLLMMetadata(
+            num_output=self.max_tokens or DEFAULT_NUM_OUTPUTS,
+            model_name=self.model,
+        )
+
+    @property
+    def available_models(self):
+        return self._client.get_model_details()
+
+    def _get_credential_kwargs(self) -> Dict[str, Any]:
+        return {"api_key": self.api_key}
+
+    # Model Params for NVIDIA Multi Modal model.
+    def _get_model_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
+        if self.model not in NVIDIA_MULTI_MODAL_MODELS:
+            raise ValueError(
+                f"Invalid model {self.model}. "
+                f"Available models are: {list(NVIDIA_MULTI_MODAL_MODELS.keys())}"
+            )
+        base_kwargs = {"model": self.model, "temperature": self.temperature, **kwargs}
+        if self.max_tokens is not None:
+            base_kwargs["max_tokens"] = self.max_tokens
+        return {**base_kwargs, **self.additional_kwargs}
+
+    def _get_response_token_counts(self, raw_response: Any) -> dict:
+        """Get the token usage reported by the response."""
+        if not isinstance(raw_response, dict):
+            return {}
+
+        usage = raw_response.get("usage", {})
+        # NOTE: other model providers that use the NVIDIA client may not report usage
+        if usage is None:
+            return {}
+
+        return {
+            "prompt_tokens": usage.get("prompt_tokens", 0),
+            "completion_tokens": usage.get("completion_tokens", 0),
+            "total_tokens": usage.get("total_tokens", 0),
+        }
+
+    def _complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponse:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            prompt=prompt, image_documents=image_documents, model=self.model
+        )
+        message_dict = [{"role": MessageRole.USER, "content": content}]
+
+        response = self._client.request(
+            endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+            stream=False,
+            messages=message_dict,
+            extra_headers=extra_headers,
+            **all_kwargs,
+        )
+        response = response.json()
+        text = response["choices"][0]["message"]["content"]
+        return CompletionResponse(
+            text=text,
+            raw=response,
+            additional_kwargs=self._get_response_token_counts(response),
+        )
+
+    def _stream_complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponseGen:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            prompt=prompt, image_documents=image_documents, model=self.model
+        )
+        message_dict = [{"role": MessageRole.USER, "content": content}]
+
+        def gen() -> CompletionResponseGen:
+            response = self._client.request(
+                messages=message_dict,
+                stream=True,
+                endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+                extra_headers=extra_headers,
+                **all_kwargs,
+            )
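+            # The endpoint streams Server-Sent Events: each payload line is
+            # prefixed with "data:" and the stream ends with "data: [DONE]".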
+            for line in response.iter_lines():
+                if line and line.strip() != b"data: [DONE]":
+                    line = line.decode("utf-8")
+                    line = line[5:]
+
+                    msg, final_line = aggregate_msgs(process_response(line))
+
+                    yield CompletionResponse(
+                        **msg,
+                        additional_kwargs=self._get_response_token_counts(line),
+                    )
+
+                    if final_line:
+                        break
+
+        return gen()
+
+    def complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponse:
+        return self._complete(prompt, image_documents, **kwargs)
+
+    def stream_complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponseGen:
+        return self._stream_complete(prompt, image_documents, **kwargs)
+
+    def _chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            inputs=messages, model=self.model
+        )
+
+        response = self._client.request(
+            endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+            stream=False,
+            messages=content,
+            extra_headers=extra_headers,
+            **all_kwargs,
+        )
+        response = response.json()
+        text = response["choices"][0]["message"]["content"]
+
+        return ChatResponse(
+            delta=text,
+            message=ChatMessage(
+                role=response["choices"][0]["message"]["role"], content=text
+            ),
+            raw=response,
+            additional_kwargs=self._get_response_token_counts(response),
+        )
+
+    def chat(
+        self,
+        messages: Sequence[ChatMessage],
+        **kwargs: Any,
+    ) -> ChatResponse:
+        return self._chat(messages, **kwargs)
+
+    def stream_chat(
+        self,
+        messages: Sequence[ChatMessage],
+        **kwargs: Any,
+    ) -> ChatResponseGen:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            inputs=messages, model=self.model
+        )
+
+        def gen() -> ChatResponseGen:
+            response = self._client.request(
+                messages=content,
+                stream=True,
+                endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+                extra_headers=extra_headers,
+                **all_kwargs,
+            )
+            for line in response.iter_lines():
+                if line and line.strip() != b"data: [DONE]":
+                    line = line.decode("utf-8")
+                    line = line[5:]
+
+                    msg, final_line = aggregate_msgs(process_response(line))
+
+                    role = msg.get("role", MessageRole.ASSISTANT)
+                    additional_kwargs = {}
+
+                    yield ChatResponse(
+                        message=ChatMessage(
+                            role=role,
+                            content=msg.get("content"),
+                            additional_kwargs=additional_kwargs,
+                        ),
+                        delta=msg.get("content"),
+                        raw=response,
+                        additional_kwargs=self._get_response_token_counts(line),
+                    )
+
+                    if final_line:
+                        break
+
+        return gen()
+
+    # ===== Async Endpoints =====
+
+    async def _acomplete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponse:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            prompt=prompt, image_documents=image_documents, model=self.model
+        )
+        message_dict = [{"role": MessageRole.USER, "content": content}]
+
+        response_json = await self._client.request_async(
+            endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+            stream=False,
+            messages=message_dict,
+            extra_headers=extra_headers,
+            **all_kwargs,
+        )
+        text = response_json["choices"][0]["message"]["content"]
+        return CompletionResponse(
+            text=text,
+            raw=response_json,
+            additional_kwargs=self._get_response_token_counts(response_json),
+        )
+
+    async def acomplete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponse:
+        return await self._acomplete(prompt, image_documents, **kwargs)
+
+    async def astream_complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponseAsyncGen:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            prompt=prompt, image_documents=image_documents, model=self.model
+        )
+        payload = {
+            "messages": [{"role": MessageRole.USER, "content": content}],
+            "stream": True,
+            **all_kwargs,
+        }
+        headers = {
+            **self._client._get_headers(stream=True),
+            **extra_headers,
+        }
+
+        async def gen() -> CompletionResponseAsyncGen:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+                    json=payload,
+                    headers=headers,
+                ) as response:
+                    response.raise_for_status()
+                    text = ""
+                    async for line in response.content:
+                        if line and line.strip() != b"data: [DONE]":
+                            line = line.decode("utf-8").strip()
+                            if line.startswith("data:"):
+                                data = json.loads(line[5:])
+
+                                delta = data["choices"][0]["delta"]["content"]
+                                text += delta
+
+                                yield CompletionResponse(
+                                    text=text,
+                                    raw=data,
+                                    delta=text,
+                                    additional_kwargs=self._get_response_token_counts(
+                                        line
+                                    ),
+                                )
+
+        return gen()
+
+    async def _achat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponse:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            inputs=messages, model=self.model
+        )
+
+        response_json = await self._client.request_async(
+            endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+            stream=False,
+            messages=content,
+            extra_headers=extra_headers,
+            **all_kwargs,
+        )
+
+        text = response_json["choices"][0]["message"]["content"]
+
+        return ChatResponse(
+            delta=text,
+            message=ChatMessage(
+                role=response_json["choices"][0]["message"]["role"], content=text
+            ),
+            raw=response_json,
+            additional_kwargs=self._get_response_token_counts(response_json),
+        )
+
+    async def achat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponse:
+        return await self._achat(messages, **kwargs)
+
+    async def astream_chat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponseAsyncGen:
+        all_kwargs = self._get_model_kwargs(**kwargs)
+        content, extra_headers = generate_nvidia_multi_modal_chat_message(
+            inputs=messages, model=self.model
+        )
+        payload = {"messages": content, "stream": True, **all_kwargs}
+        headers = {
+            **self._client._get_headers(stream=True),
+            **extra_headers,
+        }
+
+        async def gen() -> ChatResponseAsyncGen:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"],
+                    json=payload,
+                    headers=headers,
+                ) as response:
+                    response.raise_for_status()
+
+                    text = ""
+
+                    async for line in response.content:
+                        if line and line.strip() != b"data: [DONE]":
+                            line_text = line.decode("utf-8").strip()
+
+                            if line_text.startswith("data:"):
+                                data = json.loads(line_text[5:])
+                                delta = data["choices"][0]["delta"]["content"]
+                                role = data["choices"][0]["delta"].get(
+                                    "role", MessageRole.ASSISTANT
+                                )
+                                text += delta
+
+                                yield ChatResponse(
+                                    message=ChatMessage(
+                                        role=role,
+                                        content=delta,
+                                        additional_kwargs={},
+                                    ),
+                                    delta=delta,
+                                    raw=data,
+                                    additional_kwargs=self._get_response_token_counts(
+                                        data
+                                    ),
+                                )
+
+        return gen()
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/utils.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa4d00bcd2a658a5072b5caae290ee4b135c9f36
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/utils.py
@@ -0,0 +1,341 @@
+import base64
+import filetype
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+from llama_index.core.schema import ImageDocument
+import json
+import os
+import re
+import urllib.parse
+from llama_index.core.base.llms.types import ChatMessage
+
+DEFAULT_MODEL = "microsoft/phi-3-vision-128k-instruct"
+BASE_URL = "https://ai.api.nvidia.com/v1/"
+
+KNOWN_URLS = [
+    BASE_URL,
+    "https://integrate.api.nvidia.com/v1",
+]
+
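+# "type" records how an endpoint expects image parts to be encoded (see
+# _nv_vlm_adjust_input): "nv-vlm" endpoints take the image_url value as a plain
+# string, while "vlm" endpoints use the OpenAI-style {"url": ...} structure.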
+NVIDIA_MULTI_MODAL_MODELS = {
+    "adept/fuyu-8b": {"endpoint": f"{BASE_URL}vlm/adept/fuyu-8b", "type": "nv-vlm"},
+    "google/deplot": {"endpoint": f"{BASE_URL}vlm/google/deplot", "type": "nv-vlm"},
+    "microsoft/kosmos-2": {
+        "endpoint": f"{BASE_URL}vlm/microsoft/kosmos-2",
+        "type": "nv-vlm",
+    },
+    "nvidia/neva-22b": {"endpoint": f"{BASE_URL}vlm/nvidia/neva-22b", "type": "nv-vlm"},
+    "google/paligemma": {
+        "endpoint": f"{BASE_URL}vlm/google/paligemma",
+        "type": "nv-vlm",
+    },
+    "microsoft/phi-3-vision-128k-instruct": {
+        "endpoint": f"{BASE_URL}vlm/microsoft/phi-3-vision-128k-instruct",
+        "type": "vlm",
+    },
+    "microsoft/phi-3.5-vision-instruct": {
+        "endpoint": f"{BASE_URL}microsoft/microsoft/phi-3_5-vision-instruct",
+        "type": "nv-vlm",
+    },
+    "nvidia/vila": {"endpoint": f"{BASE_URL}vlm/nvidia/vila", "type": "nv-vlm"},
+    "meta/llama-3.2-11b-vision-instruct": {
+        "endpoint": f"{BASE_URL}gr/meta/llama-3.2-11b-vision-instruct/chat/completions",
+        "type": "vlm",
+    },
+    "meta/llama-3.2-90b-vision-instruct": {
+        "endpoint": f"{BASE_URL}/gr/meta/llama-3.2-90b-vision-instruct/chat/completions",
+        "type": "vlm",
+    },
+}
+
+
+def infer_image_mimetype_from_base64(base64_string) -> str:
+    # Decode the base64 string
+    decoded_data = base64.b64decode(base64_string)
+
+    # Use filetype to guess the MIME type
+    kind = filetype.guess(decoded_data)
+
+    # Return the MIME type if detected, otherwise return None
+    return kind.mime if kind is not None else None
+
+
+def infer_image_mimetype_from_file_path(image_file_path: str) -> str:
+    # Get the file extension
+    file_extension = image_file_path.split(".")[-1].lower()
+
+    # Recognized extensions are returned as-is; anything else falls back to png
+    if file_extension in ["jpg", "jpeg", "png", "webp", "gif"]:
+        return file_extension
+    return "png"
+
+
+# Function to encode the image to base64 content
+def encode_image(image_path: str) -> str:
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+
+def create_image_content(
+    image_document,
+) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
+    """
+    Create the image content based on the provided image document.
+
+    Returns a tuple of (content part, asset id), where the asset id is empty
+    unless the document references an NVCF asset.
+    """
+    if image_document.image:
+        mimetype = (
+            image_document.mimetype
+            if image_document.mimetype
+            else infer_image_mimetype_from_base64(image_document.image)
+        )
+        return {
+            "type": "text",
+            "text": f'<img src="data:image/{mimetype};base64,{image_document.image}" />',
+        }, ""
+
+    elif "asset_id" in image_document.metadata:
+        asset_id = image_document.metadata["asset_id"]
+        mimetype = image_document.mimetype if image_document.mimetype else "jpeg"
+        return {
+            "type": "text",
+            "text": f'<img src="data:image/{mimetype};asset_id,{asset_id}" />',
+        }, asset_id
+
+    elif image_document.image_url and image_document.image_url != "":
+        mimetype = infer_image_mimetype_from_file_path(image_document.image_url)
+        return {
+            "type": "image_url",
+            "image_url": image_document.image_url,
+        }, ""
+    elif (
+        "file_path" in image_document.metadata
+        and image_document.metadata["file_path"] != ""
+    ):
+        mimetype = infer_image_mimetype_from_file_path(
+            image_document.metadata["file_path"]
+        )
+        base64_image = encode_image(image_document.metadata["file_path"])
+        return {
+            "type": "text",
+            "text": f'<img src="data:image/{mimetype};base64,{base64_image}" />',
+        }, ""
+
+    return None, None
+
+
+def generate_nvidia_multi_modal_chat_message(
+    model: str,
+    prompt: Optional[str] = None,
+    inputs: Optional[List[ChatMessage]] = [],
+    image_documents: Optional[Sequence[ImageDocument]] = [],
+) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
+    # Build the message content and collect NVCF asset ids for the extra headers
+    completion_content = []
+    asset_ids = []
+    extra_headers = {}
+    model_type = NVIDIA_MULTI_MODAL_MODELS[model]["type"]
+
+    for input in inputs:
+        if input.content:
+            asset_ids.extend(_nv_vlm_get_asset_ids(input.content))
+
+    # Process each image document
+    for image_document in image_documents:
+        image_content, asset_id = create_image_content(image_document)
+        if image_content:
+            completion_content.append(image_content)
+        if asset_id:
+            asset_ids.append(asset_id)
+
+    if len(asset_ids) > 0:
+        extra_headers["NVCF-INPUT-ASSET-REFERENCES"] = ",".join(asset_ids)
+
+    # Append the text prompt to the completion content
+    if prompt:
+        completion_content.append({"type": "text", "text": prompt})
+        return completion_content, extra_headers
+
+    inputs = [
+        {
+            "role": message.role,
+            "content": _nv_vlm_adjust_input(message, model_type).content,
+        }
+        for message in inputs
+    ]
+    return inputs, extra_headers
+
+
+def process_response(response) -> List[dict]:
+    """General-purpose response processing for single responses and streams."""
+    if hasattr(response, "json"):  ## For single response (i.e. non-streaming)
+        try:
+            return [response.json()]
+        except json.JSONDecodeError:
+            response = str(response.__dict__)
+    if isinstance(response, str):  ## For set of responses (i.e. streaming)
+        msg_list = []
+        for msg in response.split("\n\n"):
+            if "{" not in msg:
+                continue
+            msg_list += [json.loads(msg[msg.find("{") :])]
+        return msg_list
+    raise ValueError(f"Received ill-formed response: {response}")
+
+
+def aggregate_msgs(msg_list: Sequence[dict]) -> Tuple[dict, bool]:
+    """Dig out relevant details of aggregated message."""
+    content_buffer: Dict[str, Any] = {}
+    content_holder: Dict[Any, Any] = {}
+    usage_holder: Dict[Any, Any] = {}  ####
+    finish_reason_holder: Optional[str] = None
+    is_stopped = False
+    for msg in msg_list:
+        usage_holder = msg.get("usage", {})  ####
+        if "choices" in msg:
+            ## Tease out ['choices'][0]...['delta'/'message']
+            # when streaming w/ usage info, we may get a response
+            #  w/ choices: [] that includes final usage info
+            choices = msg.get("choices", [{}])
+            msg = choices[0] if choices else {}
+            # TODO: this needs to be fixed, the fact we only
+            #       use the first choice breaks the interface
+            finish_reason_holder = msg.get("finish_reason", None)
+            is_stopped = finish_reason_holder == "stop"
+            msg = msg.get("delta", msg.get("message", msg.get("text", "")))
+            if not isinstance(msg, dict):
+                msg = {"content": msg}
+        elif "data" in msg:
+            ## Tease out ['data'][0]...['embedding']
+            msg = msg.get("data", [{}])[0]
+        content_holder = msg
+        for k, v in msg.items():
+            if k in ("content",) and k in content_buffer:
+                content_buffer[k] += v
+            else:
+                content_buffer[k] = v
+        if is_stopped:
+            break
+    content_holder = {
+        **content_holder,
+        **content_buffer,
+        "text": content_buffer["content"],
+    }
+    if usage_holder:
+        content_holder.update(token_usage=usage_holder)  ####
+    if finish_reason_holder:
+        content_holder.update(finish_reason=finish_reason_holder)
+    return content_holder, is_stopped
+
+
+def _nv_vlm_adjust_input(message: ChatMessage, model_type: str) -> ChatMessage:
+    """
+    This function converts the OpenAI VLM API input message to
+    NVIDIA VLM API input message, in place.
+
+    The NVIDIA VLM API input message.content:
+        {
+            "role": "user",
+            "content": [
+                ...,
+                {
+                    "type": "image_url",
+                    "image_url": "{data}"
+                },
+                ...
+            ]
+        }
+    where OpenAI VLM API input message.content:
+        {
+            "role": "user",
+            "content": [
+                ...,
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "{url | data}"
+                    }
+                },
+                ...
+            ]
+        }
+
+    In the process, it accepts a url or file and converts them to
+    data urls.
+    """
+    if content := message.content:
+        if isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict) and "image_url" in part:
+                    if (
+                        isinstance(part["image_url"], dict)
+                        and "url" in part["image_url"]
+                    ):
+                        url = _url_to_b64_string(part["image_url"]["url"])
+                        if model_type == "nv-vlm":
+                            part["image_url"] = url
+                        else:
+                            part["image_url"]["url"] = url
+    return message
+
+
+def _nv_vlm_get_asset_ids(
+    content: Union[str, List[Union[str, Dict[str, Any]]]],
+) -> List[str]:
+    """
+    Extracts asset IDs from the message content.
+
+    VLM APIs accept asset IDs as input in two forms:
+     - content = [{"image_url": {"url": "data:image/{type};asset_id,{asset_id}"}}*]
+     - content = .*<img src="data:image/{type};asset_id,{asset_id}"/>.*
+    """
+
+    def extract_asset_id(data: str) -> List[str]:
+        pattern = re.compile(r'data:image/[^;]+;asset_id,([^"\'\s]+)')
+        return pattern.findall(data)
+
+    asset_ids = []
+    if isinstance(content, str):
+        asset_ids.extend(extract_asset_id(content))
+    elif isinstance(content, list):
+        for part in content:
+            if isinstance(part, str):
+                asset_ids.extend(extract_asset_id(part))
+            elif isinstance(part, dict) and "image_url" in part:
+                image_url = part["image_url"]
+                if isinstance(image_url, str):
+                    asset_ids.extend(extract_asset_id(image_url))
+            elif isinstance(part, dict) and "text" in part:
+                image_url = part["text"]
+                if isinstance(image_url, str):
+                    asset_ids.extend(extract_asset_id(image_url))
+
+    return asset_ids
+
+
+def _is_url(s: str) -> bool:
+    try:
+        result = urllib.parse.urlparse(s)
+        return all([result.scheme, result.netloc])
+    except Exception:
+        # Anything that cannot be parsed is not a usable URL
+        return False
+
+
+def _url_to_b64_string(image_source: str) -> str:
+    try:
+        if _is_url(image_source):
+            return image_source
+        elif image_source.startswith("data:image"):
+            return image_source
+        elif os.path.exists(image_source):
+            encoded = encode_image(image_source)
+            image_type = infer_image_mimetype_from_base64(encoded) or "image/png"
+            return f"data:{image_type};base64,{encoded}"
+        else:
+            raise ValueError(
+                "The provided string is not a valid URL, base64, or file path."
+            )
+    except Exception as e:
+        raise ValueError(f"Unable to process the provided image source: {e}")
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..9555020026cb10a9348f5fe873ce6676eaeed519
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/pyproject.toml
@@ -0,0 +1,57 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+# Feel free to un-skip examples, and experimental, you will just need to
+# work through many typos (--write-changes and --interactive will help)
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.llamahub]
+contains_example = false
+import_path = "llama_index.multi_modal_llms.nvidia"
+
+[tool.mypy]
+disallow_untyped_defs = true
+# Remove venv skip when integrated with pre-commit
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = "3.8"
+
+[tool.poetry]
+authors = ["Your Name <you@example.com>"]
+description = "llama-index multi_modal nvidia integration"
+license = "MIT"
+name = "llama-index-multi-modal-llms-nvidia"
+packages = [{include = "llama_index/"}]
+readme = "README.md"
+version = "0.1.0"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+llama-index-core = "^0.10.0"
+filetype = "^1.2.0"
+
+[tool.poetry.group.dev.dependencies]
+aiohttp = "^3.10.10"
+black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
+codespell = {extras = ["toml"], version = ">=v2.2.6"}
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pillow = "10.0.0"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "^8.2"
+pytest-asyncio = "^0.24.0"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"  # TODO: unpin when mypy>0.991
+types-setuptools = "67.1.0.0"
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..45d59ac8248a2970130a0a35eb97b7c847273629
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/BUILD
@@ -0,0 +1,5 @@
+python_tests()
+
+python_test_utils(
+    name="test_utils",
+)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/__init__.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/conftest.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..63a843400e84d2c4a59f6a9b26558be61785b000
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/conftest.py
@@ -0,0 +1,59 @@
+import pytest
+import os
+
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal as Interface
+from llama_index.multi_modal_llms.nvidia.utils import DEFAULT_MODEL
+
+from typing import Generator
+
+
+# this fixture is used to mask the NVIDIA_API_KEY environment variable and restore it
+# after the test. it also returns the value of the NVIDIA_API_KEY environment variable
+# before it was masked so that it can be used in the test.
+@pytest.fixture()
+def masked_env_var() -> Generator[str, None, None]:
+    var = "NVIDIA_API_KEY"
+    try:
+        if val := os.environ.get(var, None):
+            del os.environ[var]
+        yield val
+    finally:
+        if val:
+            os.environ[var] = val
+
+
+def pytest_collection_modifyitems(config, items):
+    if "NVIDIA_API_KEY" not in os.environ:
+        skip_marker = pytest.mark.skip(
+            reason="requires NVIDIA_API_KEY environment variable"
+        )
+        for item in items:
+            item.add_marker(skip_marker)
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    parser.addoption(
+        "--all-models",
+        action="store_true",
+        help="Run tests across all models",
+    )
+    parser.addoption(
+        "--model-id",
+        action="store",
+        help="Run tests for a specific chat model",
+    )
+    parser.addoption(
+        "--nim-endpoint",
+        type=str,
+        help="Run tests using NIM mode",
+    )
+
+
+def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
+    if "vlm_model" in metafunc.fixturenames:
+        models = [DEFAULT_MODEL]
+        if model := metafunc.config.getoption("--model-id"):
+            models = [model]
+        elif metafunc.config.getoption("--all-models"):
+            models = [model.id for model in Interface().available_models]
+        metafunc.parametrize("vlm_model", models, ids=models)
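+
+
+# Illustrative invocations (the model id is only an example drawn from the tests
+# in this package; the NIM endpoint URL is hypothetical):
+#   pytest tests/                               # default model only
+#   pytest tests/ --model-id google/deplot      # a single specific model
+#   pytest tests/ --all-models                  # every model reported by the API
+#   pytest tests/ --nim-endpoint http://localhost:8000/v1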
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_api_key.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_api_key.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e3480ac101b27d0b2bb2bf56e6ba4d9047c96c6
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_api_key.py
@@ -0,0 +1,52 @@
+import os
+
+import pytest
+
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+
+from typing import Any
+from llama_index.core.schema import ImageDocument
+
+
+def get_api_key(instance: Any) -> str:
+    return instance.api_key
+
+
+def test_create_default_url_without_api_key(masked_env_var: str) -> None:
+    with pytest.raises(ValueError) as err_msg:
+        NVIDIAMultiModal()
+    assert (
+        str(err_msg.value)
+        == "An API key is required for the hosted NIM. This will become an error in 0.2.0."
+    )
+
+
+@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"])
+def test_create_with_api_key(param: str, masked_env_var: str) -> None:
+    instance = NVIDIAMultiModal(**{param: "just testing no failure"})
+    assert get_api_key(instance) == "just testing no failure"
+
+
+def test_api_key_priority(masked_env_var: str) -> None:
+    try:
+        os.environ["NVIDIA_API_KEY"] = "ENV"
+        assert get_api_key(NVIDIAMultiModal()) == "ENV"
+        assert get_api_key(NVIDIAMultiModal(nvidia_api_key="PARAM")) == "PARAM"
+        assert get_api_key(NVIDIAMultiModal(api_key="PARAM")) == "PARAM"
+        assert (
+            get_api_key(NVIDIAMultiModal(api_key="LOW", nvidia_api_key="HIGH"))
+            == "HIGH"
+        )
+    finally:
+        # we must clean up environ or it may impact other tests
+        del os.environ["NVIDIA_API_KEY"]
+
+
+@pytest.mark.integration()
+def test_bogus_api_key_error(vlm_model: str, masked_env_var: str) -> None:
+    client = NVIDIAMultiModal(model=vlm_model, nvidia_api_key="BOGUS")
+    with pytest.raises(Exception) as exc_info:
+        client.complete(
+            prompt="xyz", image_documents=[ImageDocument(image_url="https://xyz.com")]
+        )
+    assert "401" in str(exc_info.value)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_available_models.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_available_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..829622e8724401a47135a0d12c3edb51e00c94d3
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_available_models.py
@@ -0,0 +1,11 @@
+import pytest
+
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+
+
+@pytest.mark.integration()
+def test_available_models() -> None:
+    models = NVIDIAMultiModal().available_models
+    assert models
+    assert isinstance(models, list)
+    assert all(isinstance(model.id, str) for model in models)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_multi_modal_nvidia.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_multi_modal_nvidia.py
new file mode 100644
index 0000000000000000000000000000000000000000..afe97b2d210c35abe9058ea7d825f4ab8692c950
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_multi_modal_nvidia.py
@@ -0,0 +1,436 @@
+from llama_index.core.multi_modal_llms.base import MultiModalLLM
+from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal
+from llama_index.multi_modal_llms.nvidia.utils import (
+    NVIDIA_MULTI_MODAL_MODELS,
+)
+import base64
+import os
+from typing import Any, Dict, List, Union
+
+import pytest
+import requests
+from llama_index.core.base.llms.types import (
+    CompletionResponse,
+    ChatMessage,
+    ChatResponse,
+)
+from llama_index.core.schema import ImageDocument
+import numpy as np
+from PIL import Image
+import tempfile
+
+# TODO: multiple texts
+# TODO: accuracy tests
+
+#
+# API Specification -
+#
+#  - User message may contain 1 or more image_url
+#  - url is either a url to an image or base64 encoded image
+#  - format for base64 is "data:image/{type};base64,{data}"
+#  - supported image types are png, jpeg (or jpg), webp, gif (non-animated)
+#
+
+#
+# note: differences between api catalog and openai api
+#  - openai api supports server-side image download; the api catalog does not do so consistently
+#   - NVIDIAMultiModal does client-side download to simulate the same behavior
+#  - NVIDIAMultiModal will automatically read local files and convert them to base64
+#  - openai api always uses {"image_url": {"url": "..."}}
+#     where api catalog sometimes uses {"image_url": "..."}
+#
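+# illustrative payloads for the two styles (hypothetical values):
+#  - openai style:      {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}
+#  - api catalog style: {"type": "image_url", "image_url": "data:image/png;base64,iVBORw0..."}
+#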
+
+image_urls = [
+    "https://res.cloudinary.com/hello-tickets/image/upload/c_limit,f_auto,q_auto,w_1920/v1640835927/o3pfl41q7m5bj8jardk0.jpg",
+    "https://www.visualcapitalist.com/wp-content/uploads/2023/10/US_Mortgage_Rate_Surge-Sept-11-1.jpg",
+    "https://www.sportsnet.ca/wp-content/uploads/2023/11/CP1688996471-1040x572.jpg",
+    # Add yours here!
+]
+
+MODELS = list(NVIDIA_MULTI_MODAL_MODELS.keys())
+
+
+def test_multi_modal_class():
+    names_of_base_classes = [b.__name__ for b in NVIDIAMultiModal.__mro__]
+    assert MultiModalLLM.__name__ in names_of_base_classes
+
+
+def test_init():
+    m = NVIDIAMultiModal(max_tokens=400)
+    assert m.max_tokens == 400
+
+
+def urlToBase64(url):
+    return base64.b64encode(requests.get(url).content).decode("utf-8")
+
+
+@pytest.fixture(scope="session")
+def temp_image_path(suffix: str):
+    # Create a white square image
+    white_square = np.ones((100, 100, 3), dtype=np.uint8) * 255
+    image = Image.fromarray(white_square)
+
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(suffix=f".{suffix}", delete=False) as temp_file:
+        image.save(temp_file, format=suffix.upper())
+        temp_path = temp_file.name
+
+    yield temp_path
+
+    # Clean up the temporary file after the test
+    os.unlink(temp_path)
+
+
+@pytest.fixture(scope="session")
+def get_asset_id():
+    content_type = "image/jpg"
+    description = "example-image-from-lc-nv-ai-e-notebook"
+
+    create_response = requests.post(
+        "https://api.nvcf.nvidia.com/v2/nvcf/assets",
+        headers={
+            "Authorization": f"Bearer {os.environ['NVIDIA_API_KEY']}",
+            "accept": "application/json",
+            "Content-Type": "application/json",
+        },
+        json={"contentType": content_type, "description": description},
+    )
+    create_response.raise_for_status()
+
+    upload_response = requests.put(
+        create_response.json()["uploadUrl"],
+        headers={
+            "Content-Type": content_type,
+            "x-amz-meta-nvcf-asset-description": description,
+        },
+        data=requests.get(image_urls[0]).content,
+    )
+    upload_response.raise_for_status()
+
+    return create_response.json()["assetId"]
+
+
+def test_class():
+    llm = NVIDIAMultiModal(api_key="BOGUS")
+    assert isinstance(llm, MultiModalLLM)
+
+
+@pytest.mark.parametrize(
+    "content",
+    [
+        [ImageDocument(image_url=image_urls[0])],
+        [ImageDocument(image=urlToBase64(image_urls[0]), mimetype="jpeg")],
+    ],
+)
+@pytest.mark.parametrize(
+    "func",
+    ["invoke", "stream"],
+)
+def test_vlm_input_style(
+    vlm_model: str,
+    content: List[ImageDocument],
+    func: str,
+) -> None:
+    llm = NVIDIAMultiModal(model=vlm_model)
+    assert vlm_model in MODELS
+    if func == "invoke":
+        response = llm.complete(prompt="Describe the Image.", image_documents=content)
+        assert isinstance(response, CompletionResponse)
+    if func == "stream":
+        for token in llm.stream_complete(
+            prompt="Describe the Image.", image_documents=content
+        ):
+            assert isinstance(token.text, str)
+
+
+@pytest.mark.parametrize(
+    "suffix",
+    ["jpeg", "png", "webp", "gif"],
+    scope="session",
+)
+def test_vlm_image_type(
+    suffix: str,
+    temp_image_path: str,
+    vlm_model: str,
+) -> None:
+    llm = NVIDIAMultiModal(model=vlm_model)
+    response = llm.complete(
+        "Describe image", image_documents=[ImageDocument(image_path=temp_image_path)]
+    )
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.skipif(
+    not os.path.isfile("data/nvidia-picasso-large.png"),
+    reason="requires the data/nvidia-picasso-large.png test asset",
+)
+def test_vlm_image_large(
+    vlm_model: str,
+) -> None:
+    chat = NVIDIAMultiModal(model=vlm_model)
+    response = chat.complete(
+        prompt="Describe image",
+        image_documents=[ImageDocument(image_path="data/nvidia-picasso-large.png")],
+    )
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.parametrize(
+    "suffix",
+    ["jpeg", "png", "webp", "gif"],
+    scope="session",
+)
+def test_vlm_two_images(
+    suffix: str,
+    temp_image_path: str,
+    vlm_model: str,
+) -> None:
+    chat = NVIDIAMultiModal(model=vlm_model)
+    response = chat.complete(
+        prompt="Describe image",
+        image_documents=[
+            ImageDocument(image_path=temp_image_path),
+            ImageDocument(image_path=temp_image_path),
+        ],
+    )
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.parametrize(
+    "content",
+    [
+        [ImageDocument(metadata={"asset_id": ""})],
+    ],
+)
+@pytest.mark.parametrize(
+    "func",
+    ["invoke", "stream"],
+)
+def test_vlm_asset_id(
+    vlm_model: str,
+    content: Union[str, List[Union[str, Dict[str, Any]]]],
+    func: str,
+    get_asset_id: str,
+) -> None:
+    assert isinstance(content[0], ImageDocument)
+    content[0].metadata["asset_id"] = get_asset_id
+
+    assert content[0].metadata["asset_id"] != ""
+
+    chat = NVIDIAMultiModal(model=vlm_model)
+    if func == "invoke":
+        response = chat.complete(prompt="Describe image", image_documents=content)
+        assert isinstance(response, CompletionResponse)
+        assert isinstance(response.text, str)
+    if func == "stream":
+        for token in chat.stream_complete(
+            prompt="Describe image", image_documents=content
+        ):
+            assert isinstance(token.text, str)
+
+
+## ------------------------- chat/stream_chat test cases ------------------------- ##
+
+
+@pytest.mark.parametrize(
+    "func",
+    ["chat", "stream_chat"],
+)
+def test_stream_chat_multiple_messages(vlm_model: str, func: str) -> None:
+    """Test streaming chat with multiple messages and images."""
+    llm = NVIDIAMultiModal(model=vlm_model)
+
+    messages = [
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Describe the first image:"},
+                {"type": "image_url", "image_url": image_urls[0]},
+            ],
+        ),
+        ChatMessage(
+            role="assistant", content="This is a response about the first image."
+        ),
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Now describe this second image:"},
+                {"type": "image_url", "image_url": image_urls[1]},
+            ],
+        ),
+    ]
+
+    if func == "chat":
+        response = llm.chat(messages)
+        assert isinstance(response, ChatResponse)
+        assert isinstance(response.delta, str)
+    if func == "stream_chat":
+        for token in llm.stream_chat(messages):
+            assert isinstance(token.delta, str)
+
+
+@pytest.mark.parametrize(
+    "content",
+    [
+        """<img src="data:image/jpg;asset_id,{asset_id}"/>""",
+        [
+            {
+                "type": "image_url",
+                "image_url": "data:image/jpg;asset_id,{asset_id}",
+            }
+        ],
+        [
+            {"type": "text", "text": "Describe this image:"},
+            {"type": "image_url", "image_url": image_urls[1]},
+        ],
+    ],
+)
+@pytest.mark.parametrize(
+    "func",
+    ["chat", "stream_chat"],
+)
+def test_vlm_asset_id_chat(
+    vlm_model: str,
+    content: Union[str, List[Union[str, Dict[str, Any]]]],
+    func: str,
+    get_asset_id: str,
+) -> None:
+    def fill(
+        item: Any,
+        asset_id: str,
+    ) -> Union[str, Any]:
+        # do not mutate item, mutation will cause cross test contamination
+        result: Any
+        if isinstance(item, str):
+            result = item.format(asset_id=asset_id)
+        elif isinstance(item, ChatMessage):
+            result = item.model_copy(update={"content": fill(item.content, asset_id)})
+        elif isinstance(item, list):
+            result = [fill(sub_item, asset_id) for sub_item in item]
+        elif isinstance(item, dict):
+            result = {key: fill(value, asset_id) for key, value in item.items()}
+        return result
+
+    asset_id = get_asset_id
+    assert asset_id != ""
+    content = fill(content, asset_id)
+
+    llm = NVIDIAMultiModal(model=vlm_model)
+    if func == "chat":
+        response = llm.chat([ChatMessage(role="user", content=content)])
+        assert isinstance(response, ChatResponse)
+        assert isinstance(response.delta, str)
+    if func == "stream_chat":
+        for token in llm.stream_chat([ChatMessage(role="user", content=content)]):
+            assert isinstance(token.delta, str)
+
+
+@pytest.mark.parametrize(
+    "func",
+    ["chat", "stream_chat"],
+    scope="session",
+)
+@pytest.mark.parametrize(
+    "suffix",
+    ["jpeg", "png", "webp", "gif"],
+    scope="session",
+)
+def test_vlm_image_type_chat(
+    suffix: str, temp_image_path: str, vlm_model: str, func: str
+) -> None:
+    llm = NVIDIAMultiModal(model=vlm_model)
+    if func == "chat":
+        response = llm.chat(
+            [ChatMessage(content=[{"type": "image_url", "image_url": temp_image_path}])]
+        )
+        assert isinstance(response, ChatResponse)
+        assert isinstance(response.delta, str)
+    if func == "stream_chat":
+        for token in llm.stream_chat(
+            [ChatMessage(content=[{"type": "image_url", "image_url": temp_image_path}])]
+        ):
+            assert isinstance(token, ChatResponse)
+
+
+## ------------------------- Async test cases ------------------------- ##
+
+
+@pytest.mark.parametrize(
+    "content",
+    [
+        [ImageDocument(image_url=image_urls[0])],
+        [ImageDocument(image=urlToBase64(image_urls[0]), mimetype="jpeg")],
+    ],
+)
+@pytest.mark.asyncio()
+async def test_vlm_input_style_async(
+    vlm_model: str,
+    content: List[ImageDocument],
+) -> None:
+    llm = NVIDIAMultiModal(model=vlm_model)
+    assert vlm_model in MODELS
+
+    # Await the completion of the async call
+    response = await llm.acomplete(
+        prompt="Describe the Image.", image_documents=content
+    )
+
+    # Ensure the response is a valid CompletionResponse
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.asyncio()
+async def test_vlm_chat_async(vlm_model: str) -> None:
+    llm = NVIDIAMultiModal(model=vlm_model)
+    messages = [
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Describe the first image:"},
+                {"type": "image_url", "image_url": image_urls[0]},
+            ],
+        ),
+        ChatMessage(
+            role="assistant", content="This is a response about the first image."
+        ),
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Now describe this second image:"},
+                {"type": "image_url", "image_url": image_urls[1]},
+            ],
+        ),
+    ]
+    response = await llm.achat(messages)
+    assert isinstance(response, ChatResponse)
+    assert isinstance(response.delta, str)
+
+
+@pytest.mark.asyncio()
+async def test_vlm_chat_async_stream(vlm_model: str) -> None:
+    llm = NVIDIAMultiModal(model=vlm_model)
+    messages = [
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Describe the first image:"},
+                {"type": "image_url", "image_url": image_urls[0]},
+            ],
+        ),
+        ChatMessage(
+            role="assistant", content="This is a response about the first image."
+        ),
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Now describe this second image:"},
+                {"type": "image_url", "image_url": image_urls[1]},
+            ],
+        ),
+    ]
+    async for token in await llm.astream_chat(messages):
+        assert isinstance(token, ChatResponse)
+        assert isinstance(token.delta, str)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_utils.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6805526c2026e808cd6d4894022f89659000b81f
--- /dev/null
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_utils.py
@@ -0,0 +1,68 @@
+import unittest
+from llama_index.multi_modal_llms.nvidia.utils import (
+    infer_image_mimetype_from_base64,
+    infer_image_mimetype_from_file_path,
+    generate_nvidia_multi_modal_chat_message,
+    create_image_content,
+)
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+)
+from llama_index.core.schema import ImageDocument
+
+
+class TestFunctions(unittest.TestCase):
+    def test_infer_image_mimetype_from_base64(self):
+        base64_string = "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB"
+        self.assertEqual(infer_image_mimetype_from_base64(base64_string), "image/jpeg")
+
+    def test_infer_image_mimetype_from_file_path(self):
+        self.assertEqual(infer_image_mimetype_from_file_path("image.jpg"), "jpg")
+        self.assertEqual(infer_image_mimetype_from_file_path("image.png"), "png")
+        self.assertEqual(infer_image_mimetype_from_file_path("image.webp"), "webp")
+        self.assertEqual(infer_image_mimetype_from_file_path("image.gif"), "gif")
+        self.assertEqual(infer_image_mimetype_from_file_path("image.txt"), "png")
+
+    # def test_encode_image(self):
+    #     image_path = "image.jpg"
+    #     encoded_image = encode_image(image_path)
+    #     self.assertIsInstance(encoded_image, str)
+
+    def test_create_image_content(self):
+        image_document = ImageDocument(image="abcd", mimetype="jpeg")
+        content, asset_id = create_image_content(image_document)
+        self.assertEqual(content["type"], "text")
+        self.assertEqual(content["text"], '<img src="data:image/jpeg;base64,abcd" />')
+        self.assertEqual(asset_id, "")
+
+        image_document = ImageDocument(metadata={"asset_id": "12345"}, mimetype="jpeg")
+        content, asset_id = create_image_content(image_document)
+        self.assertEqual(content["type"], "text")
+        self.assertEqual(
+            content["text"], '<img src="data:image/jpeg;asset_id,12345" />'
+        )
+        self.assertEqual(asset_id, "12345")
+
+        image_document = ImageDocument(image_url="https://example.com/image.jpg")
+        content, asset_id = create_image_content(image_document)
+        self.assertEqual(content["type"], "image_url")
+        self.assertEqual(content["image_url"], "https://example.com/image.jpg")
+        self.assertEqual(asset_id, "")
+
+    def test_generate_nvidia_multi_modal_chat_message(self):
+        inputs = [ChatMessage(role="user", content="Hello")]
+        image_documents = [ImageDocument(image="base64_string", mimetype="image/jpeg")]
+        message, extra_headers = generate_nvidia_multi_modal_chat_message(
+            "google/deplot", inputs=inputs, image_documents=image_documents
+        )
+        self.assertEqual(len(message[0]), 2)
+
+        inputs = [ChatMessage(role="user", content="Hello")]
+        image_documents = [
+            ImageDocument(metadata={"asset_id": "12345"}, mimetype="jpeg")
+        ]
+        message, extra_headers = generate_nvidia_multi_modal_chat_message(
+            "google/deplot", inputs=inputs, image_documents=image_documents
+        )
+        self.assertEqual(len(message[0]), 2)
+        self.assertEqual(extra_headers["NVCF-INPUT-ASSET-REFERENCES"], "12345")