diff --git a/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb b/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..5753e690097f05c78c3cb2c05d3263cc42779eec --- /dev/null +++ b/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb @@ -0,0 +1,523 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a href=\"https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/multi_modal/nvidia_multi_modal.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n", + "\n", + "# Multi-Modal LLM using NVIDIA endpoints for image reasoning\n", + "\n", + "In this notebook, we show how to use the NVIDIA multi-modal LLM class (`NVIDIAMultiModal`) for image understanding and reasoning.\n", + "\n", + "We also show the functions currently supported for the NVIDIA multi-modal LLM:\n", + "* `complete` (both sync and async): for a single prompt and a list of images\n", + "* `stream_complete` (both sync and async): for streaming output of `complete`" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet llama-index-multi-modal-llms-nvidia llama-index-embeddings-nvidia llama-index-readers-file" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n", + "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", + "    print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + "else:\n", + "    nvapi_key = getpass.getpass(\"NVAPI Key (starts with nvapi-): \")\n", + "    assert nvapi_key.startswith(\n", + "        \"nvapi-\"\n", + "    ), f\"{nvapi_key[:5]}... is not a valid key\"\n", + "    os.environ[\"NVIDIA_API_KEY\"] = nvapi_key" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal\n", + "import base64\n", + "from llama_index.core.schema import ImageDocument\n", + "from PIL import Image\n", + "import requests\n", + "from io import BytesIO\n", + "\n", + "# import matplotlib.pyplot as plt\n", + "from llama_index.core.multi_modal_llms.generic_utils import load_image_urls\n", + "\n", + "llm = NVIDIAMultiModal()" + ] + },
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize `NVIDIAMultiModal` and Load Images from URLs" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_urls = [\n", + "    \"https://res.cloudinary.com/hello-tickets/image/upload/c_limit,f_auto,q_auto,w_1920/v1640835927/o3pfl41q7m5bj8jardk0.jpg\",\n", + "    \"https://www.visualcapitalist.com/wp-content/uploads/2023/10/US_Mortgage_Rate_Surge-Sept-11-1.jpg\",\n", + "    \"https://www.sportsnet.ca/wp-content/uploads/2023/11/CP1688996471-1040x572.jpg\",\n", + "    # Add yours here!\n", + "]\n", + "\n", + "img_response = requests.get(image_urls[0])\n", + "img = Image.open(BytesIO(img_response.content))\n", + "# plt.imshow(img)\n", + "\n", + "image_url_documents = load_image_urls(image_urls)" + ] + },
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Complete a prompt with a bunch of images" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.complete(\n", + "    prompt=f\"What is this image?\",\n", + "    image_documents=image_url_documents,\n", + ")\n", + "\n", + "print(response)" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await llm.acomplete(\n", + "    prompt=\"tell me about this image\",\n", + "    image_documents=image_url_documents,\n", + ")" + ] + },
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stream Complete a prompt with a bunch of images" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stream_complete_response = llm.stream_complete(\n", + "    prompt=f\"What is this image?\",\n", + "    image_documents=image_url_documents,\n", + ")" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for r in stream_complete_response:\n", + "    print(r.text, end=\"\")" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stream_complete_response = await llm.astream_complete(\n", + "    prompt=f\"What is this image?\",\n", + "    image_documents=image_url_documents,\n", + ")" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "last_element = None\n", + "async for last_element in stream_complete_response:\n", + "    pass\n", + "\n", + "print(last_element)" + ] + },
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Passing an image as a base64 encoded string" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "imgr_content = base64.b64encode(\n", + "    requests.get(\n", + "        \"https://helloartsy.com/wp-content/uploads/kids/cats/how-to-draw-a-small-cat/how-to-draw-a-small-cat-step-6.jpg\"\n", + "    ).content\n", + ").decode(\"utf-8\")\n", + "\n", + "llm.complete(\n", + "    prompt=\"List models in image\",\n", + "    image_documents=[ImageDocument(image=imgr_content, mimetype=\"jpeg\")],\n", + ")" + ] + },
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Passing an image as an NVCF asset\n", + "If your image is sufficiently large or you will pass it multiple times in a chat conversation, you may upload it once and reference it in your chat conversation.\n", + "\n", + "See https://docs.nvidia.com/cloud-functions/user-guide/latest/cloud-function/assets.html for details about how to upload the image." + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "content_type = \"image/jpg\"\n", + "description = \"example-image-from-lc-nv-ai-e-notebook\"\n", + "\n", + "create_response = requests.post(\n", + "    \"https://api.nvcf.nvidia.com/v2/nvcf/assets\",\n", + "    headers={\n", + "        \"Authorization\": f\"Bearer {os.environ['NVIDIA_API_KEY']}\",\n", + "        \"accept\": \"application/json\",\n", + "        \"Content-Type\": \"application/json\",\n", + "    },\n", + "    json={\"contentType\": content_type, \"description\": description},\n", + ")\n", + "create_response.raise_for_status()\n", + "\n", + "upload_response = requests.put(\n", + "    create_response.json()[\"uploadUrl\"],\n", + "    headers={\n", + "        \"Content-Type\": content_type,\n", + "        \"x-amz-meta-nvcf-asset-description\": description,\n", + "    },\n", + "    data=img_response.content,\n", + ")\n", + "upload_response.raise_for_status()\n", + "\n", + "asset_id = create_response.json()[\"assetId\"]\n", + "asset_id" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.stream_complete(\n", + "    prompt=f\"Describe the image\",\n", + "    image_documents=[\n", + "        ImageDocument(metadata={\"asset_id\": asset_id}, mimetype=\"png\")\n", + "    ],\n", + ")" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for r in response:\n", + "    print(r.text, end=\"\")" + ] + },
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Passing images from local files" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import SimpleDirectoryReader\n", + "\n", + "# put your local directory here\n", + "image_documents = SimpleDirectoryReader(\"./tests/data/\").load_data()\n", + "\n", + "llm.complete(\n", + "    prompt=\"Describe the images as an alternative text\",\n", + "    image_documents=image_documents,\n", + ")" + ] + },
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Chat with images" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.llms import ChatMessage\n", + "\n", + "llm.chat(\n", + "    [\n", + "        ChatMessage(\n", + "            role=\"user\",\n", + "            content=[\n", + "                {\"type\": \"text\", \"text\": \"Describe this image:\"},\n", + "                {\"type\": \"image_url\", \"image_url\": image_urls[1]},\n", + "            ],\n", + "        )\n", + "    ]\n", + ")" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.llms import 
ChatMessage\n", + "\n", + "await llm.achat(\n", + " [\n", + " ChatMessage(\n", + " role=\"user\",\n", + " content=[\n", + " {\"type\": \"text\", \"text\": \"Describe this image:\"},\n", + " {\"type\": \"image_url\", \"image_url\": image_urls[1]},\n", + " ],\n", + " )\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm.chat(\n", + " [\n", + " ChatMessage(\n", + " role=\"user\",\n", + " content=[\n", + " {\"type\": \"text\", \"text\": \"Describe the image\"},\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": f'<img src=\"data:{content_type};asset_id,{asset_id}\" />',\n", + " },\n", + " ],\n", + " )\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await llm.achat(\n", + " [\n", + " ChatMessage(\n", + " role=\"user\",\n", + " content=[\n", + " {\"type\": \"text\", \"text\": \"Describe the image\"},\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": f'<img src=\"data:{content_type};asset_id,{asset_id}\" />',\n", + " },\n", + " ],\n", + " )\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stream Chat a prompt with images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.llms import ChatMessage\n", + "\n", + "streaming_resp = llm.stream_chat(\n", + " [\n", + " ChatMessage(\n", + " role=\"user\",\n", + " content=[\n", + " {\"type\": \"text\", \"text\": \"Describe this image:\"},\n", + " {\"type\": \"image_url\", \"image_url\": image_urls[1]},\n", + " ],\n", + " )\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for r in streaming_resp:\n", + " print(r.delta, end=\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.llms import ChatMessage\n", + "\n", + "resp = await llm.astream_chat(\n", + " [\n", + " ChatMessage(\n", + " role=\"user\",\n", + " content=[\n", + " {\"type\": \"text\", \"text\": \"Describe this image:\"},\n", + " {\"type\": \"image_url\", \"image_url\": image_urls[0]},\n", + " ],\n", + " )\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "last_element = None\n", + "async for last_element in resp:\n", + " pass\n", + "\n", + "print(last_element)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.stream_chat(\n", + " [\n", + " ChatMessage(\n", + " role=\"user\",\n", + " content=f\"\"\"<img src=\"data:image/jpg;\n", + " ,{asset_id}\"/>\"\"\",\n", + " )\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for r in response:\n", + " print(r.delta, end=\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/.gitignore b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..990c18de229088f55c6c514fd0f2d49981d1b0e7 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..0896ca890d8bffd60a44fa824f8d57fecd73ee53 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/Makefile b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b9eab05aa370629a4a3de75df3ff64cd53887b68 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. 
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/README.md b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2ce08fc489f52fa26388f108917aca86ee4e0b79 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/README.md @@ -0,0 +1,126 @@ +# LlamaIndex Multi_Modal Integration: Nvidia + +This project integrates Nvidia vlm into the LlamaIndex framework, enabling advanced multimodal capabilities for various AI applications. + +## Features + +- Seamless integration of NVIDIA vlm with LlamaIndex +- Support for multiple state-of-the-art vision-language models: + - [adept/fuyu-8b](https://build.nvidia.com/adept/fuyu-8b) + - [google/deplot](https://build.nvidia.com/google/google-deplot) + - [nvidia/neva-22b](https://build.nvidia.com/nvidia/neva-22b) + - [google/paligemma](https://build.nvidia.com/google/google-paligemma) + - [microsoft/phi-3-vision-128k-instruct](https://build.nvidia.com/microsoft/phi-3-vision-128k-instruct) + - [microsoft/phi-3.5-vision-instruct](https://build.nvidia.com/microsoft/phi-3_5-vision-instruct) + - [nvidia/vila](https://build.nvidia.com/nvidia) + - [meta/llama-3.2-11b-vision-instruct](https://build.nvidia.com/meta/llama-3.2-11b-vision-instruct) + - [meta/llama-3.2-90b-vision-instruct](https://build.nvidia.com/meta/llama-3.2-90b-vision-instruct) +- Easy-to-use interface for multimodal tasks like image captioning and visual question answering +- Configurable model parameters for fine-tuned performance + +--- + +## Installation + +```bash +pip install llama-index-multi-modal-llms-nvidia +``` + +Make sure to set your NVIDIA API key as an environment variable: + +```bash +export NVIDIA_API_KEY=your_api_key_here +``` + +## Usage + +Here's a basic example of how to use the Nvidia vlm integration: + +```python +from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal +from llama_index.core.schema import ImageDocument + +# Initialize the model +model = NVIDIAMultiModal() + +# Prepare your image and prompt +image_document = ImageDocument(image_path="path/to/your/image.jpg") +prompt = "Describe this image in detail." + +# Generate a response +response = model.complete(prompt, image_documents=[image_document]) + +print(response.text) +``` + +### Streaming + +```python +from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal +from llama_index.core.schema import ImageDocument + +# Initialize the model +model = NVIDIAMultiModal() + +# Prepare your image and prompt +image_document = ImageDocument(image_path="downloaded_image.jpg") +prompt = "Describe this image in detail." 
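+
+# A minimal synchronous streaming sketch (one possible way to consume the stream),
+# reusing the `model`, `prompt`, and `image_document` defined above; each streamed
+# chunk exposes the incremental text on `.text`, as in the notebook example:
+for chunk in model.stream_complete(prompt, image_documents=[image_document]):
+    print(chunk.text, end="")
+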
+ +import nest_asyncio +import asyncio + +nest_asyncio.apply() + +response = model.stream_complete( + prompt=f"Describe the image", + image_documents=[ + ImageDocument(metadata={"asset_id": asset_id}, mimetype="png") + ], +) + +for r in response: + print(r.text, end="") +``` + +## Passing an image as an NVCF asset + +If your image is sufficiently large or you will pass it multiple times in a chat conversation, you may upload it once and reference it in your chat conversation + +See https://docs.nvidia.com/cloud-functions/user-guide/latest/cloud-function/assets.html for details about how upload the image. + +```python +import requests + +content_type = "image/jpg" +description = "example-image-from-lc-nv-ai-e-notebook" + +create_response = requests.post( + "https://api.nvcf.nvidia.com/v2/nvcf/assets", + headers={ + "Authorization": f"Bearer {os.environ['NVIDIA_API_KEY']}", + "accept": "application/json", + "Content-Type": "application/json", + }, + json={"contentType": content_type, "description": description}, +) +create_response.raise_for_status() + +upload_response = requests.put( + create_response.json()["uploadUrl"], + headers={ + "Content-Type": content_type, + "x-amz-meta-nvcf-asset-description": description, + }, + data=img_response.content, +) +upload_response.raise_for_status() + +asset_id = create_response.json()["assetId"] + +response = llm.complete( + prompt=f"Describe the image", + image_documents=[ + ImageDocument(metadata={"asset_id": asset_id}, mimetype="png") + ], +) +``` diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/__init__.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f41abaeed484d44ea77cafd4c07e4ea213cc6d --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/__init__.py @@ -0,0 +1,4 @@ +from llama_index.multi_modal_llms.nvidia.base import NVIDIAMultiModal + + +__all__ = ["NVIDIAMultiModal"] diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/base.py new file mode 100644 index 0000000000000000000000000000000000000000..02cea8ed17ed7c6378d1d04dc08e53e04932f23a --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/base.py @@ -0,0 +1,536 @@ +from typing import Any, Dict, List, Optional, Sequence +import requests + +from llama_index.core.base.llms.types import ( + CompletionResponse, + CompletionResponseAsyncGen, + CompletionResponseGen, + MessageRole, + ChatMessage, + ChatResponse, + ChatResponseGen, + ChatResponseAsyncGen, +) +from 
llama_index.core.bridge.pydantic import Field +from llama_index.core.callbacks import CallbackManager +from llama_index.core.constants import ( + DEFAULT_NUM_OUTPUTS, + DEFAULT_TEMPERATURE, +) +from llama_index.core.multi_modal_llms import ( + MultiModalLLM, + MultiModalLLMMetadata, +) +from llama_index.core.schema import ImageNode +from llama_index.core.base.llms.generic_utils import ( + get_from_param_or_env, +) + +from llama_index.core.base.llms.generic_utils import get_from_param_or_env + +from llama_index.multi_modal_llms.nvidia.utils import ( + BASE_URL, + KNOWN_URLS, + NVIDIA_MULTI_MODAL_MODELS, + generate_nvidia_multi_modal_chat_message, + aggregate_msgs, + process_response, +) +import aiohttp +import json + + +class NVIDIAClient: + def __init__( + self, + api_key: str, + timeout: Optional[float] = None, + ): + self.api_key = api_key + self.timeout = timeout + + def _get_headers(self, stream: bool) -> Dict[str, str]: + headers = { + "Authorization": f"Bearer {self.api_key}", + "content-type": "application/json", + "User-Agent": "langchain-nvidia-ai-endpoints", + } + headers["accept"] = "text/event-stream" if stream else "application/json" + return headers + + def get_model_details(self) -> List[str]: + """ + Get model details. + + Returns: + List of models + """ + return list(NVIDIA_MULTI_MODAL_MODELS.keys()) + + def request( + self, + endpoint: str, + stream: bool, + messages: Dict[str, Any], + extra_headers: Dict[str, Any], + **kwargs: Any, + ) -> Dict[str, Any]: + """ + Perform a synchronous request to the NVIDIA API. + + Args: + endpoint (str): The API endpoint to send the request to. + messages (Dict[str, Any]): The request payload. + + Returns: + Dict[str, Any]: The API response. + """ + + def perform_request(): + payload = {"messages": messages, "stream": stream, **kwargs} + headers = { + **self._get_headers(stream=stream), + **extra_headers, + } + response = requests.post( + endpoint, json=payload, headers=headers, stream=stream + ) + response.raise_for_status() + return response + + return perform_request() + + async def request_async( + self, + endpoint: str, + stream: bool, + messages: Dict[str, Any], + extra_headers: Dict[str, Any], + **kwargs: Any, + ) -> Dict[str, Any]: + """ + Perform an asynchronous request to the NVIDIA API. + + Args: + endpoint (str): The API endpoint to send the request to. + messages (Dict[str, Any]): The request payload. + + Returns: + Dict[str, Any]: The API response. + """ + async with aiohttp.ClientSession() as session: + async with session.post( + endpoint, + json={"messages": messages, "stream": stream, **kwargs}, + headers={**self._get_headers(stream=stream), **extra_headers}, + ) as response: + response.raise_for_status() + return await response.json() + + +class NVIDIAMultiModal(MultiModalLLM): + model: str = Field(description="The Multi-Modal model to use from NVIDIA.") + temperature: float = Field(description="The temperature to use for sampling.") + max_tokens: Optional[int] = Field( + description=" The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.", + gt=0, + ) + timeout: float = Field( + default=60.0, + description="The timeout, in seconds, for API requests.", + ge=0, + ) + api_key: str = Field(default=None, description="The NVIDIA API key.", exclude=True) + base_url: str = Field(default=BASE_URL, description="The base URL for NVIDIA API.") + additional_kwargs: Dict[str, Any] = Field( + default_factory=dict, description="Additional kwargs for the NVIDIA API." 
+ ) + + def __init__( + self, + model: str = "microsoft/phi-3-vision-128k-instruct", + temperature: float = DEFAULT_TEMPERATURE, + max_tokens: Optional[int] = 300, + nvidia_api_key: Optional[str] = None, + api_key: Optional[str] = None, + base_url: Optional[str] = BASE_URL, + callback_manager: Optional[CallbackManager] = None, + **kwargs: Any, + ) -> None: + api_key = get_from_param_or_env( + "api_key", + nvidia_api_key or api_key, + "NVIDIA_API_KEY", + "NO_API_KEY_PROVIDED", + ) + + is_hosted = base_url in KNOWN_URLS + + if is_hosted and api_key == "NO_API_KEY_PROVIDED": + raise ValueError( + "An API key is required for the hosted NIM. This will become an error in 0.2.0." + ) + + super().__init__( + model=model, + temperature=temperature, + max_tokens=max_tokens, + api_key=api_key, + api_base=base_url, + callback_manager=callback_manager, + **kwargs, + ) + + @property + def _client(self) -> NVIDIAClient: + return NVIDIAClient(**self._get_credential_kwargs()) + + @classmethod + def class_name(cls) -> str: + return "nvidia_multi_modal_llm" + + @property + def metadata(self) -> MultiModalLLMMetadata: + """Multi Modal LLM metadata.""" + return MultiModalLLMMetadata( + num_output=self.max_tokens or DEFAULT_NUM_OUTPUTS, + model_name=self.model, + ) + + @property + def available_models(self): + return self._client.get_model_details() + + def _get_credential_kwargs(self) -> Dict[str, Any]: + return {"api_key": self.api_key} + + # Model Params for NVIDIA Multi Modal model. + def _get_model_kwargs(self, **kwargs: Any) -> Dict[str, Any]: + if self.model not in NVIDIA_MULTI_MODAL_MODELS: + raise ValueError( + f"Invalid model {self.model}. " + f"Available models are: {list(NVIDIA_MULTI_MODAL_MODELS.keys())}" + ) + base_kwargs = {"model": self.model, "temperature": self.temperature, **kwargs} + if self.max_tokens is not None: + base_kwargs["max_tokens"] = self.max_tokens + return {**base_kwargs, **self.additional_kwargs} + + def _get_response_token_counts(self, raw_response: Any) -> dict: + """Get the token usage reported by the response.""" + if not isinstance(raw_response, dict): + return {} + + usage = raw_response.get("usage", {}) + # NOTE: other model providers that use the NVIDIA client may not report usage + if usage is None: + return {} + + return { + "prompt_tokens": usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + } + + def _complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + prompt=prompt, image_documents=image_documents, model=self.model + ) + message_dict = [{"role": MessageRole.USER, "content": content}] + + response = self._client.request( + endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + stream=False, + messages=message_dict, + extra_headers=extra_headers, + **all_kwargs, + ) + response = response.json() + text = response["choices"][0]["message"]["content"] + return CompletionResponse( + text=text, + raw=response, + additional_kwargs=self._get_response_token_counts(response), + ) + + def _stream_complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponseGen: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + prompt=prompt, image_documents=image_documents, model=self.model + ) + message_dict = 
[{"role": MessageRole.USER, "content": content}] + + def gen() -> CompletionResponseGen: + response = self._client.request( + messages=message_dict, + stream=True, + endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + extra_headers=extra_headers, + **all_kwargs, + ) + for line in response.iter_lines(): + if line and line.strip() != b"data: [DONE]": + line = line.decode("utf-8") + line = line[5:] + + msg, final_line = aggregate_msgs(process_response(line)) + + yield CompletionResponse( + **msg, + additional_kwargs=self._get_response_token_counts(line), + ) + + if final_line: + break + + return gen() + + def complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + return self._complete(prompt, image_documents, **kwargs) + + def stream_complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponseGen: + return self._stream_complete(prompt, image_documents, **kwargs) + + def _chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + inputs=messages, model=self.model + ) + + response = self._client.request( + endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + stream=False, + messages=content, + extra_headers=extra_headers, + **all_kwargs, + ) + response = response.json() + text = response["choices"][0]["message"]["content"] + + return ChatResponse( + delta=text, + message=ChatMessage( + role=response["choices"][0]["message"]["role"], content=text + ), + raw=response, + additional_kwargs=self._get_response_token_counts(response), + ) + + def chat( + self, + messages: Sequence[ChatMessage], + **kwargs: Any, + ) -> ChatResponse: + return self._chat(messages, **kwargs) + + def stream_chat( + self, + messages: Sequence[ChatMessage], + **kwargs: Any, + ) -> ChatResponseGen: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + inputs=messages, model=self.model + ) + + def gen() -> CompletionResponseGen: + response = self._client.request( + messages=content, + stream=True, + endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + extra_headers=extra_headers, + **all_kwargs, + ) + for line in response.iter_lines(): + if line and line.strip() != b"data: [DONE]": + line = line.decode("utf-8") + line = line[5:] + + msg, final_line = aggregate_msgs(process_response(line)) + + role = msg.get("role", MessageRole.ASSISTANT) + additional_kwargs = {} + + yield ChatResponse( + message=ChatMessage( + role=role, + content=msg.get("content"), + additional_kwargs=additional_kwargs, + ), + delta=msg.get("content"), + raw=response, + additional_kwargs=self._get_response_token_counts(line), + ) + + if final_line: + break + + return gen() + + # ===== Async Endpoints ===== + + async def _acomplete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + prompt=prompt, image_documents=image_documents, model=self.model + ) + message_dict = [{"role": MessageRole.USER, "content": content}] + + response_json = await self._client.request_async( + endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + stream=False, + messages=message_dict, + extra_headers=extra_headers, + **all_kwargs, + ) + text = 
response_json["choices"][0]["message"]["content"] + return CompletionResponse( + text=text, + raw=response_json, + additional_kwargs=self._get_response_token_counts(response_json), + ) + + async def acomplete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + return await self._acomplete(prompt, image_documents, **kwargs) + + async def astream_complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponseAsyncGen: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + prompt=prompt, image_documents=image_documents, model=self.model + ) + payload = { + "messages": [{"role": MessageRole.USER, "content": content}], + "stream": True, + **all_kwargs, + } + headers = { + **self._client._get_headers(stream=True), + **extra_headers, + } + + async def gen() -> CompletionResponseAsyncGen: + async with aiohttp.ClientSession() as session: + async with session.post( + NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + json=payload, + headers=headers, + ) as response: + response.raise_for_status() + text = "" + async for line in response.content: + if line and line.strip() != b"data: [DONE]": + line = line.decode("utf-8").strip() + if line.startswith("data:"): + data = json.loads(line[5:]) + + delta = data["choices"][0]["delta"]["content"] + text += delta + + yield CompletionResponse( + text=text, + raw=data, + delta=text, + additional_kwargs=self._get_response_token_counts( + line + ), + ) + + return gen() + + async def _achat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponse: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + inputs=messages, model=self.model + ) + + response_json = await self._client.request_async( + endpoint=NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + stream=False, + messages=content, + extra_headers=extra_headers, + **all_kwargs, + ) + + text = response_json["choices"][0]["message"]["content"] + + return ChatResponse( + delta=text, + message=ChatMessage( + role=response_json["choices"][0]["message"]["role"], content=text + ), + raw=response_json, + additional_kwargs=self._get_response_token_counts(response_json), + ) + + async def achat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponse: + return await self._achat(messages, **kwargs) + + async def astream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseAsyncGen: + all_kwargs = self._get_model_kwargs(**kwargs) + content, extra_headers = generate_nvidia_multi_modal_chat_message( + inputs=messages, model=self.model + ) + payload = {"messages": content, "stream": True, **all_kwargs} + headers = { + **self._client._get_headers(stream=True), + **extra_headers, + } + + async def gen() -> ChatResponseAsyncGen: + async with aiohttp.ClientSession() as session: + async with session.post( + NVIDIA_MULTI_MODAL_MODELS[self.model]["endpoint"], + json=payload, + headers=headers, + ) as response: + response.raise_for_status() + + text = "" + + async for line in response.content: + if line and line.strip() != b"data: [DONE]": + line_text = line.decode("utf-8").strip() + + if line_text.startswith("data:"): + data = json.loads(line_text[5:]) + delta = data["choices"][0]["delta"]["content"] + role = data["choices"][0]["delta"].get( + "role", MessageRole.ASSISTANT + ) + text += delta + + yield ChatResponse( + message=ChatMessage( + 
role=role, + content=delta, + additional_kwargs={}, + ), + delta=delta, + raw=data, + additional_kwargs=self._get_response_token_counts( + data + ), + ) + + return gen() diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/utils.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fa4d00bcd2a658a5072b5caae290ee4b135c9f36 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/llama_index/multi_modal_llms/nvidia/utils.py @@ -0,0 +1,341 @@ +import base64 +import filetype +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from llama_index.core.schema import ImageDocument +import json +import os +import re +import urllib +from llama_index.core.base.llms.types import ChatMessage + +DEFAULT_MODEL = "microsoft/phi-3-vision-128k-instruct" +BASE_URL = "https://ai.api.nvidia.com/v1/" + +KNOWN_URLS = [ + BASE_URL, + "https://integrate.api.nvidia.com/v1", +] + +NVIDIA_MULTI_MODAL_MODELS = { + "adept/fuyu-8b": {"endpoint": f"{BASE_URL}vlm/adept/fuyu-8b", "type": "nv-vlm"}, + "google/deplot": {"endpoint": f"{BASE_URL}vlm/google/deplot", "type": "nv-vlm"}, + "microsoft/kosmos-2": { + "endpoint": f"{BASE_URL}vlm/microsoft/kosmos-2", + "type": "nv-vlm", + }, + "nvidia/neva-22b": {"endpoint": f"{BASE_URL}vlm/nvidia/neva-22b", "type": "nv-vlm"}, + "google/paligemma": { + "endpoint": f"{BASE_URL}vlm/google/paligemma", + "type": "nv-vlm", + }, + "microsoft/phi-3-vision-128k-instruct": { + "endpoint": f"{BASE_URL}vlm/microsoft/phi-3-vision-128k-instruct", + "type": "vlm", + }, + "microsoft/phi-3.5-vision-instruct": { + "endpoint": f"{BASE_URL}microsoft/microsoft/phi-3_5-vision-instruct", + "type": "nv-vlm", + }, + "nvidia/vila": {"endpoint": f"{BASE_URL}vlm/nvidia/vila", "type": "nv-vlm"}, + "meta/llama-3.2-11b-vision-instruct": { + "endpoint": f"{BASE_URL}gr/meta/llama-3.2-11b-vision-instruct/chat/completions", + "type": "vlm", + }, + "meta/llama-3.2-90b-vision-instruct": { + "endpoint": f"{BASE_URL}/gr/meta/llama-3.2-90b-vision-instruct/chat/completions", + "type": "vlm", + }, +} + + +def infer_image_mimetype_from_base64(base64_string) -> str: + # Decode the base64 string + decoded_data = base64.b64decode(base64_string) + + # Use filetype to guess the MIME type + kind = filetype.guess(decoded_data) + + # Return the MIME type if detected, otherwise return None + return kind.mime if kind is not None else None + + +def infer_image_mimetype_from_file_path(image_file_path: str) -> str: + # Get the file extension + file_extension = image_file_path.split(".")[-1].lower() + + # Map file extensions to mimetypes + # Claude 3 support the base64 source type for images, and the image/jpeg, image/png, image/gif, and image/webp media types. + # https://docs.anthropic.com/claude/reference/messages_post + if file_extension in ["jpg", "jpeg", "png", "webp", "gif"]: + return file_extension + return "png" + # Add more mappings for other image types if needed + + # If the file extension is not recognized + + +# Function to encode the image to base64 content +def encode_image(image_path: str) -> str: + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + +def create_image_content(image_document) -> Optional[Dict[str, Any]]: + """ + Create the image content based on the provided image document. 
+ """ + if image_document.image: + mimetype = ( + image_document.mimetype + if image_document.mimetype + else infer_image_mimetype_from_base64(image_document.image) + ) + return { + "type": "text", + "text": f'<img src="data:image/{mimetype};base64,{image_document.image}" />', + }, "" + + elif "asset_id" in image_document.metadata: + asset_id = image_document.metadata["asset_id"] + mimetype = image_document.mimetype if image_document.mimetype else "jpeg" + return { + "type": "text", + "text": f'<img src="data:image/{mimetype};asset_id,{asset_id}" />', + }, asset_id + + elif image_document.image_url and image_document.image_url != "": + mimetype = infer_image_mimetype_from_file_path(image_document.image_url) + return { + "type": "image_url", + "image_url": image_document.image_url, + }, "" + elif ( + "file_path" in image_document.metadata + and image_document.metadata["file_path"] != "" + ): + mimetype = infer_image_mimetype_from_file_path( + image_document.metadata["file_path"] + ) + base64_image = encode_image(image_document.metadata["file_path"]) + return { + "type": "text", + "text": f'<img src="data:image/{mimetype};base64,{base64_image}" />', + }, "" + + return None, None + + +def generate_nvidia_multi_modal_chat_message( + model: str, + prompt: Optional[str] = None, + inputs: Optional[List[ChatMessage]] = [], + image_documents: Optional[Sequence[ImageDocument]] = [], +) -> List[Dict[str, Any]]: + # If image_documents is None, return a text-only chat message + completion_content = [] + asset_ids = [] + extra_headers = {} + model_type = NVIDIA_MULTI_MODAL_MODELS[model]["type"] + + for input in inputs: + if input.content: + asset_ids.extend(_nv_vlm_get_asset_ids(input.content)) + + # Process each image document + for image_document in image_documents: + image_content, asset_id = create_image_content(image_document) + if image_content: + completion_content.append(image_content) + if asset_id: + asset_ids.append(asset_id) + + if len(asset_ids) > 0: + extra_headers["NVCF-INPUT-ASSET-REFERENCES"] = ",".join(asset_ids) + + # Append the text prompt to the completion content + if prompt: + completion_content.append({"type": "text", "text": prompt}) + return completion_content, extra_headers + + inputs = [ + { + "role": message.role, + "content": _nv_vlm_adjust_input(message, model_type).content, + } + for message in inputs + ] + return inputs, extra_headers + + +def process_response(response) -> List[dict]: + """General-purpose response processing for single responses and streams.""" + if hasattr(response, "json"): ## For single response (i.e. non-streaming) + try: + return [response.json()] + except json.JSONDecodeError: + response = str(response.__dict__) + if isinstance(response, str): ## For set of responses (i.e. 
streaming) + msg_list = [] + for msg in response.split("\n\n"): + if "{" not in msg: + continue + msg_list += [json.loads(msg[msg.find("{") :])] + return msg_list + raise ValueError(f"Received ill-formed response: {response}") + + +def aggregate_msgs(msg_list: Sequence[dict]) -> Tuple[dict, bool]: + """Dig out relevant details of aggregated message.""" + content_buffer: Dict[str, Any] = {} + content_holder: Dict[Any, Any] = {} + usage_holder: Dict[Any, Any] = {} #### + finish_reason_holder: Optional[str] = None + is_stopped = False + for msg in msg_list: + usage_holder = msg.get("usage", {}) #### + if "choices" in msg: + ## Tease out ['choices'][0]...['delta'/'message'] + # when streaming w/ usage info, we may get a response + # w/ choices: [] that includes final usage info + choices = msg.get("choices", [{}]) + msg = choices[0] if choices else {} + # TODO: this needs to be fixed, the fact we only + # use the first choice breaks the interface + finish_reason_holder = msg.get("finish_reason", None) + is_stopped = finish_reason_holder == "stop" + msg = msg.get("delta", msg.get("message", msg.get("text", ""))) + if not isinstance(msg, dict): + msg = {"content": msg} + elif "data" in msg: + ## Tease out ['data'][0]...['embedding'] + msg = msg.get("data", [{}])[0] + content_holder = msg + for k, v in msg.items(): + if k in ("content",) and k in content_buffer: + content_buffer[k] += v + else: + content_buffer[k] = v + if is_stopped: + break + content_holder = { + **content_holder, + **content_buffer, + "text": content_buffer["content"], + } + if usage_holder: + content_holder.update(token_usage=usage_holder) #### + if finish_reason_holder: + content_holder.update(finish_reason=finish_reason_holder) + return content_holder, is_stopped + + +def _nv_vlm_adjust_input(message: ChatMessage, model_type: str) -> ChatMessage: + """ + This function converts the OpenAI VLM API input message to + NVIDIA VLM API input message, in place. + + The NVIDIA VLM API input message.content: + { + "role": "user", + "content": [ + ..., + { + "type": "image_url", + "image_url": "{data}" + }, + ... + ] + } + where OpenAI VLM API input message.content: + { + "role": "user", + "content": [ + ..., + { + "type": "image_url", + "image_url": { + "url": "{url | data}" + } + }, + ... + ] + } + + In the process, it accepts a url or file and converts them to + data urls. + """ + if content := message.content: + if isinstance(content, list): + for part in content: + if isinstance(part, dict) and "image_url" in part: + if ( + isinstance(part["image_url"], dict) + and "url" in part["image_url"] + ): + url = _url_to_b64_string(part["image_url"]["url"]) + if model_type == "nv-vlm": + part["image_url"] = url + else: + part["image_url"]["url"] = url + return message + + +def _nv_vlm_get_asset_ids( + content: Union[str, List[Union[str, Dict[str, Any]]]], +) -> List[str]: + """ + Extracts asset IDs from the message content. 
+ + VLM APIs accept asset IDs as input in two forms: + - content = [{"image_url": {"url": "data:image/{type};asset_id,{asset_id}"}}*] + - content = .*<img src="data:image/{type};asset_id,{asset_id}"/>.* + """ + + def extract_asset_id(data: str) -> List[str]: + pattern = re.compile(r'data:image/[^;]+;asset_id,([^"\'\s]+)') + return pattern.findall(data) + + asset_ids = [] + if isinstance(content, str): + asset_ids.extend(extract_asset_id(content)) + elif isinstance(content, list): + for part in content: + if isinstance(part, str): + asset_ids.extend(extract_asset_id(part)) + elif isinstance(part, dict) and "image_url" in part: + image_url = part["image_url"] + if isinstance(image_url, str): + asset_ids.extend(extract_asset_id(image_url)) + elif isinstance(part, dict) and "text" in part: + image_url = part["text"] + if isinstance(image_url, str): + asset_ids.extend(extract_asset_id(image_url)) + + return asset_ids + + +def _is_url(s: str) -> bool: + try: + result = urllib.parse.urlparse(s) + return all([result.scheme, result.netloc]) + except Exception as e: + raise f"Unable to parse URL: {e}" + return False + + +def _url_to_b64_string(image_source: str) -> str: + try: + if _is_url(image_source): + return image_source + elif image_source.startswith("data:image"): + return image_source + elif os.path.exists(image_source): + encoded = encode_image(image_source) + image_type = infer_image_mimetype_from_base64(encoded) + return f"data:image/{image_type};base64,{encoded}" + else: + raise ValueError( + "The provided string is not a valid URL, base64, or file path." + ) + except Exception as e: + raise ValueError(f"Unable to process the provided image source: {e}") diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..9555020026cb10a9348f5fe873ce6676eaeed519 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.multi_modal_llms.nvidia" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name <you@example.com>"] +description = "llama-index multi_modal nvidia integration" +license = "MIT" +name = "llama-index-multi-modal-llms-nvidia" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.0" +filetype = "^1.2.0" + +[tool.poetry.group.dev.dependencies] +aiohttp = "^3.10.10" +black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} +codespell = {extras = ["toml"], version = ">=v2.2.6"} +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pillow = "10.0.0" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "^8.2" +pytest-asyncio = "^0.24.0" +pytest-mock = "3.11.1" +ruff = "0.0.292" 
+tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..45d59ac8248a2970130a0a35eb97b7c847273629 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/BUILD @@ -0,0 +1,5 @@ +python_tests() + +python_test_utils( + name="test_utils", +) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/__init__.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/conftest.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..63a843400e84d2c4a59f6a9b26558be61785b000 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/conftest.py @@ -0,0 +1,59 @@ +import pytest +import os + +from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal as Interface +from llama_index.multi_modal_llms.nvidia.utils import DEFAULT_MODEL + +from typing import Generator + + +# this fixture is used to mask the NVIDIA_API_KEY environment variable and restore it +# after the test. it also returns the value of the NVIDIA_API_KEY environment variable +# before it was masked so that it can be used in the test. 
+@pytest.fixture() +def masked_env_var() -> Generator[str, None, None]: + var = "NVIDIA_API_KEY" + try: + if val := os.environ.get(var, None): + del os.environ[var] + yield val + finally: + if val: + os.environ[var] = val + + +def pytest_collection_modifyitems(config, items): + if "NVIDIA_API_KEY" not in os.environ: + skip_marker = pytest.mark.skip( + reason="requires NVIDIA_API_KEY environment variable" + ) + for item in items: + item.add_marker(skip_marker) + + +def pytest_addoption(parser: pytest.Parser) -> None: + parser.addoption( + "--all-models", + action="store_true", + help="Run tests across all models", + ) + parser.addoption( + "--model-id", + action="store", + help="Run tests for a specific chat model", + ) + parser.addoption( + "--nim-endpoint", + type=str, + help="Run tests using NIM mode", + ) + + +def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: + if "vlm_model" in metafunc.fixturenames: + models = [DEFAULT_MODEL] + if model := metafunc.config.getoption("--model-id"): + models = [model] + elif metafunc.config.getoption("--all-models"): + models = [model.id for model in Interface().available_models] + metafunc.parametrize("vlm_model", models, ids=models) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_api_key.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_api_key.py new file mode 100644 index 0000000000000000000000000000000000000000..3e3480ac101b27d0b2bb2bf56e6ba4d9047c96c6 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_api_key.py @@ -0,0 +1,52 @@ +import os + +import pytest + +from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal + +from typing import Any +from llama_index.core.schema import ImageDocument + + +def get_api_key(instance: Any) -> str: + return instance.api_key + + +def test_create_default_url_without_api_key(masked_env_var: str) -> None: + with pytest.raises(ValueError) as err_msg: + NVIDIAMultiModal() + assert ( + str(err_msg.value) + == "An API key is required for the hosted NIM. This will become an error in 0.2.0." 
+ ) + + +@pytest.mark.parametrize("param", ["nvidia_api_key", "api_key"]) +def test_create_with_api_key(param: str, masked_env_var: str) -> None: + instance = NVIDIAMultiModal(**{param: "just testing no failure"}) + assert get_api_key(instance) == "just testing no failure" + + +def test_api_key_priority(masked_env_var: str) -> None: + try: + os.environ["NVIDIA_API_KEY"] = "ENV" + assert get_api_key(NVIDIAMultiModal()) == "ENV" + assert get_api_key(NVIDIAMultiModal(nvidia_api_key="PARAM")) == "PARAM" + assert get_api_key(NVIDIAMultiModal(api_key="PARAM")) == "PARAM" + assert ( + get_api_key(NVIDIAMultiModal(api_key="LOW", nvidia_api_key="HIGH")) + == "HIGH" + ) + finally: + # we must clean up environ or it may impact other tests + del os.environ["NVIDIA_API_KEY"] + + +@pytest.mark.integration() +def test_bogus_api_key_error(vlm_model: str, masked_env_var: str) -> None: + client = NVIDIAMultiModal(model=vlm_model, nvidia_api_key="BOGUS") + with pytest.raises(Exception) as exc_info: + client.complete( + prompt="xyz", image_documents=[ImageDocument(image_url="https://xyz.com")] + ) + assert "401" in str(exc_info.value) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_available_models.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_available_models.py new file mode 100644 index 0000000000000000000000000000000000000000..829622e8724401a47135a0d12c3edb51e00c94d3 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_available_models.py @@ -0,0 +1,11 @@ +import pytest + +from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal + + +@pytest.mark.integration() +def test_available_models() -> None: + models = NVIDIAMultiModal().available_models + assert models + assert isinstance(models, list) + assert all(isinstance(model, str) for model in models) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_multi_modal_nvidia.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_multi_modal_nvidia.py new file mode 100644 index 0000000000000000000000000000000000000000..afe97b2d210c35abe9058ea7d825f4ab8692c950 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_multi_modal_nvidia.py @@ -0,0 +1,436 @@ +from llama_index.core.multi_modal_llms.base import MultiModalLLM +from llama_index.multi_modal_llms.nvidia import NVIDIAMultiModal +from llama_index.multi_modal_llms.nvidia.utils import ( + NVIDIA_MULTI_MODAL_MODELS, +) +import base64 +import os +from typing import Any, Dict, List, Union + +import pytest +import requests +from llama_index.core.base.llms.types import ( + CompletionResponse, + ChatMessage, + ChatResponse, +) +from llama_index.core.schema import ImageDocument +import numpy as np +from PIL import Image +import tempfile + +# TODO: multiple texts +# TODO: accuracy tests + +# +# API Specification - +# +# - User message may contain 1 or more image_url +# - url is either a url to an image or base64 encoded image +# - format for base64 is "data:image/png;{type}},..." 
+# - supported image types are png, jpeg (or jpg), webp, gif (non-animated) +# + +# +# note: differences between api catalog and openai api +# - openai api supports server-side image download, api catalog does not consistently +# - NVIDIAMultiModal does client side download to simulate the same behavior +# - NVIDIAMultiModal will automatically read local files and convert them to base64 +# - openai api always uses {"image_url": {"url": "..."}} +# where api catalog sometimes uses {"image_url": "..."} +# + +image_urls = [ + "https://res.cloudinary.com/hello-tickets/image/upload/c_limit,f_auto,q_auto,w_1920/v1640835927/o3pfl41q7m5bj8jardk0.jpg", + "https://www.visualcapitalist.com/wp-content/uploads/2023/10/US_Mortgage_Rate_Surge-Sept-11-1.jpg", + "https://www.sportsnet.ca/wp-content/uploads/2023/11/CP1688996471-1040x572.jpg", + # Add yours here! +] + +MODELS = list(NVIDIA_MULTI_MODAL_MODELS.keys()) + + +def test_embedding_class(): + names_of_base_classes = [b.__name__ for b in NVIDIAMultiModal.__mro__] + assert MultiModalLLM.__name__ in names_of_base_classes + + +def test_init(): + m = NVIDIAMultiModal(max_tokens=400) + assert m.max_tokens == 400 + + +def urlToBase64(url): + return base64.b64encode(requests.get(url).content).decode("utf-8") + + +@pytest.fixture(scope="session") +def temp_image_path(suffix: str): + # Create a white square image + white_square = np.ones((100, 100, 3), dtype=np.uint8) * 255 + image = Image.fromarray(white_square) + + # Create a temporary file + with tempfile.NamedTemporaryFile(suffix=f".{suffix}", delete=False) as temp_file: + image.save(temp_file, format=suffix.upper()) + temp_path = temp_file.name + + yield temp_path + + # Clean up the temporary file after the test + os.unlink(temp_path) + + +@pytest.fixture(scope="session") +def get_asset_id(): + content_type = "image/jpg" + description = "example-image-from-lc-nv-ai-e-notebook" + + create_response = requests.post( + "https://api.nvcf.nvidia.com/v2/nvcf/assets", + headers={ + "Authorization": f"Bearer {os.environ['NVIDIA_API_KEY']}", + "accept": "application/json", + "Content-Type": "application/json", + }, + json={"contentType": content_type, "description": description}, + ) + create_response.raise_for_status() + + upload_response = requests.put( + create_response.json()["uploadUrl"], + headers={ + "Content-Type": content_type, + "x-amz-meta-nvcf-asset-description": description, + }, + data=requests.get(image_urls[0]).content, + ) + upload_response.raise_for_status() + + return create_response.json()["assetId"] + + +def test_class(): + emb = NVIDIAMultiModal(api_key="BOGUS") + assert isinstance(emb, MultiModalLLM) + + +@pytest.mark.parametrize( + "content", + [ + [ImageDocument(image_url=image_urls[0])], + [ImageDocument(image=urlToBase64(image_urls[0]), mimetype="jpeg")], + ], +) +@pytest.mark.parametrize( + "func", + ["invoke", "stream"], +) +def test_vlm_input_style( + vlm_model: str, + content: List[ImageDocument], + func: str, +) -> None: + llm = NVIDIAMultiModal(model=vlm_model) + assert vlm_model in MODELS + if func == "invoke": + response = llm.complete(prompt="Describe the Image.", image_documents=content) + assert isinstance(response, CompletionResponse) + if func == "stream": + for token in llm.stream_complete( + prompt="Describe the Image.", image_documents=content + ): + assert isinstance(token.text, str) + + +@pytest.mark.parametrize( + "suffix", + ["jpeg", "png", "webp", "gif"], + scope="session", +) +def test_vlm_image_type( + suffix: str, + temp_image_path: str, + vlm_model: str, +) -> 
None:
+    llm = NVIDIAMultiModal(model=vlm_model)
+    response = llm.complete(
+        "Describe image", image_documents=[ImageDocument(image_path=temp_image_path)]
+    )
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.skipif(
+    not os.path.isfile("data/nvidia-picasso-large.png"),
+    reason="requires data/nvidia-picasso-large.png",
+)
+def test_vlm_image_large(
+    vlm_model: str,
+) -> None:
+    chat = NVIDIAMultiModal(model=vlm_model)
+    response = chat.complete(
+        prompt="Describe image",
+        image_documents=[ImageDocument(image_path="data/nvidia-picasso-large.png")],
+    )
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.parametrize(
+    "suffix",
+    ["jpeg", "png", "webp", "gif"],
+    scope="session",
+)
+def test_vlm_two_images(
+    suffix: str,
+    temp_image_path: str,
+    vlm_model: str,
+) -> None:
+    chat = NVIDIAMultiModal(model=vlm_model)
+    response = chat.complete(
+        prompt="Describe image",
+        image_documents=[
+            ImageDocument(image_path=temp_image_path),
+            ImageDocument(image_path=temp_image_path),
+        ],
+    )
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.text, str)
+
+
+@pytest.mark.parametrize(
+    "content",
+    [
+        [ImageDocument(metadata={"asset_id": ""})],
+    ],
+)
+@pytest.mark.parametrize(
+    "func",
+    ["invoke", "stream"],
+)
+def test_vlm_asset_id(
+    vlm_model: str,
+    content: List[ImageDocument],
+    func: str,
+    get_asset_id: str,
+) -> None:
+    assert isinstance(content[0], ImageDocument)
+    content[0].metadata["asset_id"] = get_asset_id
+
+    assert content[0].metadata["asset_id"] != ""
+
+    chat = NVIDIAMultiModal(model=vlm_model)
+    if func == "invoke":
+        response = chat.complete(prompt="Describe image", image_documents=content)
+        assert isinstance(response, CompletionResponse)
+        assert isinstance(response.text, str)
+    if func == "stream":
+        for token in chat.stream_complete(
+            prompt="Describe image", image_documents=content
+        ):
+            assert isinstance(token.text, str)
+
+
+## ------------------------- chat/stream_chat test cases ------------------------- ##
+
+
+@pytest.mark.parametrize(
+    "func",
+    ["chat", "stream_chat"],
+)
+def test_stream_chat_multiple_messages(vlm_model: str, func: str) -> None:
+    """Test chat and stream_chat with multiple messages and images."""
+    llm = NVIDIAMultiModal(model=vlm_model)
+
+    messages = [
+        ChatMessage(
+            role="user",
+            content=[
+                {"type": "text", "text": "Describe the first image:"},
+                {"type": "image_url", "image_url": image_urls[0]},
+            ],
+        ),
+        ChatMessage(
+            role="assistant", content="This is a response about the first image."
+ ), + ChatMessage( + role="user", + content=[ + {"type": "text", "text": "Now describe this second image:"}, + {"type": "image_url", "image_url": image_urls[1]}, + ], + ), + ] + + if func == "chat": + response = llm.chat(messages) + assert isinstance(response, ChatResponse) + assert isinstance(response.delta, str) + if func == "stream_chat": + for token in llm.stream_chat(messages): + assert isinstance(token.delta, str) + + +@pytest.mark.parametrize( + "content", + [ + """<img src="data:image/jpg;asset_id,{asset_id}"/>""", + [ + { + "type": "image_url", + "image_url": "data:image/jpg;asset_id,{asset_id}", + } + ], + [ + {"type": "text", "text": "Describe this image:"}, + {"type": "image_url", "image_url": image_urls[1]}, + ], + ], +) +@pytest.mark.parametrize( + "func", + ["chat", "stream_chat"], +) +def test_vlm_asset_id_chat( + vlm_model: str, + content: Union[str, List[Union[str, Dict[str, Any]]]], + func: str, + get_asset_id: str, +) -> None: + def fill( + item: Any, + asset_id: str, + ) -> Union[str, Any]: + # do not mutate item, mutation will cause cross test contamination + result: Any + if isinstance(item, str): + result = item.format(asset_id=asset_id) + elif isinstance(item, ChatMessage): + result = item.model_copy(update={"content": fill(item.content, asset_id)}) + elif isinstance(item, list): + result = [fill(sub_item, asset_id) for sub_item in item] + elif isinstance(item, dict): + result = {key: fill(value, asset_id) for key, value in item.items()} + return result + + asset_id = get_asset_id + assert asset_id != "" + content = fill(content, asset_id) + + llm = NVIDIAMultiModal(model=vlm_model) + if func == "chat": + response = llm.chat([ChatMessage(role="user", content=content)]) + assert isinstance(response, ChatResponse) + assert isinstance(response.delta, str) + if func == "stream_chat": + for token in llm.stream_chat([ChatMessage(role="user", content=content)]): + assert isinstance(token.delta, str) + + +@pytest.mark.parametrize( + "func", + ["chat", "stream_chat"], + scope="session", +) +@pytest.mark.parametrize( + "suffix", + ["jpeg", "png", "webp", "gif"], + scope="session", +) +def test_vlm_image_type_chat( + suffix: str, temp_image_path: str, vlm_model: str, func: str +) -> None: + llm = NVIDIAMultiModal(model=vlm_model) + if func == "chat": + response = llm.chat( + [ChatMessage(content=[{"type": "image_url", "image_url": temp_image_path}])] + ) + assert isinstance(response, ChatResponse) + assert isinstance(response.delta, str) + if func == "stream_chat": + for token in llm.stream_chat( + [ChatMessage(content=[{"type": "image_url", "image_url": temp_image_path}])] + ): + assert isinstance(token, ChatResponse) + + +## ------------------------- Async test cases ------------------------- ## + + +@pytest.mark.parametrize( + "content", + [ + [ImageDocument(image_url=image_urls[0])], + [ImageDocument(image=urlToBase64(image_urls[0]), mimetype="jpeg")], + ], +) +@pytest.mark.asyncio() +async def test_vlm_input_style_async( + vlm_model: str, + content: List[ImageDocument], +) -> None: + llm = NVIDIAMultiModal(model=vlm_model) + assert vlm_model in MODELS + + # Await the completion of the async call + response = await llm.acomplete( + prompt="Describe the Image.", image_documents=content + ) + + # Ensure the response is a valid CompletionResponse + assert isinstance(response, CompletionResponse) + assert isinstance(response.text, str) + + +@pytest.mark.asyncio() +async def test_vlm_chat_async(vlm_model: str) -> None: + llm = NVIDIAMultiModal(model=vlm_model) + messages = [ 
+ ChatMessage( + role="user", + content=[ + {"type": "text", "text": "Describe the first image:"}, + {"type": "image_url", "image_url": image_urls[0]}, + ], + ), + ChatMessage( + role="assistant", content="This is a response about the first image." + ), + ChatMessage( + role="user", + content=[ + {"type": "text", "text": "Now describe this second image:"}, + {"type": "image_url", "image_url": image_urls[1]}, + ], + ), + ] + response = await llm.achat(messages) + assert isinstance(response, ChatResponse) + assert isinstance(response.delta, str) + + +@pytest.mark.asyncio() +async def test_vlm_chat_async_stream(vlm_model: str) -> None: + llm = NVIDIAMultiModal(model=vlm_model) + messages = [ + ChatMessage( + role="user", + content=[ + {"type": "text", "text": "Describe the first image:"}, + {"type": "image_url", "image_url": image_urls[0]}, + ], + ), + ChatMessage( + role="assistant", content="This is a response about the first image." + ), + ChatMessage( + role="user", + content=[ + {"type": "text", "text": "Now describe this second image:"}, + {"type": "image_url", "image_url": image_urls[1]}, + ], + ), + ] + async for token in await llm.astream_chat(messages): + assert isinstance(token, ChatResponse) + assert isinstance(token.delta, str) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_utils.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6805526c2026e808cd6d4894022f89659000b81f --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nvidia/tests/test_utils.py @@ -0,0 +1,68 @@ +import unittest +from llama_index.multi_modal_llms.nvidia.utils import ( + infer_image_mimetype_from_base64, + infer_image_mimetype_from_file_path, + generate_nvidia_multi_modal_chat_message, + create_image_content, +) +from llama_index.core.base.llms.types import ( + ChatMessage, +) +from llama_index.core.schema import ImageDocument + + +class TestFunctions(unittest.TestCase): + def test_infer_image_mimetype_from_base64(self): + base64_string = "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB" + self.assertEqual(infer_image_mimetype_from_base64(base64_string), "image/jpeg") + + def test_infer_image_mimetype_from_file_path(self): + self.assertEqual(infer_image_mimetype_from_file_path("image.jpg"), "jpg") + self.assertEqual(infer_image_mimetype_from_file_path("image.png"), "png") + self.assertEqual(infer_image_mimetype_from_file_path("image.webp"), "webp") + self.assertEqual(infer_image_mimetype_from_file_path("image.gif"), "gif") + self.assertEqual(infer_image_mimetype_from_file_path("image.txt"), "png") + + # def test_encode_image(self): + # image_path = "image.jpg" + # encoded_image = encode_image(image_path) + # self.assertIsInstance(encoded_image, str) + + def test_create_image_content(self): + image_document = ImageDocument(image="abcd", mimetype="jpeg") + content, asset_id = create_image_content(image_document) + self.assertEqual(content["type"], "text") + self.assertEqual(content["text"], '<img src="data:image/jpeg;base64,abcd" />') + self.assertEqual(asset_id, "") + + image_document = ImageDocument(metadata={"asset_id": "12345"}, mimetype="jpeg") + content, asset_id = create_image_content(image_document) + self.assertEqual(content["type"], "text") + self.assertEqual( + content["text"], '<img src="data:image/jpeg;asset_id,12345" />' + ) + self.assertEqual(asset_id, 
"12345") + + image_document = ImageDocument(image_url="https://example.com/image.jpg") + content, asset_id = create_image_content(image_document) + self.assertEqual(content["type"], "image_url") + self.assertEqual(content["image_url"], "https://example.com/image.jpg") + self.assertEqual(asset_id, "") + + def test_generate_nvidia_multi_modal_chat_message(self): + inputs = [ChatMessage(role="user", content="Hello")] + image_documents = [ImageDocument(image="base64_string", mimetype="image/jpeg")] + message, extra_headers = generate_nvidia_multi_modal_chat_message( + "google/deplot", inputs=inputs, image_documents=image_documents + ) + self.assertEqual(len(message[0]), 2) + + inputs = [ChatMessage(role="user", content="Hello")] + image_documents = [ + ImageDocument(metadata={"asset_id": "12345"}, mimetype="jpeg") + ] + message, extra_headers = generate_nvidia_multi_modal_chat_message( + "google/deplot", inputs=inputs, image_documents=image_documents + ) + self.assertEqual(len(message[0]), 2) + self.assertEqual(extra_headers["NVCF-INPUT-ASSET-REFERENCES"], "12345")