From 84b73a640f5c4d0af1d37c75544b21c678e05eb5 Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Tue, 23 Apr 2024 12:40:01 -0700
Subject: [PATCH] Improving Docs for NVIDIA TensorRT and Triton integrations
 (#13060)

---
 docs/docs/examples/llm/nvidia_tensorrt.ipynb  | 129 +++++++++++-------
 docs/docs/examples/llm/nvidia_triton.ipynb    | 124 +++++++++++------
 .../llms/nvidia_tensorrt/__init__.py          |  26 ++++
 .../llama_index/llms/nvidia_tensorrt/base.py  |  31 ++++-
 .../llama_index/llms/nvidia_tensorrt/utils.py |  26 ++++
 .../pyproject.toml                            |   2 +-
 .../llms/nvidia_triton/__init__.py            |  26 ++++
 .../llama_index/llms/nvidia_triton/base.py    |  31 ++++-
 .../llama_index/llms/nvidia_triton/utils.py   |  51 ++++++-
 .../pyproject.toml                            |   2 +-
 10 files changed, 342 insertions(+), 106 deletions(-)

diff --git a/docs/docs/examples/llm/nvidia_tensorrt.ipynb b/docs/docs/examples/llm/nvidia_tensorrt.ipynb
index 142ae1522f..32e030f76f 100644
--- a/docs/docs/examples/llm/nvidia_tensorrt.ipynb
+++ b/docs/docs/examples/llm/nvidia_tensorrt.ipynb
@@ -29,35 +29,66 @@
    "metadata": {},
    "source": [
     "## TensorRT-LLM Environment Setup\n",
-    "Since TensorRT-LLM is a SDK for interacting with local models in process there are a few environment steps that must be followed to ensure that the TensorRT-LLM setup can be used.\n",
+    "Since TensorRT-LLM is an SDK for interacting with local models in-process, there are a few environment setup steps that must be followed to ensure that the TensorRT-LLM setup can be used. Please note that NVIDIA CUDA 12.2 or higher is currently required to run TensorRT-LLM.\n",
     "\n",
-    "1. Nvidia Cuda 12.2 or higher is currently required to run TensorRT-LLM\n",
-    "2. Install `tensorrt_llm` via pip with `pip3 install tensorrt_llm -U --extra-index-url https://pypi.nvidia.com`\n",
-    "3. For this example we will use Llama2. The Llama2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/trt-llm-rag-windows/blob/release/1.0/README.md#building-trt-engine)\n",
-    "   * The following files will be created from following the stop above\n",
-    "   * `Llama_float16_tp1_rank0.engine`: The main output of the build script, containing the executable graph of operations with the model weights embedded.\n",
-    "   * `config.json`: Includes detailed information about the model, like its general structure and precision, as well as information about which plug-ins were incorporated into the engine.\n",
-    "   * `model.cache`: Caches some of the timing and optimization information from model compilation, making successive builds quicker.\n",
-    "4. `mkdir model`\n",
-    "5. Move all of the files mentioned above to the model directory."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%pip install llama-index-llms-nvidia-tensorrt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install tensorrt_llm==0.7.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121"
+    "In this tutorial, we will show how to use the connector with the GPT2 model.\n",
+    "For the best experience, we recommend following the\n",
+    "[Installation](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.8.0?tab=readme-ov-file#installation) process on the\n",
+    "official [TensorRT-LLM GitHub](https://github.com/NVIDIA/TensorRT-LLM).\n",
+    "\n",
+    "The following steps show how to set up your model with TensorRT-LLM v0.8.0 for x86_64 users.\n",
+    "\n",
+    "1. Obtain and start the basic Docker image environment:\n",
+    "```\n",
+    "docker run --rm --runtime=nvidia --gpus all --entrypoint /bin/bash -it nvidia/cuda:12.1.0-devel-ubuntu22.04\n",
+    "```\n",
+    "\n",
+    "2. Install dependencies. TensorRT-LLM requires Python 3.10:\n",
+    "```\n",
+    "apt-get update && apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget\n",
+    "```\n",
+    "3. Install the latest stable version (corresponding to the release branch) of TensorRT-LLM. We are using version 0.8.0, but for the most up-to-date release,\n",
+    "please refer to the [official release page](https://github.com/NVIDIA/TensorRT-LLM/releases).\n",
+    "```\n",
+    "pip3 install tensorrt_llm==0.8.0 -U --extra-index-url https://pypi.nvidia.com\n",
+    "```\n",
+    "\n",
+    "4. Check the installation:\n",
+    "```\n",
+    "python3 -c \"import tensorrt_llm\"\n",
+    "```\n",
+    "The above command should not produce any errors.\n",
+    "\n",
+    "5. For this example, we will use GPT2. The GPT2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#usage).\n",
+    "   * First, inside the container we started in step 1, clone the TensorRT-LLM repository:\n",
+    "   ```\n",
+    "   git clone --branch v0.8.0 https://github.com/NVIDIA/TensorRT-LLM.git\n",
+    "   ```\n",
+    "   * Install the requirements for the GPT2 model:\n",
+    "   ```\n",
+    "   cd TensorRT-LLM/examples/gpt/ && pip install -r requirements.txt\n",
+    "   ```\n",
+    "   * Download the HF GPT2 model:\n",
+    "   ```\n",
+    "   rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2\n",
+    "   cd gpt2\n",
+    "   rm pytorch_model.bin model.safetensors\n",
+    "   wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin\n",
+    "   cd ..\n",
+    "   ```\n",
+    "   * Convert the weights from HF Transformers to TensorRT-LLM format:\n",
+    "   ```\n",
+    "   python3 hf_gpt_convert.py -i gpt2 -o ./c-model/gpt2 --tensor-parallelism 1 --storage-type float16\n",
+    "   ```\n",
+    "   * Build the TensorRT engine:\n",
+    "   ```\n",
+    "   python3 build.py --model_dir=./c-model/gpt2/1-gpu --use_gpt_attention_plugin --remove_input_padding\n",
+    "   ```\n",
+    "   \n",
+    "6. Install the `llama-index-llms-nvidia-tensorrt` package:\n",
+    "   ```\n",
+    "   pip install llama-index-llms-nvidia-tensorrt\n",
+    "   ```"
    ]
   },
   {
@@ -75,41 +106,39 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
+    "```python\n",
     "from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM\n",
     "\n",
-    "\n",
-    "def completion_to_prompt(completion: str) -> str:\n",
-    "    \"\"\"\n",
-    "    Given a completion, return the prompt using llama2 format.\n",
-    "    \"\"\"\n",
-    "    return f\"<s> [INST] {completion} [/INST] \"\n",
-    "\n",
-    "\n",
     "llm = LocalTensorRTLLM(\n",
-    "    model_path=\"./model\",\n",
-    "    engine_name=\"llama_float16_tp1_rank0.engine\",\n",
-    "    tokenizer_dir=\"meta-llama/Llama-2-13b-chat\",\n",
-    "    completion_to_prompt=completion_to_prompt,\n",
-    ")"
+    "    model_path=\"./engine_outputs\",\n",
+    "    engine_name=\"gpt_float16_tp1_rank0.engine\",\n",
+    "    tokenizer_dir=\"gpt2\",\n",
+    "    max_new_tokens=40,\n",
+    ")\n",
+    "\n",
+    "resp = llm.complete(\"Who is Harry Potter?\")\n",
+    "print(str(resp))\n",
+    "```"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "resp = llm.complete(\"Who is Paul Graham?\")\n",
-    "print(str(resp))"
+    "The expected response should look like:\n",
+    "```\n",
+    "Harry Potter is a fictional character created by J.K. Rowling in her first novel, Harry Potter and the Philosopher's Stone. The character is a wizard who lives in the fictional town#\n",
+    "```"
    ]
   }
  ],
  "metadata": {
+  "colab": {
+   "provenance": []
+  },
   "kernelspec": {
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
@@ -133,5 +162,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 0
 }
diff --git a/docs/docs/examples/llm/nvidia_triton.ipynb b/docs/docs/examples/llm/nvidia_triton.ipynb
index f6ee2ef4ec..ab33d0f7dc 100644
--- a/docs/docs/examples/llm/nvidia_triton.ipynb
+++ b/docs/docs/examples/llm/nvidia_triton.ipynb
@@ -18,93 +18,130 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "[NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) provides a cloud and edge inferencing solution optimized for both CPUs and GPUs. This connector allows llama_index to remotely interact with TRT-LLM models deployed with Triton."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launching Triton Inference Server"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This connector requires a running instance of Triton Inference Server with a TensorRT-LLM model.\n",
+    "For this example, we will use the [Triton Command Line Interface (Triton CLI)](https://github.com/triton-inference-server/triton_cli) to deploy a GPT2 model on Triton.\n",
+    "\n",
+    "When using Triton and related tools on your host (outside of a Triton container image), there are a number of additional dependencies that may be required for various workflows. Most system dependency issues can be resolved by installing and running the CLI from within the latest corresponding `tritonserver` container image, which should have all necessary system dependencies installed.\n",
     "\n",
-    "Nvidia's Triton is an inference server that provides API access to hosted LLM models. This connector allows for llama_index to remotely interact with a Triton inference server over GRPC to accelerate inference operations.\n",
+    "For TRT-LLM, you can use the `nvcr.io/nvidia/tritonserver:{YY.MM}-trtllm-python-py3` image, where `YY.MM` corresponds to the version of `tritonserver`; in this example, we're using the 24.02 version of the container. To get the list of available versions, please refer to [Triton Inference Server NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver).\n",
     "\n",
-    "[Triton Inference Server Github](https://github.com/triton-inference-server/server)"
+    "To start the container, run the following in your Linux terminal:\n",
+    "\n",
+    "```\n",
+    "docker run -ti --gpus all --network=host --shm-size=1g --ulimit memlock=-1 nvcr.io/nvidia/tritonserver:24.02-trtllm-python-py3\n",
+    "```\n",
+    "Next, we'll need to install the required dependencies:\n",
+    "```\n",
+    "pip install \\\n",
+    "    \"psutil\" \\\n",
+    "    \"pynvml>=11.5.0\" \\\n",
+    "    \"torch==2.1.2\" \\\n",
+    "    \"tensorrt_llm==0.8.0\" --extra-index-url https://pypi.nvidia.com/\n",
+    "```\n",
+    "Finally, run the following to install the Triton CLI."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Install tritonclient\n",
-    "Since we are interacting with the Triton inference server we will need to install the `tritonclient` package. The `tritonclient` package.\n",
-    "\n",
-    "`tritonclient` can be easily installed using `pip3 install tritonclient`."
+    "```\n",
+    "pip install git+https://github.com/triton-inference-server/triton_cli.git\n",
+    "```"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "%pip install llama-index-llms-nvidia-triton"
+    "To generate a model repository for the GPT2 model and start an instance of Triton Server, run:\n",
+    "```\n",
+    "triton remove -m all\n",
+    "triton import -m gpt2 --backend tensorrtllm\n",
+    "triton start &\n",
+    "```\n",
+    "Please note that by default Triton listens on the HTTP port `localhost:8000` and the gRPC port `localhost:8001`. The latter will be used in this example.\n",
+    "For any additional how-tos and questions, please reach out through the [Triton Command Line Interface (Triton CLI)](https://github.com/triton-inference-server/triton_cli) issues.\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "!pip3 install tritonclient"
+    "## Install tritonclient\n",
+    "Since we are interacting with the Triton Inference Server, we will need to [install](https://github.com/triton-inference-server/client?tab=readme-ov-file#download-using-python-package-installer-pip) the `tritonclient` package."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Basic Usage"
+    "```\n",
+    "pip install tritonclient[all]\n",
+    "```"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Call `complete` with a prompt"
+    "Next, we'll install the LlamaIndex connector.\n",
+    "```\n",
+    "pip install llama-index-llms-nvidia-triton\n",
+    "```\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "from llama_index.llms.nvidia_triton import NvidiaTriton\n",
-    "\n",
-    "# A Triton server instance must be running. 
Use the correct URL for your desired Triton server instance.\n", - "triton_url = \"localhost:8001\"\n", - "resp = NvidiaTriton().complete(\"The tallest mountain in North America is \")\n", - "print(resp)" + "## Basic Usage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Call `chat` with a list of messages" + "#### Call `complete` with a prompt" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from llama_index.core.llms import ChatMessage\n", + "```python\n", "from llama_index.llms.nvidia_triton import NvidiaTriton\n", "\n", - "messages = [\n", - " ChatMessage(\n", - " role=\"system\",\n", - " content=\"You are a clown named bozo that has had a rough day at the circus\",\n", - " ),\n", - " ChatMessage(role=\"user\", content=\"What has you down bozo?\"),\n", - "]\n", - "resp = NvidiaTriton().chat(messages)\n", - "print(resp)" + "# A Triton server instance must be running. Use the correct URL for your desired Triton server instance.\n", + "triton_url = \"localhost:8001\"\n", + "model_name = \"gpt2\"\n", + "resp = NvidiaTriton(server_url=triton_url, model_name=model_name, tokens=32).complete(\"The tallest mountain in North America is \")\n", + "print(resp)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should expect the following response\n", + "```\n", + "the Great Pyramid of Giza, which is about 1,000 feet high. The Great Pyramid of Giza is the tallest mountain in North America.\n", + "```\n" ] }, { @@ -112,13 +149,14 @@ "metadata": {}, "source": [ "## Further Examples\n", - "Remember that a Triton instance represents a running server instance therefore you should ensure you have a valid server configuration running and change the `localhost:8001` to the correct IP/hostname:port combination for your server.\n", - "\n", - "An example of setting up this environment can be found at Nvidia's (GenerativeAIExamples Github Repo)[https://github.com/NVIDIA/GenerativeAIExamples/tree/main/RetrievalAugmentedGeneration]" + "For more information on Triton Inference Server, please refer to a [Quickstart](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/quickstart.md#quickstart) guide, [NVIDIA Developer Triton page](https://developer.nvidia.com/triton-inference-server), and [GitHub issues](https://github.com/triton-inference-server/server/issues) channel." ] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -142,5 +180,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 0 } diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py index 9386d75fe9..2275dc6b55 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py @@ -1,3 +1,29 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from llama_index.llms.nvidia_tensorrt.base import LocalTensorRTLLM __all__ = ["LocalTensorRTLLM"] diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py index 5261a470d7..7de63b9abf 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py @@ -1,3 +1,29 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import gc import json import os @@ -158,7 +184,9 @@ class LocalTensorRTLLM(CustomLLM): ] remove_input_padding = config["plugin_config"]["remove_input_padding"] tp_size = config["builder_config"]["tensor_parallel"] - pp_size = config["builder_config"]["pipeline_parallel"] + pp_size = 1 + if "pipeline_parallel" in config["builder_config"]: + pp_size = config["builder_config"]["pipeline_parallel"] world_size = tp_size * pp_size assert ( world_size == tensorrt_llm.mpi_world_size() @@ -185,6 +213,7 @@ class LocalTensorRTLLM(CustomLLM): gpt_attention_plugin=use_gpt_attention_plugin, paged_kv_cache=paged_kv_cache, remove_input_padding=remove_input_padding, + max_batch_size=config["builder_config"]["max_batch_size"], ) assert ( diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py index 4814e23136..c47b4660fa 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py @@ -1,3 +1,29 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ import time import uuid from typing import Any, Dict, Optional diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml index bc42ebcc68..32e0d79e23 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml @@ -27,7 +27,7 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-llms-nvidia-tensorrt" readme = "README.md" -version = "0.1.4" +version = "0.1.5" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py index f75556832b..7516d7be8b 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py @@ -1,3 +1,29 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from llama_index.llms.nvidia_triton.base import NvidiaTriton __all__ = ["NvidiaTriton"] diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py index c273d0bb9b..b15793c781 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py @@ -1,3 +1,29 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import random from typing import ( Any, @@ -36,8 +62,8 @@ DEFAULT_MAX_TOKENS = 100 DEFAULT_BEAM_WIDTH = 1 DEFAULT_REPTITION_PENALTY = 1.0 DEFAULT_LENGTH_PENALTY = 1.0 -DEFAULT_REUSE_CLIENT = True -DEFAULT_TRITON_LOAD_MODEL = True +DEFAULT_REUSE_CLIENT = False +DEFAULT_TRITON_LOAD_MODEL = False class NvidiaTriton(LLM): @@ -235,7 +261,6 @@ class NvidiaTriton(LLM): result_queue = client.request_streaming( model_params["model_name"], request_id, **invocation_params ) - response = "" for token in result_queue: if isinstance(token, InferenceServerException): diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py index ef644788df..3a53beaddd 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py @@ -1,3 +1,29 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import abc import json import random @@ -25,13 +51,21 @@ class StreamingResponseGenerator(Queue): """A Generator that provides the inference results from an LLM.""" def __init__( - self, client: "GrpcTritonClient", request_id: str, force_batch: bool + self, + client: "GrpcTritonClient", + request_id: str, + force_batch: bool, + model_name: str, + max_tokens: int, ) -> None: """Instantiate the generator class.""" super().__init__() self._client = client self.request_id = request_id self._batch = force_batch + self._model_name = model_name + self._max_tokens = max_tokens + self._counter = 0 def __iter__(self) -> "StreamingResponseGenerator": """Return self as a generator.""" @@ -40,15 +74,16 @@ class StreamingResponseGenerator(Queue): def __next__(self) -> str: """Return the next retrieved token.""" val = self.get() - if val is None or val in STOP_WORDS: + if val is None or val in STOP_WORDS or self._counter == self._max_tokens - 1: self._stop_stream() raise StopIteration + self._counter += 1 return val def _stop_stream(self) -> None: """Drain and shutdown the Triton stream.""" self._client.stop_stream( - "tensorrt_llm", self.request_id, signal=not self._batch + self._model_name, self.request_id, signal=not self._batch ) @@ -163,8 +198,8 @@ class _BaseTritonClient(abc.ABC): ) -> List[Union["grpcclient.InferInput", "httpclient.InferInput"]]: """Create the input for the triton inference server.""" query = np.array(prompt).astype(object) - request_output_len = np.array([tokens]).astype(np.uint32).reshape((1, -1)) - runtime_top_k = np.array([top_k]).astype(np.uint32).reshape((1, -1)) + request_output_len = np.array([tokens]).astype(np.int32).reshape((1, -1)) + runtime_top_k = np.array([top_k]).astype(np.int32).reshape((1, -1)) runtime_top_p = np.array([top_p]).astype(np.float32).reshape((1, -1)) temperature_array = np.array([temperature]).astype(np.float32).reshape((1, -1)) len_penalty = np.array([length_penalty]).astype(np.float32).reshape((1, -1)) @@ -172,7 +207,7 @@ class _BaseTritonClient(abc.ABC): np.array([repetition_penalty]).astype(np.float32).reshape((1, -1)) ) random_seed = np.array([RANDOM_SEED]).astype(np.uint64).reshape((1, -1)) - beam_width_array = np.array([beam_width]).astype(np.uint32).reshape((1, -1)) + beam_width_array = np.array([beam_width]).astype(np.int32).reshape((1, -1)) streaming_data = np.array([[stream]], dtype=bool) return [ @@ -318,8 +353,10 @@ class GrpcTritonClient(_BaseTritonClient): if not request_id: request_id = str(random.randint(1, 9999999)) # nosec - result_queue = StreamingResponseGenerator(self, request_id, force_batch) inputs = self._generate_inputs(stream=not force_batch, **params) + result_queue = StreamingResponseGenerator( + self, request_id, force_batch, model_name, max_tokens=params["tokens"] + ) outputs = self._generate_outputs() self._send_prompt_streaming( model_name, diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml 
b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml index a8526c296a..d5f681d349 100644 --- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml +++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml @@ -27,7 +27,7 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-llms-nvidia-triton" readme = "README.md" -version = "0.1.3" +version = "0.1.4" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" -- GitLab
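Note on the `nvidia_triton/utils.py` hunk above: `StreamingResponseGenerator` now tracks a token counter so that streaming stops after `max_tokens - 1` yielded tokens, in addition to the existing `None` and stop-word sentinels. The following standalone sketch illustrates that stop condition only; `_TokenStream` is a hypothetical stand-in (not the library class), the `STOP_WORDS` value is a placeholder, and the real generator additionally signals Triton to shut down the gRPC stream.

```python
from queue import Queue

# Placeholder sentinel list for illustration; the connector defines its own STOP_WORDS.
STOP_WORDS = ["</s>"]


class _TokenStream(Queue):
    """Toy stand-in for StreamingResponseGenerator's stop logic."""

    def __init__(self, max_tokens: int) -> None:
        super().__init__()
        self._max_tokens = max_tokens
        self._counter = 0

    def __iter__(self) -> "_TokenStream":
        return self

    def __next__(self) -> str:
        val = self.get()
        # Stop on the None sentinel, a stop word, or once max_tokens - 1 tokens
        # have been yielded (the real class also stops the Triton stream here).
        if val is None or val in STOP_WORDS or self._counter == self._max_tokens - 1:
            raise StopIteration
        self._counter += 1
        return val


if __name__ == "__main__":
    stream = _TokenStream(max_tokens=3)
    for tok in ["The", "tallest", "mountain", None]:
        stream.put(tok)
    print(list(stream))  # ['The', 'tallest'] -- at most max_tokens - 1 tokens
```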