From 84b73a640f5c4d0af1d37c75544b21c678e05eb5 Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Tue, 23 Apr 2024 12:40:01 -0700
Subject: [PATCH] Improving Docs for NVIDIA TensorRT and Triton integrations
 (#13060)

---
 docs/docs/examples/llm/nvidia_tensorrt.ipynb  | 129 +++++++++++-------
 docs/docs/examples/llm/nvidia_triton.ipynb    | 124 +++++++++++------
 .../llms/nvidia_tensorrt/__init__.py          |  26 ++++
 .../llama_index/llms/nvidia_tensorrt/base.py  |  31 ++++-
 .../llama_index/llms/nvidia_tensorrt/utils.py |  26 ++++
 .../pyproject.toml                            |   2 +-
 .../llms/nvidia_triton/__init__.py            |  26 ++++
 .../llama_index/llms/nvidia_triton/base.py    |  31 ++++-
 .../llama_index/llms/nvidia_triton/utils.py   |  51 ++++++-
 .../pyproject.toml                            |   2 +-
 10 files changed, 342 insertions(+), 106 deletions(-)

diff --git a/docs/docs/examples/llm/nvidia_tensorrt.ipynb b/docs/docs/examples/llm/nvidia_tensorrt.ipynb
index 142ae1522f..32e030f76f 100644
--- a/docs/docs/examples/llm/nvidia_tensorrt.ipynb
+++ b/docs/docs/examples/llm/nvidia_tensorrt.ipynb
@@ -29,35 +29,66 @@
    "metadata": {},
    "source": [
     "## TensorRT-LLM Environment Setup\n",
-    "Since TensorRT-LLM is a SDK for interacting with local models in process there are a few environment steps that must be followed to ensure that the TensorRT-LLM setup can be used.\n",
+    "Since TensorRT-LLM is a SDK for interacting with local models in process there are a few environment steps that must be followed to ensure that the TensorRT-LLM setup can be used. Please note, that Nvidia Cuda 12.2 or higher is currently required to run TensorRT-LLM.\n",
     "\n",
-    "1. Nvidia Cuda 12.2 or higher is currently required to run TensorRT-LLM\n",
-    "2. Install `tensorrt_llm` via pip with `pip3 install tensorrt_llm -U --extra-index-url https://pypi.nvidia.com`\n",
-    "3. For this example we will use Llama2. The Llama2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/trt-llm-rag-windows/blob/release/1.0/README.md#building-trt-engine)\n",
-    "    * The following files will be created from following the stop above\n",
-    "    * `Llama_float16_tp1_rank0.engine`: The main output of the build script, containing the executable graph of operations with the model weights embedded.\n",
-    "    * `config.json`: Includes detailed information about the model, like its general structure and precision, as well as information about which plug-ins were incorporated into the engine.\n",
-    "    * `model.cache`: Caches some of the timing and optimization information from model compilation, making successive builds quicker.\n",
-    "4. `mkdir model`\n",
-    "5. Move all of the files mentioned above to the model directory."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%pip install llama-index-llms-nvidia-tensorrt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install tensorrt_llm==0.7.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121"
+    "In this tutorial we will show how to use the connector with GPT2 model.\n",
+    "For the best experience, we recommend following\n",
+    "[Installation](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.8.0?tab=readme-ov-file#installation) process on the\n",
+    "official [TensorRT-LLM Github](https://github.com/NVIDIA/TensorRT-LLM).\n",
+    "\n",
+    "The following steps are showing how to set up your model with TensorRT-LLM v0.8.0 for x86_64 users.\n",
+    "\n",
+    "1. Obtain and start the basic docker image environment.\n",
+    "```\n",
+    "docker run --rm --runtime=nvidia --gpus all --entrypoint /bin/bash -it nvidia/cuda:12.1.0-devel-ubuntu22.04\n",
+    "```\n",
+    "\n",
+    "2. Install dependencies, TensorRT-LLM requires Python 3.10\n",
+    "```\n",
+    "apt-get update && apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget\n",
+    "```\n",
+    "3. Install the latest stable version (corresponding to the release branch) of TensorRT-LLM. We are using version 0.8.0, but for the most up to date release,\n",
+    "please refer to [official release page](https://github.com/NVIDIA/TensorRT-LLM/releases).\n",
+    "```\n",
+    "pip3 install tensorrt_llm==0.8.0 -U --extra-index-url https://pypi.nvidia.com\n",
+    "```\n",
+    "\n",
+    "4. Check installation\n",
+    "```\n",
+    "python3 -c \"import tensorrt_llm\"\n",
+    "```\n",
+    "The above command should not produce any errors.\n",
+    "\n",
+    "5. For this example we will use GPT2. The GPT2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#usage)\n",
+    "    * First, inside the container, we've started during stage 1, clone TensorRT-LLM repository:\n",
+    "    ```\n",
+    "    git clone --branch v0.8.0 https://github.com/NVIDIA/TensorRT-LLM.git\n",
+    "    ```\n",
+    "    * Install requirements for GPT2 model with:\n",
+    "    ```\n",
+    "    cd TensorRT-LLM/examples/gpt/ && pip install -r requirements.txt\n",
+    "    ```\n",
+    "    * Download hf gpt2 model\n",
+    "    ```\n",
+    "    rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2\n",
+    "    cd gpt2\n",
+    "    rm pytorch_model.bin model.safetensors\n",
+    "    wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin\n",
+    "    cd ..\n",
+    "    ```\n",
+    "    * Convert weights from HF Transformers to TensorRT-LLM format\n",
+    "    ```\n",
+    "    python3 hf_gpt_convert.py -i gpt2 -o ./c-model/gpt2 --tensor-parallelism 1 --storage-type float16\n",
+    "    ```\n",
+    "    * Build TensorRT engine\n",
+    "    ```\n",
+    "    python3 build.py --model_dir=./c-model/gpt2/1-gpu --use_gpt_attention_plugin --remove_input_padding\n",
+    "    ```\n",
+    "  \n",
+    "6. Install `llama-index-llms-nvidia-tensorrt` package\n",
+    "  ```\n",
+    "  pip install llama-index-llms-nvidia-tensorrt\n",
+    "  ```"
    ]
   },
   {
@@ -75,41 +106,39 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
+    "```python\n",
     "from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM\n",
     "\n",
-    "\n",
-    "def completion_to_prompt(completion: str) -> str:\n",
-    "    \"\"\"\n",
-    "    Given a completion, return the prompt using llama2 format.\n",
-    "    \"\"\"\n",
-    "    return f\"<s> [INST] {completion} [/INST] \"\n",
-    "\n",
-    "\n",
     "llm = LocalTensorRTLLM(\n",
-    "    model_path=\"./model\",\n",
-    "    engine_name=\"llama_float16_tp1_rank0.engine\",\n",
-    "    tokenizer_dir=\"meta-llama/Llama-2-13b-chat\",\n",
-    "    completion_to_prompt=completion_to_prompt,\n",
-    ")"
+    "    model_path=\"./engine_outputs\",\n",
+    "    engine_name=\"gpt_float16_tp1_rank0.engine\",\n",
+    "    tokenizer_dir=\"gpt2\",\n",
+    "    max_new_tokens=40,\n",
+    ")\n",
+    "\n",
+    "resp = llm.complete(\"Who is Harry Potter?\")\n",
+    "print(str(resp))\n",
+    "```"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "resp = llm.complete(\"Who is Paul Graham?\")\n",
-    "print(str(resp))"
+    "The expected response should look like:\n",
+    "```\n",
+    "Harry Potter is a fictional character created by J.K. Rowling in her first novel, Harry Potter and the Philosopher's Stone. The character is a wizard who lives in the fictional town#\n",
+    "```"
    ]
   }
  ],
  "metadata": {
+  "colab": {
+   "provenance": []
+  },
   "kernelspec": {
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
@@ -133,5 +162,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 0
 }
diff --git a/docs/docs/examples/llm/nvidia_triton.ipynb b/docs/docs/examples/llm/nvidia_triton.ipynb
index f6ee2ef4ec..ab33d0f7dc 100644
--- a/docs/docs/examples/llm/nvidia_triton.ipynb
+++ b/docs/docs/examples/llm/nvidia_triton.ipynb
@@ -18,93 +18,130 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "[NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) provides a cloud and edge inferencing solution optimized for both CPUs and GPUs. This connector allows for llama_index to remotely interact with TRT-LLM models deployed with Triton."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launching Triton Inference Server"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This connector requires a running instance of Triton Inference Server with A TensorRT-LLM model.\n",
+    "For this example, we will use a [Triton Command Line Interface (Triton CLI)](https://github.com/triton-inference-server/triton_cli) to deploy a GPT2 model on Triton.\n",
+    "\n",
+    "When using Triton and related tools on your host (outside of a Triton container image) there are a number of additional dependencies that may be required for various workflows. Most system dependency issues can be resolved by installing and running the CLI from within the latest corresponding `tritonserver` container image, which should have all necessary system dependencies installed.\n",
     "\n",
-    "Nvidia's Triton is an inference server that provides API access to hosted LLM models. This connector allows for llama_index to remotely interact with a Triton inference server over GRPC to accelerate inference operations.\n",
+    "For TRT-LLM, you can use `nvcr.io/nvidia/tritonserver:{YY.MM}-trtllm-python-py3` image, where `YY.MM` corresponds to the version of `tritonserver`, for example in this example we're using 24.02 version of the container. To get the list of available versions, please refer to [Triton Inference Server NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver).\n",
     "\n",
-    "[Triton Inference Server Github](https://github.com/triton-inference-server/server)"
+    "To start the container, run in your Linux terminal:\n",
+    "\n",
+    "```\n",
+    "docker run -ti --gpus all --network=host --shm-size=1g --ulimit memlock=-1 nvcr.io/nvidia/tritonserver:24.02-trtllm-python-py3\n",
+    "```\n",
+    "Next, we'll need to install dependencies with the following:\n",
+    "```\n",
+    "pip install \\\n",
+    "  \"psutil\" \\\n",
+    "  \"pynvml>=11.5.0\" \\\n",
+    "  \"torch==2.1.2\" \\\n",
+    "  \"tensorrt_llm==0.8.0\" --extra-index-url https://pypi.nvidia.com/\n",
+    "```\n",
+    "Finally, run the following to install Triton CLI."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Install tritonclient\n",
-    "Since we are interacting with the Triton inference server we will need to install the `tritonclient` package. The `tritonclient` package.\n",
-    "\n",
-    "`tritonclient` can be easily installed using `pip3 install tritonclient`."
+    "```\n",
+    "pip install git+https://github.com/triton-inference-server/triton_cli.git\n",
+    "```"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "%pip install llama-index-llms-nvidia-triton"
+    "To generate model repository for GPT2 model and start an instance of Triton Server:\n",
+    "```\n",
+    "triton remove -m all\n",
+    "triton import -m gpt2 --backend tensorrtllm\n",
+    "triton start &\n",
+    "```\n",
+    "Please, note that by default Triton starts listenning to `localhost:8000` HTTP port and `localhost:8001` GRPC port. The latter will be used in this example.\n",
+    "For any additional how-tos and questions, please reach out to [Triton Command Line Interface (Triton CLI)](https://github.com/triton-inference-server/triton_cli) issues.\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "!pip3 install tritonclient"
+    "## Install tritonclient\n",
+    "Since we are interacting with the Triton Inference Server we will need to [install](https://github.com/triton-inference-server/client?tab=readme-ov-file#download-using-python-package-installer-pip) the `tritonclient` package."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Basic Usage"
+    "```\n",
+    "pip install tritonclient[all]\n",
+    "```"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Call `complete` with a prompt"
+    "Next, we'll install llama index connector.\n",
+    "```\n",
+    "pip install llama-index-llms-nvidia-triton\n",
+    "```\n"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "from llama_index.llms.nvidia_triton import NvidiaTriton\n",
-    "\n",
-    "# A Triton server instance must be running. Use the correct URL for your desired Triton server instance.\n",
-    "triton_url = \"localhost:8001\"\n",
-    "resp = NvidiaTriton().complete(\"The tallest mountain in North America is \")\n",
-    "print(resp)"
+    "## Basic Usage"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Call `chat` with a list of messages"
+    "#### Call `complete` with a prompt"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "from llama_index.core.llms import ChatMessage\n",
+    "```python\n",
     "from llama_index.llms.nvidia_triton import NvidiaTriton\n",
     "\n",
-    "messages = [\n",
-    "    ChatMessage(\n",
-    "        role=\"system\",\n",
-    "        content=\"You are a clown named bozo that has had a rough day at the circus\",\n",
-    "    ),\n",
-    "    ChatMessage(role=\"user\", content=\"What has you down bozo?\"),\n",
-    "]\n",
-    "resp = NvidiaTriton().chat(messages)\n",
-    "print(resp)"
+    "# A Triton server instance must be running. Use the correct URL for your desired Triton server instance.\n",
+    "triton_url = \"localhost:8001\"\n",
+    "model_name = \"gpt2\"\n",
+    "resp = NvidiaTriton(server_url=triton_url, model_name=model_name, tokens=32).complete(\"The tallest mountain in North America is \")\n",
+    "print(resp)\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should expect the following response\n",
+    "```\n",
+    "the Great Pyramid of Giza, which is about 1,000 feet high. The Great Pyramid of Giza is the tallest mountain in North America.\n",
+    "```\n"
    ]
   },
   {
@@ -112,13 +149,14 @@
    "metadata": {},
    "source": [
     "## Further Examples\n",
-    "Remember that a Triton instance represents a running server instance therefore you should ensure you have a valid server configuration running and change the `localhost:8001` to the correct IP/hostname:port combination for your server.\n",
-    "\n",
-    "An example of setting up this environment can be found at Nvidia's (GenerativeAIExamples Github Repo)[https://github.com/NVIDIA/GenerativeAIExamples/tree/main/RetrievalAugmentedGeneration]"
+    "For more information on Triton Inference Server, please refer to a [Quickstart](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/quickstart.md#quickstart) guide, [NVIDIA Developer Triton page](https://developer.nvidia.com/triton-inference-server), and [GitHub issues](https://github.com/triton-inference-server/server/issues) channel."
    ]
   }
  ],
  "metadata": {
+  "colab": {
+   "provenance": []
+  },
   "kernelspec": {
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
@@ -142,5 +180,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 0
 }
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py
index 9386d75fe9..2275dc6b55 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/__init__.py
@@ -1,3 +1,29 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 from llama_index.llms.nvidia_tensorrt.base import LocalTensorRTLLM
 
 __all__ = ["LocalTensorRTLLM"]
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py
index 5261a470d7..7de63b9abf 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/base.py
@@ -1,3 +1,29 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import gc
 import json
 import os
@@ -158,7 +184,9 @@ class LocalTensorRTLLM(CustomLLM):
                 ]
                 remove_input_padding = config["plugin_config"]["remove_input_padding"]
                 tp_size = config["builder_config"]["tensor_parallel"]
-                pp_size = config["builder_config"]["pipeline_parallel"]
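+                # Older engine configs may not include "pipeline_parallel"; default to a single pipeline stage.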
+                pp_size = 1
+                if "pipeline_parallel" in config["builder_config"]:
+                    pp_size = config["builder_config"]["pipeline_parallel"]
                 world_size = tp_size * pp_size
                 assert (
                     world_size == tensorrt_llm.mpi_world_size()
@@ -185,6 +213,7 @@ class LocalTensorRTLLM(CustomLLM):
                     gpt_attention_plugin=use_gpt_attention_plugin,
                     paged_kv_cache=paged_kv_cache,
                     remove_input_padding=remove_input_padding,
+                    max_batch_size=config["builder_config"]["max_batch_size"],
                 )
 
                 assert (
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py
index 4814e23136..c47b4660fa 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/llama_index/llms/nvidia_tensorrt/utils.py
@@ -1,3 +1,29 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import time
 import uuid
 from typing import Any, Dict, Optional
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml
index bc42ebcc68..32e0d79e23 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-tensorrt/pyproject.toml
@@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-nvidia-tensorrt"
 readme = "README.md"
-version = "0.1.4"
+version = "0.1.5"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py
index f75556832b..7516d7be8b 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/__init__.py
@@ -1,3 +1,29 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 from llama_index.llms.nvidia_triton.base import NvidiaTriton
 
 __all__ = ["NvidiaTriton"]
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py
index c273d0bb9b..b15793c781 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/base.py
@@ -1,3 +1,29 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import random
 from typing import (
     Any,
@@ -36,8 +62,8 @@ DEFAULT_MAX_TOKENS = 100
 DEFAULT_BEAM_WIDTH = 1
 DEFAULT_REPTITION_PENALTY = 1.0
 DEFAULT_LENGTH_PENALTY = 1.0
-DEFAULT_REUSE_CLIENT = True
-DEFAULT_TRITON_LOAD_MODEL = True
+DEFAULT_REUSE_CLIENT = False
+DEFAULT_TRITON_LOAD_MODEL = False
 
 
 class NvidiaTriton(LLM):
@@ -235,7 +261,6 @@ class NvidiaTriton(LLM):
         result_queue = client.request_streaming(
             model_params["model_name"], request_id, **invocation_params
         )
-
         response = ""
         for token in result_queue:
             if isinstance(token, InferenceServerException):
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py
index ef644788df..3a53beaddd 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/llama_index/llms/nvidia_triton/utils.py
@@ -1,3 +1,29 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import abc
 import json
 import random
@@ -25,13 +51,21 @@ class StreamingResponseGenerator(Queue):
     """A Generator that provides the inference results from an LLM."""
 
     def __init__(
-        self, client: "GrpcTritonClient", request_id: str, force_batch: bool
+        self,
+        client: "GrpcTritonClient",
+        request_id: str,
+        force_batch: bool,
+        model_name: str,
+        max_tokens: int,
     ) -> None:
         """Instantiate the generator class."""
         super().__init__()
         self._client = client
         self.request_id = request_id
         self._batch = force_batch
+        self._model_name = model_name
+        self._max_tokens = max_tokens
+        self._counter = 0
 
     def __iter__(self) -> "StreamingResponseGenerator":
         """Return self as a generator."""
@@ -40,15 +74,16 @@ class StreamingResponseGenerator(Queue):
     def __next__(self) -> str:
         """Return the next retrieved token."""
         val = self.get()
-        if val is None or val in STOP_WORDS:
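+        # Stop when the stream is exhausted, a stop word arrives, or the requested token budget is reached.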
+        if val is None or val in STOP_WORDS or self._counter == self._max_tokens - 1:
             self._stop_stream()
             raise StopIteration
+        self._counter += 1
         return val
 
     def _stop_stream(self) -> None:
         """Drain and shutdown the Triton stream."""
         self._client.stop_stream(
-            "tensorrt_llm", self.request_id, signal=not self._batch
+            self._model_name, self.request_id, signal=not self._batch
         )
 
 
@@ -163,8 +198,8 @@ class _BaseTritonClient(abc.ABC):
     ) -> List[Union["grpcclient.InferInput", "httpclient.InferInput"]]:
         """Create the input for the triton inference server."""
         query = np.array(prompt).astype(object)
-        request_output_len = np.array([tokens]).astype(np.uint32).reshape((1, -1))
-        runtime_top_k = np.array([top_k]).astype(np.uint32).reshape((1, -1))
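+        # Use int32 for integer parameters to match the input dtypes expected by the deployed TRT-LLM model.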
+        request_output_len = np.array([tokens]).astype(np.int32).reshape((1, -1))
+        runtime_top_k = np.array([top_k]).astype(np.int32).reshape((1, -1))
         runtime_top_p = np.array([top_p]).astype(np.float32).reshape((1, -1))
         temperature_array = np.array([temperature]).astype(np.float32).reshape((1, -1))
         len_penalty = np.array([length_penalty]).astype(np.float32).reshape((1, -1))
@@ -172,7 +207,7 @@ class _BaseTritonClient(abc.ABC):
             np.array([repetition_penalty]).astype(np.float32).reshape((1, -1))
         )
         random_seed = np.array([RANDOM_SEED]).astype(np.uint64).reshape((1, -1))
-        beam_width_array = np.array([beam_width]).astype(np.uint32).reshape((1, -1))
+        beam_width_array = np.array([beam_width]).astype(np.int32).reshape((1, -1))
         streaming_data = np.array([[stream]], dtype=bool)
 
         return [
@@ -318,8 +353,10 @@ class GrpcTritonClient(_BaseTritonClient):
         if not request_id:
             request_id = str(random.randint(1, 9999999))  # nosec
 
-        result_queue = StreamingResponseGenerator(self, request_id, force_batch)
         inputs = self._generate_inputs(stream=not force_batch, **params)
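+        # The generator needs the model name to stop the correct stream and the token budget to cap generation.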
+        result_queue = StreamingResponseGenerator(
+            self, request_id, force_batch, model_name, max_tokens=params["tokens"]
+        )
         outputs = self._generate_outputs()
         self._send_prompt_streaming(
             model_name,
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml
index a8526c296a..d5f681d349 100644
--- a/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia-triton/pyproject.toml
@@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-nvidia-triton"
 readme = "README.md"
-version = "0.1.3"
+version = "0.1.4"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
-- 
GitLab