diff --git a/recipes/benchmarks/inference_throughput/on-prem/README.md b/recipes/benchmarks/inference_throughput/on-prem/README.md
index 0d2053f5ac76a3d1470fee76e8084170bc9d95cb..ffd332e9dd90918c464cd1a61981e30519fb4cfe 100644
--- a/recipes/benchmarks/inference_throughput/on-prem/README.md
+++ b/recipes/benchmarks/inference_throughput/on-prem/README.md
@@ -37,3 +37,5 @@ To run pretrained model benchmark, follow the command below.
 ```
 python pretrained_vllm_benchmark.py
 ```
+
+For more vLLM benchmark details, refer to their official GitHub repo [here](https://github.com/vllm-project/vllm/tree/main/benchmarks).
diff --git a/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py b/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
index 7c7057cc12278c0b3a02c97b802f9417dbfbb008..30b2765f81ace4508cd1e4f9c253d4d13faeb229 100644
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
@@ -4,7 +4,6 @@
 import csv
 import json
 import time
-import random
 import threading
 import numpy as np
 import requests
@@ -18,7 +17,7 @@
 from azure.core.exceptions import HttpResponseError
 from azure.ai.contentsafety.models import AnalyzeTextOptions
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Tuple, List
+from typing import Tuple, List
 
 
 
diff --git a/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json b/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
index d5f055039e4cca1ce890913b9f4f7c24d925d6cb..deaee9bb358f6fadf2663e6b7f9771b091b2f23a 100644
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
@@ -1,7 +1,7 @@
 {
     "MAX_NEW_TOKENS" : 256,
     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256],
-    "MODEL_PATH" : "meta-llama/Meta-Llama-3-70B-Instruct",
+    "MODEL_PATH" : "meta-llama/your-model-path",
     "MODEL_HEADERS" : {"Content-Type": "application/json"},
     "SAFE_CHECK" : true,
     "THRESHOLD_TPS" : 7,
diff --git a/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py b/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
index 84fdf2e64f0e8ef50efa7810e1052dd5b8dc8048..3d74cd4e3d40e077111a9ceb1b4debfcc9c67286 100644
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
@@ -18,7 +18,7 @@
 from azure.core.exceptions import HttpResponseError
 from azure.ai.contentsafety.models import AnalyzeTextOptions
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, Tuple, List
+from typing import Tuple, List
 
 
 # Predefined inputs