diff --git a/benchmarks/inference_throughput/cloud-api/README.md b/benchmarks/inference_throughput/cloud-api/README.md
index 00190a0fcdc1bb65b230f9d0e2ca637f23c5264a..e0bf2ec60e63868854c4d9818052fe88754d61c4 100644
--- a/benchmarks/inference_throughput/cloud-api/README.md
+++ b/benchmarks/inference_throughput/cloud-api/README.md
@@ -7,7 +7,9 @@ Disclaimer - The purpose of the code is to provide a configurable setup to measu
 
 # Azure - Getting Started
 To get started, there are certain steps we need to take to deploy the models:
+<!-- markdown-link-check-disable -->
 * Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)
+<!-- markdown-link-check-enable -->
 * Take a quick look on what is the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) and navigate to the website from the link in the article
 * Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group, or you can also follow the guide [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio)
 * Select Llama models from Model catalog
diff --git a/examples/inference.py b/examples/inference.py
index 3b554440ba37fb63d0058dc0a1f3274c3230bfc4..4f83c8f2caec1fbac1558a2182671fe5de67f24e 100644
--- a/examples/inference.py
+++ b/examples/inference.py
@@ -7,6 +7,7 @@
 import fire
 import os
 import sys
 import time
+import gradio as gr
 import torch
 from transformers import LlamaTokenizer
@@ -39,18 +40,8 @@ def main(
     use_fast_kernels: bool = False, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
     **kwargs
 ):
-    if prompt_file is not None:
-        assert os.path.exists(
-            prompt_file
-        ), f"Provided Prompt file does not exist {prompt_file}"
-        with open(prompt_file, "r") as f:
-            user_prompt = "\n".join(f.readlines())
-    elif not sys.stdin.isatty():
-        user_prompt = "\n".join(sys.stdin.readlines())
-    else:
-        print("No user prompt provided. Exiting.")
-        sys.exit(1)
+    def inference(user_prompt, temperature, top_p, top_k, max_new_tokens, **kwargs,):
     safety_checker = get_safety_checker(enable_azure_content_safety,
                                         enable_sensitive_topics,
                                         enable_salesforce_content_safety,
@@ -126,7 +117,49 @@ def main(
         if not is_safe:
             print(method)
             print(report)
-
+        return output_text
+
+    if prompt_file is not None:
+        assert os.path.exists(
+            prompt_file
+        ), f"Provided Prompt file does not exist {prompt_file}"
+        with open(prompt_file, "r") as f:
+            user_prompt = "\n".join(f.readlines())
+        inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
+    elif not sys.stdin.isatty():
+        user_prompt = "\n".join(sys.stdin.readlines())
+        inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
+    else:
+        gr.Interface(
+            fn=inference,
+            inputs=[
+                gr.components.Textbox(
+                    lines=9,
+                    label="User Prompt",
+                    placeholder="none",
+                ),
+                gr.components.Slider(
+                    minimum=0, maximum=1, value=1.0, label="Temperature"
+                ),
+                gr.components.Slider(
+                    minimum=0, maximum=1, value=1.0, label="Top p"
+                ),
+                gr.components.Slider(
+                    minimum=0, maximum=100, step=1, value=50, label="Top k"
+                ),
+                gr.components.Slider(
+                    minimum=1, maximum=2000, step=1, value=200, label="Max tokens"
+                ),
+            ],
+            outputs=[
+                gr.components.Textbox(
+                    lines=5,
+                    label="Output",
+                )
+            ],
+            title="Llama2 Playground",
+            description="https://github.com/facebookresearch/llama-recipes",
+        ).queue().launch(server_name="0.0.0.0", share=True)
 
 if __name__ == "__main__":
     fire.Fire(main)