diff --git a/benchmarks/inference_throughput/cloud-api/README.md b/benchmarks/inference_throughput/cloud-api/README.md
index 00190a0fcdc1bb65b230f9d0e2ca637f23c5264a..e0bf2ec60e63868854c4d9818052fe88754d61c4 100644
--- a/benchmarks/inference_throughput/cloud-api/README.md
+++ b/benchmarks/inference_throughput/cloud-api/README.md
@@ -7,7 +7,9 @@ Disclaimer - The purpose of the code is to provide a configurable setup to measu
 
 # Azure - Getting Started
 To get started, there are certain steps we need to take to deploy the models:
+<!-- markdown-link-check-disable -->
 * Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)
+<!-- markdown-link-check-enable -->
 * Take a quick look on what is the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) and navigate to the website from the link in the article
 * Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group, or you can also follow the guide [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio)
 * Select Llama models from Model catalog
diff --git a/examples/inference.py b/examples/inference.py
index 3b554440ba37fb63d0058dc0a1f3274c3230bfc4..4f83c8f2caec1fbac1558a2182671fe5de67f24e 100644
--- a/examples/inference.py
+++ b/examples/inference.py
@@ -7,6 +7,7 @@
 import fire
 import os
 import sys
 import time
+import gradio as gr
 import torch
 from transformers import LlamaTokenizer
@@ -39,18 +40,8 @@ def main(
     use_fast_kernels: bool = False, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
     **kwargs
 ):
-    if prompt_file is not None:
-        assert os.path.exists(
-            prompt_file
-        ), f"Provided Prompt file does not exist {prompt_file}"
-        with open(prompt_file, "r") as f:
-            user_prompt = "\n".join(f.readlines())
-    elif not sys.stdin.isatty():
-        user_prompt = "\n".join(sys.stdin.readlines())
-    else:
-        print("No user prompt provided. Exiting.")
-        sys.exit(1)
+    def inference(user_prompt, temperature, top_p, top_k, max_new_tokens, **kwargs,):
     safety_checker = get_safety_checker(enable_azure_content_safety,
                                         enable_sensitive_topics,
                                         enable_salesforce_content_safety,
@@ -126,7 +117,49 @@ def main(
         if not is_safe:
             print(method)
             print(report)
-
+        return output_text
+
+    if prompt_file is not None:
+        assert os.path.exists(
+            prompt_file
+        ), f"Provided Prompt file does not exist {prompt_file}"
+        with open(prompt_file, "r") as f:
+            user_prompt = "\n".join(f.readlines())
+        inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
+    elif not sys.stdin.isatty():
+        user_prompt = "\n".join(sys.stdin.readlines())
+        inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
+    else:
+        gr.Interface(
+            fn=inference,
+            inputs=[
+                gr.components.Textbox(
+                    lines=9,
+                    label="User Prompt",
+                    placeholder="none",
+                ),
+                gr.components.Slider(
+                    minimum=0, maximum=1, value=1.0, label="Temperature"
+                ),
+                gr.components.Slider(
+                    minimum=0, maximum=1, value=1.0, label="Top p"
+                ),
+                gr.components.Slider(
+                    minimum=0, maximum=100, step=1, value=50, label="Top k"
+                ),
+                gr.components.Slider(
+                    minimum=1, maximum=2000, step=1, value=200, label="Max tokens"
+                ),
+            ],
+            outputs=[
+                gr.components.Textbox(
+                    lines=5,
+                    label="Output",
+                )
+            ],
+            title="Llama2 Playground",
+            description="https://github.com/facebookresearch/llama-recipes",
+        ).queue().launch(server_name="0.0.0.0", share=True)
 
 if __name__ == "__main__":
     fire.Fire(main)