diff --git a/.github/scripts/spellcheck_conf/wordlist.txt b/.github/scripts/spellcheck_conf/wordlist.txt index 9819089e3017f00bdb3fbb4c6a9c5753732025b4..f7b6bc2492f383a90ea1069d06e067269336e6bd 100644 --- a/.github/scripts/spellcheck_conf/wordlist.txt +++ b/.github/scripts/spellcheck_conf/wordlist.txt @@ -1400,6 +1400,19 @@ sqlite customerservice fn ExecuTorch +LLMScore +RecursiveCharacterTextSplitter +TPD +TPM +Tianjun +Zhang +distractor +distractors +frac +numRefusal +totalQA +DirectoryLoader +SitemapLoader nf quant DLAI @@ -1415,3 +1428,26 @@ xH unquantized eom ipython +CPUs +modelUpgradeExample +guardrailing +MaaS +MFU +BBH +GPQA +IFEVAL +IFeval +bos +gpqa +ifeval +lighteval +sqrt +wis +evals +mmlu +parsers +reproducibility +openhathi +sarvam +subtask +acc diff --git a/README.md b/README.md index ae5c427cb1888719057e7b71aed5e07c3e3192d9..aa08f1eef4bf7d729b18dbd79a412d0f002a2490 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Llama Recipes: Examples to get started using the Llama models from Meta <!-- markdown-link-check-disable --> -The 'llama-recipes' repository is a companion to the [Meta Llama](https://github.com/meta-llama/llama-models) models. We support the latest version, [Llama 3.1](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/MODEL_CARD.md), in this repository. The goal is to provide a scalable library for fine-tuning Meta Llama models, along with some example scripts and notebooks to quickly get started with using the models in a variety of use-cases, including fine-tuning for domain adaptation and building LLM-based applications with Llama and other tools in the LLM ecosystem. The examples here showcase how to run Llama locally, in the cloud, and on-prem. +The 'llama-recipes' repository is a companion to the [Meta Llama](https://github.com/meta-llama/llama-models) models. We support the latest version, [Llama 3.1](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/MODEL_CARD.md), in this repository. The goal is to provide a scalable library for fine-tuning Meta Llama models, along with some example scripts and notebooks to quickly get started with using the models in a variety of use-cases, including fine-tuning for domain adaptation and building LLM-based applications with Llama and other tools in the LLM ecosystem. The examples here showcase how to run Llama locally, in the cloud, and on-prem. <!-- markdown-link-check-enable --> > [!IMPORTANT] @@ -31,7 +31,7 @@ The 'llama-recipes' repository is a companion to the [Meta Llama](https://github > ``` > Each message gets trailed by an `<|eot_id|>` token before a new header is started, signaling a role change. > -> More details on the new tokenizer and prompt template can be found [here](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1). +> More details on the new tokenizer and prompt template can be found [here](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1). 
> > [!NOTE] @@ -55,6 +55,7 @@ The 'llama-recipes' repository is a companion to the [Meta Llama](https://github - [Repository Organization](#repository-organization) - [`recipes/`](#recipes) - [`src/`](#src) + - [Supported Features](#supported-features) - [Contributing](#contributing) - [License](#license) @@ -145,7 +146,7 @@ Contains examples are organized in folders by topic: [use_cases](./recipes/use_cases)|Scripts showing common applications of Meta Llama3 [3p_integrations](./recipes/3p_integrations)|Partner owned folder showing common applications of Meta Llama3 [responsible_ai](./recipes/responsible_ai)|Scripts to use PurpleLlama for safeguarding model outputs -[experimental](./experimental)|Meta Llama implementations of experimental LLM techniques +[experimental](./recipes/experimental)|Meta Llama implementations of experimental LLM techniques ### `src/` @@ -160,6 +161,30 @@ Contains modules which support the example recipes: | [utils](src/llama_recipes/utils/) | Utility files for:<br/> - `train_utils.py` provides training/eval loop and more train utils.<br/> - `dataset_utils.py` to get preprocessed datasets.<br/> - `config_utils.py` to override the configs received from CLI.<br/> - `fsdp_utils.py` provides FSDP wrapping policy for PEFT methods.<br/> - `memory_utils.py` context manager to track different memory stats in train loop. | +## Supported Features +The recipes and modules in this repository support the following features: + +| Feature | | +| ---------------------------------------------- | - | +| HF support for inference | ✅ | +| HF support for finetuning | ✅ | +| PEFT | ✅ | +| Deferred initialization ( meta init) | ✅ | +| Low CPU mode for multi GPU | ✅ | +| Mixed precision | ✅ | +| Single node quantization | ✅ | +| Flash attention | ✅ | +| Activation checkpointing FSDP | ✅ | +| Hybrid Sharded Data Parallel (HSDP) | ✅ | +| Dataset packing & padding | ✅ | +| BF16 Optimizer (Pure BF16) | ✅ | +| Profiling & MFU tracking | ✅ | +| Gradient accumulation | ✅ | +| CPU offloading | ✅ | +| FSDP checkpoint conversion to HF for inference | ✅ | +| W&B experiment tracker | ✅ | + + ## Contributing Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct, and the process for submitting pull requests to us. diff --git a/pyproject.toml b/pyproject.toml index 670e04cc51a16659da630eeee0289209e1a7382d..8ae81804fb4c5b8c0eaea6bc3319e46608faafe2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "llama-recipes" -version = "0.0.2" +version = "0.0.3" authors = [ { name="Hamid Shojanazeri", email="hamidnazeri@meta.com" }, { name="Matthias Reso", email="mreso@meta.com" }, diff --git a/recipes/3p_integrations/azure/Azure MaaS/azure_api_example.ipynb b/recipes/3p_integrations/azure/Azure MaaS/azure_api_example.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e840d63c1cc9a30c00afdc766e6aac764d8a18da --- /dev/null +++ b/recipes/3p_integrations/azure/Azure MaaS/azure_api_example.ipynb @@ -0,0 +1,494 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use Azure API with Llama 3.1\n", + "\n", + "This notebook shows examples of how to use Llama 3.1 APIs offered by Microsoft Azure. 
We will cover: \n", + "* HTTP requests API usage for Llama 3.1 instruct models in CLI\n", + "* HTTP requests API usage for Llama 3.1 instruct models in Python\n", + "* Plug the APIs into LangChain\n", + "* Wire the model with Gradio to build a simple chatbot with memory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite\n", + "\n", + "Before we start building with Azure Llama 3.1 APIs, there are certain steps we need to take to deploy the models:\n", + "\n", + "* Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)\n", + "* Take a quick look at what the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) is and navigate to the website from the link in the article\n", + "* Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group.\n", + "* For Llama 3.1 instruct models from the Model catalog, click Deploy in the model page and select \"Serverless API with Azure AI Content Safety\". Once deployed successfully, you should be assigned an API endpoint and a security key for inference.\n", + "* For Llama 3.1 pretrained models, Azure currently only supports manual deployment under a regular subscription. This means you will need to acquire a virtual machine with a managed compute resource. We won't cover it in this tutorial.\n", + "\n", + "For more information, you should consult Azure's official documentation [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) for model deployment and inference." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HTTP Requests API Usage in CLI\n", + "\n", + "### Basics\n", + "\n", + "The usage and schema of the API are identical to the Llama 3 API hosted on Azure.\n", + "\n", + "To use the REST API, you will need to have an endpoint URL and an authentication key associated with that endpoint. \n", + "These can be acquired from the previous steps. \n", + "\n", + "In this chat completion example for the instruct model, we use a simple curl call for illustration. There are three major components: \n", + "\n", + "* The `host-url` is your endpoint url with completion schema. \n", + "* The `headers` define the content type as well as your API key. \n", + "* The `payload` or `data`, which is your prompt detail and model hyperparameters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `host-url` needs to be `/v1/chat/completions` and the request payload needs to include roles in conversations. 
Here is a sample payload: \n", + "\n", + "```\n", + "{ \n", + " \"messages\": [ \n", + " { \n", + " \"content\": \"You are a helpful assistant.\", \n", + " \"role\": \"system\" \n", + "}, \n", + " { \n", + " \"content\": \"Hello!\", \n", + " \"role\": \"user\" \n", + " } \n", + " ], \n", + " \"max_tokens\": 50 \n", + "} \n", + "```\n", + "\n", + "Here is a sample curl call for chat completion:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"What is good about Wuhan?\",\"role\":\"user\"}], \"max_tokens\": 50}'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Streaming\n", + "\n", + "One fantastic feature the API offers is the streaming capability. \n", + "Streaming allows the generated tokens to be sent as data-only server-sent events whenever they become available. \n", + "This is extremely important for interactive applications such as chatbots, so the user is always engaged. \n", + "\n", + "To use streaming, simply set `\"stream\":true` as part of the request payload. \n", + "In streaming mode, the REST API response will be different from the non-streaming mode.\n", + "\n", + "Here is an example: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"What is good about Wuhan?\",\"role\":\"user\"}], \"max_tokens\": 500, \"stream\": true}'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the result comes back as a stream of `data` objects, each of which contains generated information including a `choice`. \n", + "The stream is terminated by a `data:[DONE]\\n\\n` message." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Content Safety Filtering\n", + "\n", + "If you enabled content filtering during deployment, Azure Llama 3.1 API endpoints will have the content safety feature turned on. Both the input prompt and output tokens are filtered by this service automatically. \n", + "To learn more about the impact on the request/response payload, please refer to the official guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter?tabs=python). \n", + "\n", + "For model input and output, if the filter detects there is harmful content, the generation will error out with additional information. \n", + "\n", + "If you disabled content filtering during deployment, Llama models still have built-in content safety for generation. 
They will refuse to answer your questions if any harmful content is detected.\n", + "\n", + "Here is an example prompt that triggered content safety filtering:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"How to make bomb?\",\"role\":\"user\"}], \"max_tokens\": 50}'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HTTP Requests API Usage in Python\n", + "\n", + "Besides calling the API directly from command line tools, you can also call it programmatically in Python. \n", + "\n", + "Here is an example for the instruct model:\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import urllib.request\n", + "import json\n", + "\n", + "#Configure payload data sending to API endpoint\n", + "data = {\"messages\":[\n", + " {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n", + " {\"role\":\"user\", \"content\":\"What is good about Wuhan?\"}],\n", + " \"max_tokens\": 500,\n", + " \"temperature\": 0.9,\n", + " \"stream\": True,\n", + "}\n", + "\n", + "body = str.encode(json.dumps(data))\n", + "\n", + "#Replace the url with your API endpoint\n", + "url = 'https://your-endpoint.inference.ai.azure.com/v1/chat/completions'\n", + "\n", + "#Replace this with the key for the endpoint\n", + "api_key = 'your-auth-key'\n", + "if not api_key:\n", + " raise Exception(\"API Key is missing\")\n", + "\n", + "headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n", + "\n", + "req = urllib.request.Request(url, body, headers)\n", + "\n", + "try:\n", + " response = urllib.request.urlopen(req)\n", + " result = response.read()\n", + " print(result)\n", + "except urllib.error.HTTPError as error:\n", + " print(\"The request failed with status code: \" + str(error.code))\n", + " # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure\n", + " print(error.info())\n", + " print(error.read().decode(\"utf8\", 'ignore'))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, in this example the streamed data content comes back as a single payload. It did not stream as a series of data events as we wanted. To build true streaming capabilities with the API endpoint, we will use the [`requests`](https://requests.readthedocs.io/en/latest/) library instead." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Streaming in Python\n", + "\n", + "The `requests` library is a simple HTTP library for Python built on [`urllib3`](https://github.com/urllib3/urllib3). It automatically maintains keep-alive and HTTP connection pooling. With the `Session` class, we can easily stream the result from our API calls. \n", + "Here is a quick example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import requests\n", + "\n", + "data = {\"messages\":[\n", + " {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n", + " {\"role\":\"user\", \"content\":\"What is good about Wuhan?\"}],\n", + " \"max_tokens\": 500,\n", + " \"temperature\": 0.9,\n", + " \"stream\": True\n", + "}\n", + "\n", + "\n", + "def post_stream(url):\n", + " s = requests.Session()\n", + " api_key = \"your-auth-key\"\n", + " headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n", + "\n", + " with s.post(url, data=json.dumps(data), headers=headers, stream=True) as resp:\n", + " print(resp.status_code)\n", + " for line in resp.iter_lines():\n", + " if line:\n", + " print(line)\n", + "\n", + "\n", + "url = \"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\"\n", + "post_stream(url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Llama 3.1 API with LangChain\n", + "\n", + "In this section, we will demonstrate how to use Llama 3.1 APIs with LangChain, one of the most popular frameworks for accelerating the development of your AI product. \n", + "One common solution here is to create a customized LLM instance, so you can add it to various chains to complete different tasks. \n", + "In this example, we will use the `AzureMLChatOnlineEndpoint` class that LangChain provides to build a customized LLM instance. This particular class is designed to take in the Azure endpoint and API key as inputs and wire them up with HTTP calls. Under the hood, it is very similar to how we used the `urllib.request` library to send RESTful calls to the Azure endpoint in the previous examples. \n", + "\n", + "First, let's install dependencies: \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pip install langchain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once all dependencies are installed, you can directly create an `llm` instance based on `AzureMLChatOnlineEndpoint` as follows: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.chat_models.azureml_endpoint import (\n", + " AzureMLEndpointApiType,\n", + " CustomOpenAIChatContentFormatter,\n", + " AzureMLChatOnlineEndpoint,\n", + ")\n", + "\n", + "from langchain_core.messages import HumanMessage\n", + "\n", + "llm = AzureMLChatOnlineEndpoint(\n", + " endpoint_api_key=\"your-auth-key\",\n", + " endpoint_url=\"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\",\n", + " endpoint_api_type=AzureMLEndpointApiType.serverless,\n", + " model_kwargs={\"temperature\": 0.6, \"max_tokens\": 256, \"top_p\": 0.9},\n", + " content_formatter=CustomOpenAIChatContentFormatter(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, you might wonder what the `CustomOpenAIChatContentFormatter` is in the context of creating the `llm` instance. \n", + "The `CustomOpenAIChatContentFormatter` is a [handler class](https://python.langchain.com/docs/integrations/llms/azure_ml#content-formatter) for transforming the request and response of an AzureML endpoint to match the required schema, since there are various models in the Azure model catalog, each of which needs to handle the data accordingly. \n", + "In our case, we can use the default `CustomOpenAIChatContentFormatter`, which can handle the Llama model schemas. If you need special handling, you can customize this class. \n", + "\n", + "Once you have the `llm` ready, you can simply run inference with it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.invoke([HumanMessage(content=\"What is good about Wuhan?\")])\n", + "response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is an example of creating a translator chain with the `llm` instance to translate English to French:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import LLMChain\n", + "from langchain.prompts import PromptTemplate\n", + "\n", + "template = \"\"\"\n", + "You are a Translator. Translate the following content from {input_language} to {output_language} and reply with only the translated result.\n", + "{input_content}\n", + "\"\"\"\n", + "\n", + "translator_chain = LLMChain(\n", + " llm = llm,\n", + " prompt = PromptTemplate(\n", + " template=template,\n", + " input_variables=[\"input_language\", \"output_language\", \"input_content\"],\n", + " ),\n", + ")\n", + "\n", + "print(translator_chain.run(input_language=\"English\", output_language=\"French\", input_content=\"What is good about Wuhan?\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a chatbot with Llama 3.1 API\n", + "\n", + "In this section, we will build a simple chatbot using the Azure Llama 3.1 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n", + "\n", + "Gradio is a framework that helps you demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 3 on-premises with RAG. \n", + "\n", + "First, let's install Gradio dependencies.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pip install gradio==4.39.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use the `AzureMLChatOnlineEndpoint` class from the previous example. \n", + "In this example, we have three major components: \n", + "1. The chatbot UI, hosted as a web interface by Gradio. This is the UI logic that renders our model predictions.\n", + "2. The model itself, which is the core component that ingests prompts and returns an answer back.\n", + "3. The memory component, which stores previous conversation context. In this example, we will use a [conversation window buffer](https://python.langchain.com/docs/modules/memory/types/buffer_window), which logs context within a certain time window in the past. \n", + "\n", + "All of them are chained together using LangChain."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "import langchain\n", + "from langchain.chains import ConversationChain\n", + "from langchain.prompts import PromptTemplate\n", + "from langchain.memory import ConversationBufferWindowMemory\n", + "from langchain_core.messages import HumanMessage\n", + "from langchain_community.chat_models.azureml_endpoint import (\n", + " AzureMLEndpointApiType,\n", + " CustomOpenAIChatContentFormatter,\n", + " AzureMLChatOnlineEndpoint,\n", + ")\n", + "\n", + "llm = AzureMLChatOnlineEndpoint(\n", + " endpoint_api_key=\"your-auth-key\",\n", + " endpoint_url=\"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\",\n", + " endpoint_api_type=AzureMLEndpointApiType.serverless,\n", + " model_kwargs={\"temperature\": 0.6, \"max_tokens\": 256, \"top_p\": 0.9},\n", + " content_formatter=CustomOpenAIChatContentFormatter(),\n", + ")\n", + "\n", + "langchain.debug=True\n", + "\n", + "#Create memory\n", + "memory = ConversationBufferWindowMemory(llm=llm, k=5, memory_key=\"chat_history\", ai_prefix=\"Assistant\", human_prefix=\"User\")\n", + "\n", + "#Create input prompt template with chat history for chaining\n", + "INPUT_TEMPLATE = \"\"\"Current conversation:\n", + "{chat_history}\n", + "\n", + "User question:{input}\"\"\"\n", + "\n", + "conversation_prompt_template = PromptTemplate(\n", + " input_variables=[\"chat_history\", \"input\"], template=INPUT_TEMPLATE\n", + ")\n", + "\n", + "conversation_chain_with_memory = ConversationChain(\n", + " llm = llm,\n", + " prompt = conversation_prompt_template,\n", + " verbose = True,\n", + " memory = memory,\n", + ")\n", + "\n", + "#Prediction\n", + "def predict(message, history):\n", + " history_format = []\n", + " for user, assistant in history:\n", + " history_format.append({\"role\": \"user\", \"content\": user })\n", + " history_format.append({\"role\": \"assistant\", \"content\":assistant})\n", + " history_format.append({\"role\": \"user\", \"content\": message})\n", + " response = conversation_chain_with_memory.run(input=message)\n", + " return response\n", + "\n", + "#Launch Gradio chatbot interface\n", + "gr.ChatInterface(predict).launch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After successfully executing the code above, a chat interface should appear as the interactive output or you can open the localhost url in your selected browser window. You can see how amazing it is to build a AI chatbot just in few lines of code.\n", + "\n", + "This concludes our tutorial and examples. 
Here are some additional reference: \n", + "* [Fine-tune Llama](https://learn.microsoft.com/azure/ai-studio/how-to/fine-tune-model-llama)\n", + "* [Plan and manage costs (marketplace)](https://learn.microsoft.com/azure/ai-studio/how-to/costs-plan-manage#monitor-costs-for-models-offered-through-the-azure-marketplace)\n" + ] + } + ], + "metadata": { + "fileHeader": "", + "fileUid": "599e1edd-cd59-4e55-823f-17157fc07b18", + "isAdHoc": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/recipes/3p_integrations/azure/README.md b/recipes/3p_integrations/azure/README.md index a33772af35241c26dbfa00c62c35b05a2d77a93d..53fa5739b095046ddf30ce3cdfa66a11c3fc56d6 100644 --- a/recipes/3p_integrations/azure/README.md +++ b/recipes/3p_integrations/azure/README.md @@ -1,5 +1,2 @@ -In this folder, we show various examples in a notebook for running Llama model inference on Azure's serverless API offerings. We will cover: -* HTTP requests API usage for Llama 3 instruct models in CLI -* HTTP requests API usage for Llama 3 instruct models in Python -* Plug the APIs into LangChain -* Wire the model with Gradio to build a simple chatbot with memory +In this folder, we show various recipes for Llama models working with Azure AI services. This includes: +* Examples for running Llama model inference on Azure's serverless API offerings (aka. MaaS) diff --git a/recipes/3p_integrations/azure/azure_api_example.ipynb b/recipes/3p_integrations/azure/azure_api_example.ipynb deleted file mode 100644 index 2b8b36178cf5535c8e7ccfb2b7859dabe1c94ff8..0000000000000000000000000000000000000000 --- a/recipes/3p_integrations/azure/azure_api_example.ipynb +++ /dev/null @@ -1,532 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Use Azure API with Llama 3\n", - "\n", - "This notebook shows examples of how to use Llama 3 APIs offered by Microsoft Azure. 
We will cover: \n", - "* HTTP requests API usage for Llama 3 instruct models in CLI\n", - "* HTTP requests API usage for Llama 3 instruct models in Python\n", - "* Plug the APIs into LangChain\n", - "* Wire the model with Gradio to build a simple chatbot with memory\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite\n", - "\n", - "Before we start building with Azure Llama 3 APIs, there are certain steps we need to take to deploy the models:\n", - "\n", - "* Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)\n", - "* Take a quick look on what is the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) and navigate to the website from the link in the article\n", - "* Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group, or you can also follow the guide [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio)\n", - "* For Llama 3 instruct models from Model catalog, click Deploy in the model page and select \"Pay-as-you-go\". Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.\n", - "* For Llama 3 pretrained models, Azure currently only support manual deployment under regular subscription. We are working with them to bring \"Pay-as-you-go\" for pretrained models.\n", - "\n", - "For more information, you should consult Azure's official documentation [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) for model deployment and inference." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## HTTP Requests API Usage in CLI\n", - "\n", - "### Basics\n", - "\n", - "The usage and schema of the API are identical to Llama 3 API hosted on Azure.\n", - "\n", - "For using the REST API, You will need to have an Endpoint url and Authentication Key associated with that endpoint. \n", - "This can be acquired from previous steps. \n", - "\n", - "In this chat completion example for instruct model, we use a simple curl call for illustration. There are three major components: \n", - "\n", - "* The `host-url` is your endpoint url with completion schema. \n", - "* The `headers` defines the content type as well as your api key. \n", - "* The `payload` or `data`, which is your prompt detail and model hyper parameters." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `host-url` needs to be `/v1/chat/completions` and the request payload to include roles in conversations. 
Here is a sample payload: \n", - "\n", - "```\n", - "{ \n", - " \"messages\": [ \n", - " { \n", - " \"content\": \"You are a helpful assistant.\", \n", - " \"role\": \"system\" \n", - "}, \n", - " { \n", - " \"content\": \"Hello!\", \n", - " \"role\": \"user\" \n", - " } \n", - " ], \n", - " \"max_tokens\": 50, \n", - "} \n", - "```\n", - "\n", - "Here is a sample curl call for chat completion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"Who wrote the book Innovators dilemma?\",\"role\":\"user\"}], \"max_tokens\": 50}'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Streaming\n", - "\n", - "One fantastic feature the API offers is the streaming capability. \n", - "Streaming allows the generated tokens to be sent as data-only server-sent events whenever they become available. \n", - "This is extremely important for interactive applications such as chatbots, so the user is always engaged. \n", - "\n", - "To use streaming, simply set `\"stream\":True` as part of the request payload. \n", - "In the streaming mode, the REST API response will be different from non-streaming mode.\n", - "\n", - "Here is an example: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"Who wrote the book Innovators dilemma?\",\"role\":\"user\"}], \"max_tokens\": 500, \"stream\": True}'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see the result comes back as a stream of `data` objects, each contains generated information including a `choice`. \n", - "The stream terminated by a `data:[DONE]\\n\\n` message." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Content Safety Filtering\n", - "\n", - "All Azure Llama 3 API endpoints have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically. \n", - "To know more about the impact to the request/response payload, please refer to official guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter?tabs=python). \n", - "\n", - "For model input and output, if the filter detects there is harmful content, the generation will error out with a response payload containing the reasoning, along with information on the type of content violation and its severity. 
\n", - "\n", - "Here is an example prompt that triggered content safety filtering:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"How to make bomb?\",\"role\":\"user\"}], \"max_tokens\": 50}'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## HTTP Requests API Usage in Python\n", - "\n", - "Besides calling the API directly from command line tools, you can also programatically call them in Python. \n", - "\n", - "Here is an example for the instruct model:\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import urllib.request\n", - "import json\n", - "\n", - "#Configure payload data sending to API endpoint\n", - "data = {\"messages\":[\n", - " {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n", - " {\"role\":\"user\", \"content\":\"Who wrote the book Innovators dilemma?\"}], \n", - " \"max_tokens\": 500,\n", - " \"temperature\": 0.9,\n", - " \"stream\": True,\n", - "}\n", - "\n", - "body = str.encode(json.dumps(data))\n", - "\n", - "#Replace the url with your API endpoint\n", - "url = 'https://your-endpoint.inference.ai.azure.com/v1/chat/completions'\n", - "\n", - "#Replace this with the key for the endpoint\n", - "api_key = 'your-auth-key'\n", - "if not api_key:\n", - " raise Exception(\"API Key is missing\")\n", - "\n", - "headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n", - "\n", - "req = urllib.request.Request(url, body, headers)\n", - "\n", - "try:\n", - " response = urllib.request.urlopen(req)\n", - " result = response.read()\n", - " print(result)\n", - "except urllib.error.HTTPError as error:\n", - " print(\"The request failed with status code: \" + str(error.code))\n", - " # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure\n", - " print(error.info())\n", - " print(error.read().decode(\"utf8\", 'ignore'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However in this example, the streamed data content returns back as a single payload. It didn't stream as a serial of data events as we wished. To build true streaming capabilities utilizing the API endpoint, we will utilize the [`requests`](https://requests.readthedocs.io/en/latest/) library instead." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Streaming in Python\n", - "\n", - "`Requests` library is a simple HTTP library for Python built with [`urllib3`](https://github.com/urllib3/urllib3). It automatically maintains the keep-alive and HTTP connection pooling. With the `Session` class, we can easily stream the result from our API calls. 
\n", - "\n", - "Here is a quick example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import requests\n", - "\n", - "data = {\"messages\":[\n", - " {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n", - " {\"role\":\"user\", \"content\":\"Who wrote the book Innovators dilemma?\"}],\n", - " \"max_tokens\": 500,\n", - " \"temperature\": 0.9,\n", - " \"stream\": True\n", - "}\n", - "\n", - "\n", - "def post_stream(url):\n", - " s = requests.Session()\n", - " api_key = \"your-auth-key\"\n", - " headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n", - "\n", - " with s.post(url, data=json.dumps(data), headers=headers, stream=True) as resp:\n", - " print(resp.status_code)\n", - " for line in resp.iter_lines():\n", - " if line:\n", - " print(line)\n", - "\n", - "\n", - "url = \"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\"\n", - "post_stream(url)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use Llama 3 API with LangChain\n", - "\n", - "In this section, we will demonstrate how to use Llama 3 APIs with LangChain, one of the most popular framework to accelerate building your AI product. \n", - "One common solution here is to create your customized LLM instance, so you can add it to various chains to complete different tasks. \n", - "In this example, we will use the `AzureMLOnlineEndpoint` class LangChain provides to build a customized LLM instance. This particular class is designed to take in Azure endpoint and API keys as inputs and wire it with HTTP calls. So the underlying of it is very similar to how we used `urllib.request` library to send RESTful calls in previous examples to the Azure Endpoint. 
\n", - "\n", - "First, let's install dependencies: \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install langchain" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once all dependencies are installed, you can directly create a `llm` instance based on `AzureMLOnlineEndpoint` as follows: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.llms.azureml_endpoint import AzureMLOnlineEndpoint, ContentFormatterBase\n", - "from typing import Dict\n", - "import json\n", - "\n", - "\n", - "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n", - "#Content formatter for Llama 3 API for Azure MaaS\n", - "\n", - " def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n", - " #Formats the request according to the chosen api\n", - " prompt = ContentFormatterBase.escape_special_characters(prompt)\n", - " request_payload_dict = {\n", - " \"messages\": [\n", - " {\"role\":\"system\", \"content\":\"You are a helpful assistant\"},\n", - " {\"role\":\"user\", \"content\":f\"{prompt}\"}\n", - " ] \n", - " }\n", - " #Add model parameters as part of the dict\n", - " request_payload_dict.update(model_kwargs)\n", - " request_payload = json.dumps(request_payload_dict)\n", - " return str.encode(request_payload)\n", - "\n", - " def format_response_payload(self, output: bytes) -> str:\n", - " #Formats response\n", - " return json.loads(output)[\"choices\"][0][\"message\"][\"content\"]\n", - "\n", - "\n", - "content_formatter = AzureLlamaAPIContentFormatter()\n", - "\n", - "llm = AzureMLOnlineEndpoint(\n", - " endpoint_api_key=\"your-auth-key\",\n", - " endpoint_url=\"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\",\n", - " model_kwargs={\"temperature\": 0.6, \"max_tokens\": 512, \"top_p\": 0.9},\n", - " content_formatter=content_formatter,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, you might wonder what is the `content_formatter` in the context when creating the `llm` instance? \n", - "The `content_formatter` parameter is a [handler class](https://python.langchain.com/docs/integrations/llms/azure_ml#content-formatter) for transforming the request and response of an AzureML endpoint to match with required schema. Since there are various models in the Azure model catalog, each of which needs to handle the data accordingly. \n", - "In our case, all current formatters provided by Langchain including `LLamaContentFormatter` don't follow the schema. So we created our own customized formatter called `AzureLlamaAPIContentFormatter` to handle the input and output data. \n", - "\n", - "Once you have the `llm` ready, you can simple inference it by:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(llm(\"Who wrote the book Innovators dilemma?\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is an example that you can create a translator chain with the `llm` instance and translate English to French:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains import LLMChain\n", - "from langchain.prompts import PromptTemplate\n", - "\n", - "template = \"\"\"\n", - "You are a Translator. 
Translate the following content from {input_language} to {output_language} and reply with only the translated result.\n", - "{input_content}\n", - "\"\"\"\n", - "\n", - "translator_chain = LLMChain(\n", - " llm = llm,\n", - " prompt = PromptTemplate(\n", - " template=template,\n", - " input_variables=[\"input_language\", \"output_language\", \"input_content\"],\n", - " ),\n", - ")\n", - "\n", - "print(translator_chain.run(input_language=\"English\", output_language=\"French\", input_content=\"Who wrote the book Innovators dilemma?\"))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build a chatbot with Llama 3 API\n", - "\n", - "In this section, we will build a simple chatbot using Azure Llama 3 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n", - "\n", - "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 3 on-premises with RAG. \n", - "\n", - "First, let's install Gradio dependencies.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "pip install gradio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's use `AzureMLOnlineEndpoint` class from the previous example. \n", - "In this example, we have three major components: \n", - "1. Chatbot UI hosted as web interface by Gradio. These are the UI logics that render our model predictions.\n", - "2. Model itself, which is the core component that ingests prompts and returns an answer back.\n", - "3. Memory component, which stores previous conversation context. In this example, we will use [conversation window buffer](https://python.langchain.com/docs/modules/memory/types/buffer_window) which logs context in certain time window in the past. \n", - "\n", - "All of them are chained together using LangChain." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gradio as gr\n", - "from langchain.chains import ConversationChain\n", - "from langchain.prompts import PromptTemplate\n", - "from langchain.llms.azureml_endpoint import AzureMLOnlineEndpoint, ContentFormatterBase\n", - "from langchain.memory import ConversationBufferWindowMemory\n", - "\n", - "import langchain\n", - "from typing import Dict\n", - "import json\n", - "\n", - "langchain.debug=True\n", - "\n", - "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n", - "#Content formatter for Llama 3 API for Azure MaaS\n", - "\n", - " def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n", - " #Formats the request according to the chosen api\n", - " prompt = ContentFormatterBase.escape_special_characters(prompt)\n", - "\n", - " #Note how we instructed the model with system prompts. Past conversation can be past as in system prompt as well\n", - " request_payload_dict = {\n", - " \"messages\": [\n", - " {\"role\":\"system\", \"content\":\"The following is a conversation between a user and you. Answer the user question based on the conversation. 
Provide your answer only\"},\n", - " {\"role\":\"user\", \"content\":f\"{prompt}\"}\n", - " ] \n", - " }\n", - " request_payload_dict.update(model_kwargs)\n", - " request_payload = json.dumps(request_payload_dict)\n", - " return str.encode(request_payload)\n", - "\n", - " def format_response_payload(self, output: bytes) -> str:\n", - " #Formats response\n", - " return json.loads(output)[\"choices\"][0][\"message\"][\"content\"]\n", - "\n", - "#Create content fomartter\n", - "content_formatter = AzureLlamaAPIContentFormatter()\n", - "\n", - "#Create llm instance\n", - "llm = AzureMLOnlineEndpoint(\n", - " endpoint_api_key=\"your-auth-key\",\n", - " endpoint_url=\"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\",\n", - " model_kwargs={\"temperature\": 0.6, \"max_tokens\": 128, \"top_p\": 0.9},\n", - " content_formatter=content_formatter,\n", - ")\n", - "\n", - "#Create memory\n", - "memory = ConversationBufferWindowMemory(llm=llm, k=5, memory_key=\"chat_history\", ai_prefix=\"Assistant\", human_prefix=\"User\")\n", - "\n", - "#Create input prompt template with chat history for chaining\n", - "INPUT_TEMPLATE = \"\"\"Current conversation:\n", - "{chat_history}\n", - "\n", - "User question:{input}\"\"\"\n", - "\n", - "conversation_prompt_template = PromptTemplate(\n", - " input_variables=[\"chat_history\", \"input\"], template=INPUT_TEMPLATE\n", - ")\n", - "\n", - "conversation_chain_with_memory = ConversationChain(\n", - " llm = llm,\n", - " prompt = conversation_prompt_template,\n", - " verbose = True,\n", - " memory = memory,\n", - ")\n", - "\n", - "#Prediction\n", - "def predict(message, history):\n", - " history_format = []\n", - " for user, assistant in history:\n", - " history_format.append({\"role\": \"user\", \"content\": user })\n", - " history_format.append({\"role\": \"assistant\", \"content\":assistant})\n", - " history_format.append({\"role\": \"user\", \"content\": message})\n", - " response = conversation_chain_with_memory.run(input=message)\n", - " return response\n", - "\n", - "#Launch Gradio chatbot interface\n", - "gr.ChatInterface(predict).launch()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After successfully executing the code above, a chat interface should appear as the interactive output or you can open the localhost url in your selected browser window. \n", - "\n", - "This concludes our tutorial and examples. 
Here are some additional reference: \n", - "* [Fine-tune Llama](https://learn.microsoft.com/azure/ai-studio/how-to/fine-tune-model-llama)\n", - "* [Plan and manage costs (marketplace)](https://learn.microsoft.com/azure/ai-studio/how-to/costs-plan-manage#monitor-costs-for-models-offered-through-the-azure-marketplace)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/get_default_finetune_args.py b/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/get_default_finetune_args.py index 6cddb41a6576067e59693690b34b5bc95f001de8..3d24dae56469e086554e41b794fbc66e809f6804 100644 --- a/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/get_default_finetune_args.py +++ b/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/get_default_finetune_args.py @@ -1,7 +1,7 @@ def get_default_finetune_args(): return { - "learning_rate": 3e-4, - "max_steps": 360, + "learning_rate": 0.0003, + "max_steps": 60, "early_stopping": False, "load_best_model_at_end": False, "peft_args": {"r_value": 32}, diff --git a/recipes/README.md b/recipes/README.md index 9b5234eec15326e04ff056f2025e3ed5f2e0f981..86d90b7e0d04dfe6f8452869fc454f5d2f5ea649 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -4,8 +4,8 @@ This folder contains examples organized by topic: | Subfolder | Description | |---|---| -[quickstart](./quickstart)|The "Hello World" of using Llama 3, start here if you are new to using Llama 3 -[use_cases](./use_cases)|Scripts showing common applications of Llama 3 -[3p_integrations](./3p_integrations)|Partner-owned folder showing Meta Llama 3 usage along with third-party tools +[quickstart](./quickstart)|The "Hello World" of using Llama, start here if you are new to using Llama +[use_cases](./use_cases)|Scripts showing common applications of Llama +[3p_integrations](./3p_integrations)|Partner-owned folder showing Llama usage along with third-party tools [responsible_ai](./responsible_ai)|Scripts to use PurpleLlama for safeguarding model outputs -[experimental](./experimental)|Meta Llama implementations of experimental LLM techniques +[experimental](./experimental)| Llama implementations of experimental LLM techniques diff --git a/recipes/quickstart/Getting_to_know_Llama.ipynb b/recipes/quickstart/Getting_to_know_Llama.ipynb index b3dbf21c57850e74fcadc2496e9f51cd43f14871..caecb672eaaf4d0abe6d670d8cce863e5f31ad73 100644 --- a/recipes/quickstart/Getting_to_know_Llama.ipynb +++ b/recipes/quickstart/Getting_to_know_Llama.ipynb @@ -15,8 +15,8 @@ "id": "LERqQn5v8-ak" }, "source": [ - "# **Getting to know Llama 3: Everything you need to start building**\n", - "Our goal in this session is to provide a guided tour of Llama 3 with comparison with Llama 2, including understanding different Llama 3 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 3 projects." 
+ "# **Getting to know Llama 3.1: Everything you need to start building**\n", + "Our goal in this session is to provide a guided tour of Llama 3.1 with a comparison to Llama 2, including understanding different Llama 3.1 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with starter code for you to take and use in your Llama 3.1 projects." ] }, { @@ -113,6 +113,20 @@ " llama-3-70b --> llama-3-70b-instruct\n", " classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n", " \"\"\")\n", + " \n", + "def llama3_1_family():\n", + " mm(\"\"\"\n", + " graph LR;\n", + " llama-3-1 --> llama-3-1-8b\n", + " llama-3-1 --> llama-3-1-70b\n", + " llama-3-1 --> llama-3-1-405b\n", + " llama-3-1-8b --> llama-3-1-8b\n", + " llama-3-1-8b --> llama-3-1-8b-instruct\n", + " llama-3-1-70b --> llama-3-1-70b\n", + " llama-3-1-70b --> llama-3-1-70b-instruct\n", + " llama-3-1-405b --> llama-3-1-405b-instruct\n", + " classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n", + " \"\"\")\n", "\n", "import ipywidgets as widgets\n", "from IPython.display import display, Markdown\n", @@ -184,7 +198,7 @@ "id": "i4Np_l_KtIno" }, "source": [ - "### **1 - Understanding Llama 3**" + "### **1 - Understanding Llama 3.1**" ] }, { @@ -193,13 +207,13 @@ "id": "PGPSI3M5PGTi" }, "source": [ - "### **1.1 - What is Llama 3?**\n", + "### **1.1 - What is Llama 3.1?**\n", "\n", "* State of the art (SOTA), Open Source LLM\n", - "* 8B, 70B - base and instruct models\n", + "* 8B, 70B, 405B - base and instruct models\n", "* Choosing model: Size, Quality, Cost, Speed\n", "* Pretrained + Chat\n", - "* [Meta Llama 3 Blog](https://ai.meta.com/blog/meta-llama-3/)\n", + "* [Meta Llama 3.1 Blog](https://ai.meta.com/blog/meta-llama-3-1/)\n", "* [Getting Started with Meta Llama](https://llama.meta.com/docs/get-started)" ] }, @@ -238,13 +252,22 @@ "llama3_family()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llama3_1_family()" + ] + }, + { "cell_type": "markdown", "metadata": { "id": "aYeHVVh45bdT" }, "source": [ - "### **1.2 - Accessing Llama 3**\n", + "### **1.2 - Accessing Llama 3.1**\n", "* Download + Self Host (i.e. [download Llama](https://ai.meta.com/resources/models-and-libraries/llama-downloads))\n", "* Hosted API Platform (e.g. 
[Groq](https://console.groq.com/), [Replicate](https://replicate.com/meta/meta-llama-3-8b-instruct), [Together](https://api.together.xyz/playground/language/meta-llama/Llama-3-8b-hf), [Anyscale](https://app.endpoints.anyscale.com/playground))\n", "\n", @@ -258,7 +281,7 @@ "id": "kBuSay8vtzL4" }, "source": [ - "### **1.3 - Use Cases of Llama 3**\n", + "### **1.3 - Use Cases of Llama 3.1**\n", "* Content Generation\n", "* Summarization\n", "* General Chatbots\n", @@ -943,7 +966,7 @@ "import bs4\n", "\n", "# Step 1: Load the document from a web url\n", - "loader = WebBaseLoader([\"https://huggingface.co/blog/llama3\"])\n", + "loader = WebBaseLoader([\"https://huggingface.co/blog/llama31\"])\n", "documents = loader.load()\n", "\n", "# Step 2: Split the document into chunks with a specified chunk size\n", @@ -1013,8 +1036,8 @@ "source": [ "# This time your previous question and answer will be included as a chat history which will enable the ability\n", "# to ask follow up questions.\n", - "chat_history = [(query, result[\"answer\"])]\n", "query = \"What two sizes?\"\n", + "chat_history = [(query, result[\"answer\"])]\n", "result = chain({\"question\": query, \"chat_history\": chat_history})\n", "md(result['answer'])" ] @@ -1079,7 +1102,7 @@ }, "source": [ "#### **Resources**\n", - "- [Meta Llama 3 Blog](https://ai.meta.com/blog/meta-llama-3/)\n", + "- [Meta Llama 3.1 Blog](https://ai.meta.com/blog/meta-llama-3-1/)\n", "- [Getting Started with Meta Llama](https://llama.meta.com/docs/get-started)\n", "- [Llama 3 repo](https://github.com/meta-llama/llama3)\n", "- [Llama 3 model card](https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md)\n", @@ -1088,6 +1111,11 @@ "- [Acceptable Use Policy](https://ai.meta.com/llama/use-policy/)\n", "\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/recipes/quickstart/Prompt_Engineering_with_Llama_3.ipynb b/recipes/quickstart/Prompt_Engineering_with_Llama_3.ipynb index 0132099570f7e02a7ed15ce0a774cd16f43dbd3f..f9e70566614b086c1be0db9463da04bba754ef08 100644 --- a/recipes/quickstart/Prompt_Engineering_with_Llama_3.ipynb +++ b/recipes/quickstart/Prompt_Engineering_with_Llama_3.ipynb @@ -7,11 +7,11 @@ "source": [ "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/Prompt_Engineering_with_Llama_3.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n", "\n", - "# Prompt Engineering with Llama 3\n", + "# Prompt Engineering with Llama 3.1\n", "\n", "Prompt engineering is using natural language to produce a desired response from a large language model (LLM).\n", "\n", - "This interactive guide covers prompt engineering & best practices with Llama 3." + "This interactive guide covers prompt engineering & best practices with Llama 3.1." ] }, { @@ -45,6 +45,15 @@ "\n", "Llama models come in varying parameter sizes. The smaller models are cheaper to deploy and run; the larger models are more capable.\n", "\n", + "#### Llama 3.1\n", + "1. `llama-3.1-8b` - base pretrained 8 billion parameter model\n", + "1. `llama-3.1-70b` - base pretrained 70 billion parameter model\n", + "1. `llama-3.1-405b` - base pretrained 405 billion parameter model\n", + "1. `llama-3.1-8b-instruct` - instruction fine-tuned 8 billion parameter model\n", + "1. `llama-3.1-70b-instruct` - instruction fine-tuned 70 billion parameter model\n", + "1. 
`llama-3.1-405b-instruct` - instruction fine-tuned 405 billion parameter model (flagship)\n", + "\n", + "\n", "#### Llama 3\n", "1. `llama-3-8b` - base pretrained 8 billion parameter model\n", "1. `llama-3-70b` - base pretrained 70 billion parameter model\n", @@ -133,7 +142,7 @@ "\n", "Tokens matter most when you consider API pricing and internal behavior (ex. hyperparameters).\n", "\n", - "Each model has a maximum context length that your prompt cannot exceed. That's 8K tokens for Llama 3, 4K for Llama 2, and 100K for Code Llama. \n" + "Each model has a maximum context length that your prompt cannot exceed. That's 128k tokens for Llama 3.1, 4K for Llama 2, and 100K for Code Llama.\n" ] }, { @@ -143,7 +152,7 @@ "source": [ "## Notebook Setup\n", "\n", - "The following APIs will be used to call LLMs throughout the guide. As an example, we'll call Llama 3 chat using [Grok](https://console.groq.com/playground?model=llama3-70b-8192).\n", + "The following APIs will be used to call LLMs throughout the guide. As an example, we'll call Llama 3.1 chat using [Groq](https://console.groq.com/playground?model=llama3-70b-8192).\n", "\n", "To install prerequisites run:" ] @@ -171,8 +180,9 @@ "# Get a free API key from https://console.groq.com/keys\n", "os.environ[\"GROQ_API_KEY\"] = \"YOUR_GROQ_API_KEY\"\n", "\n", - "LLAMA3_70B_INSTRUCT = \"llama3-70b-8192\"\n", - "LLAMA3_8B_INSTRUCT = \"llama3-8b-8192\"\n", + "LLAMA3_405B_INSTRUCT = \"llama-3.1-405b-reasoning\" # Note: Groq currently only gives access here to paying customers for 405B model\n", + "LLAMA3_70B_INSTRUCT = \"llama-3.1-70b-versatile\"\n", + "LLAMA3_8B_INSTRUCT = \"llama-3.1-8b-instant\"\n", "\n", "DEFAULT_MODEL = LLAMA3_70B_INSTRUCT\n", "\n", @@ -225,7 +235,7 @@ "source": [ "### Completion APIs\n", "\n", - "Let's try Llama 3!" + "Let's try Llama 3.1!" ] }, @@ -488,7 +498,7 @@ "\n", "Simply adding a phrase encouraging step-by-step thinking \"significantly improves the ability of large language models to perform complex reasoning\" ([Wei et al. (2022)](https://arxiv.org/abs/2201.11903)). This technique is called \"CoT\" or \"Chain-of-Thought\" prompting.\n", "\n", - "Llama 3 now reasons step-by-step naturally without the addition of the phrase. This section remains for completeness." + "Llama 3.1 now reasons step-by-step naturally without the addition of the phrase. This section remains for completeness." ] }, @@ -704,7 +714,7 @@ "source": [ "### Limiting Extraneous Tokens\n", "\n", - "A common struggle with Llama 2 is getting output without extraneous tokens (ex. \"Sure! Here's more information on...\"), even if explicit instructions are given to Llama 2 to be concise and no preamble. Llama 3 can better follow instructions.\n", + "A common struggle with Llama 2 is getting output without extraneous tokens (ex. \"Sure! Here's more information on...\"), even if explicit instructions are given to Llama 2 to be concise and no preamble. 
Llama 3.x can better follow instructions.\n", "\n", "Check out this improvement that combines a role, rules and restrictions, explicit instructions, and an example:" ] diff --git a/recipes/quickstart/RAG/hello_llama_cloud.ipynb b/recipes/quickstart/RAG/hello_llama_cloud.ipynb index ba2a08e8061551e1c66da2aadeca27a4b8a19591..a608024c2bd5b59623ede2491d46145858ce458e 100644 --- a/recipes/quickstart/RAG/hello_llama_cloud.ipynb +++ b/recipes/quickstart/RAG/hello_llama_cloud.ipynb @@ -8,11 +8,11 @@ "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/use_cases/RAG/HelloLlamaCloud.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n", "\n", "## This demo app shows:\n", - "* How to run Llama 3 in the cloud hosted on Replicate\n", + "* How to run Llama 3.1 in the cloud hosted on Replicate\n", "* How to use LangChain to ask Llama general questions and follow up questions\n", - "* How to use LangChain to load a recent web page - Hugging Face's [blog post on Llama 3](https://huggingface.co/blog/llama3) - and chat about it. This is the well known RAG (Retrieval Augmented Generation) method to let LLM such as Llama 3 be able to answer questions about the data not publicly available when Llama 3 was trained, or about your own data. RAG is one way to prevent LLM's hallucination\n", + "* How to use LangChain to load a recent web page - Hugging Face's [blog post on Llama 3.1](https://huggingface.co/blog/llama31) - and chat about it. This is the well known RAG (Retrieval Augmented Generation) method to let LLM such as Llama 3 be able to answer questions about the data not publicly available when Llama 3 was trained, or about your own data. RAG is one way to prevent LLM's hallucination\n", "\n", - "**Note** We will be using [Replicate](https://replicate.com/meta/meta-llama-3-8b-instruct) to run the examples here. You will need to first sign in with Replicate with your github account, then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while. You can also use other Llama 3 cloud providers such as [Groq](https://console.groq.com/), [Together](https://api.together.xyz/playground/language/meta-llama/Llama-3-8b-hf), or [Anyscale](https://app.endpoints.anyscale.com/playground) - see Section 2 of the Getting to Know Llama [notebook](https://github.com/meta-llama/llama-recipes/blob/main/recipes/quickstart/Getting_to_know_Llama.ipynb) for more information." + "**Note** We will be using [Replicate](https://replicate.com/meta/meta-llama-3.1-405b-instruct) to run the examples here. You will need to first sign in with Replicate with your github account, then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while. You can also use other Llama 3.1 cloud providers such as [Groq](https://console.groq.com/), [Together](https://api.together.xyz/playground/language/meta-llama/Llama-3-8b-hf), or [Anyscale](https://app.endpoints.anyscale.com/playground) - see Section 2 of the Getting to Know Llama [notebook](https://github.com/meta-llama/llama-recipes/blob/main/recipes/quickstart/Getting_to_know_Llama.ipynb) for more information." 
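As a quick, hedged illustration of the alternative providers mentioned in the note above (this sketch is not part of the original notebook), the snippet below queries a Llama 3.1 model on Groq through LangChain. It assumes the `langchain-groq` package is installed, a Groq API key is available, and that the model name follows Groq's current naming; treat all of these as assumptions rather than requirements of this demo.

```python
# Illustrative sketch only: using Groq instead of Replicate via LangChain.
# Assumes `pip install langchain-groq` and a valid GROQ_API_KEY; the model
# name is an assumption based on Groq's current model naming.
import os
from langchain_groq import ChatGroq

os.environ["GROQ_API_KEY"] = "YOUR_GROQ_API_KEY"  # placeholder

llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0)
print(llm.invoke("In one sentence, who wrote the book The Godfather?").content)
```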
] }, { @@ -23,7 +23,7 @@ "Let's start by installing the necessary packages:\n", "- sentence-transformers for text embeddings\n", "- FAISS gives us database capabilities \n", - "- LangChai provides necessary RAG tools for this demo" + "- LangChain provides necessary RAG tools for this demo" ] }, { @@ -59,7 +59,7 @@ "id": "3e8870c1", "metadata": {}, "source": [ - "Next we call the Llama 3 8b chat model from Replicate. You can also use Llama 3 70b model by replacing the `model` name with \"meta/meta-llama-3-70b-instruct\"." + "Next we call the Llama 3.1 405b chat model from Replicate. You can also use Llama 3 8B or 70B model by replacing the `model` name with the respective model URL(s)." ] }, { @@ -71,7 +71,7 @@ "source": [ "from langchain_community.llms import Replicate\n", "llm = Replicate(\n", - " model=\"meta/meta-llama-3-8b-instruct\",\n", + " model=\"meta/meta-llama-3.1-405b-instruct\",\n", " model_kwargs={\"temperature\": 0.0, \"top_p\": 1, \"max_new_tokens\":500}\n", ")" ] @@ -189,8 +189,8 @@ "id": "fc436163", "metadata": {}, "source": [ - "Next, let's explore using Llama 3 to answer questions using documents for context. \n", - "This gives us the ability to update Llama 3's knowledge thus giving it better context without needing to finetune. " + "Next, let's explore using Llama 3.1 to answer questions using documents for context. \n", + "This gives us the ability to update Llama 3.1's knowledge thus giving it better context without needing to finetune. " ] }, { @@ -246,7 +246,7 @@ "\n", "In general, you should use larger chuck sizes for highly structured text such as code and smaller size for less structured text. You may need to experiment with different chunk sizes and overlap values to find out the best numbers.\n", "\n", - "We then use `RetrievalQA` to retrieve the documents from the vector database and give the model more context on Llama 3, thereby increasing its knowledge.\n", + "We then use `RetrievalQA` to retrieve the documents from the vector database and give the model more context on Llama 3.1, thereby increasing its knowledge. 3.1 also really shines with the new 128k context!\n", "\n", "For each question, LangChain performs a semantic similarity search of it in the vector db, then passes the search results as the context to Llama to answer the question." ] diff --git a/recipes/quickstart/README.md b/recipes/quickstart/README.md index 4c82bfbbdcfe5a9e1ac38d10d7e9de7dc6f97a7c..326cbdb29d337676ec0ea1e4b9eb8466b6d6cd7d 100644 --- a/recipes/quickstart/README.md +++ b/recipes/quickstart/README.md @@ -2,28 +2,8 @@ If you are new to developing with Meta Llama models, this is where you should start. This folder contains introductory-level notebooks across different techniques relating to Meta Llama. -* The [Running_Llama3_Anywhere](./Running_Llama3_Anywhere/) notebooks demonstrate how to run Llama inference across Linux, Mac and Windows platforms using the appropriate tooling. -* The [Prompt_Engineering_with_Llama_3](./Prompt_Engineering_with_Llama_3.ipynb) notebook showcases the various ways to elicit appropriate outputs from Llama. Take this notebook for a spin to get a feel for how Llama responds to different inputs and generation parameters. +* The [Running_Llama_Anywhere](./Running_Llama3_Anywhere/) notebooks demonstrate how to run Llama inference across Linux, Mac and Windows platforms using the appropriate tooling. +* The [Prompt_Engineering_with_Llama](./Prompt_Engineering_with_Llama_3.ipynb) notebook showcases the various ways to elicit appropriate outputs from Llama. 
Take this notebook for a spin to get a feel for how Llama responds to different inputs and generation parameters. * The [inference](./inference/) folder contains scripts to deploy Llama for inference on server and mobile. See also [3p_integrations/vllm](../3p_integrations/vllm/) and [3p_integrations/tgi](../3p_integrations/tgi/) for hosting Llama on open-source model servers. -* The [RAG](./RAG/) folder contains a simple Retrieval-Augmented Generation application using Llama 3. -* The [finetuning](./finetuning/) folder contains resources to help you finetune Llama 3 on your custom datasets, for both single- and multi-GPU setups. The scripts use the native llama-recipes finetuning code found in [finetuning.py](../../src/llama_recipes/finetuning.py) which supports these features: - -| Feature | | -| ---------------------------------------------- | - | -| HF support for finetuning | ✅ | -| Deferred initialization ( meta init) | ✅ | -| HF support for inference | ✅ | -| Low CPU mode for multi GPU | ✅ | -| Mixed precision | ✅ | -| Single node quantization | ✅ | -| Flash attention | ✅ | -| PEFT | ✅ | -| Activation checkpointing FSDP | ✅ | -| Hybrid Sharded Data Parallel (HSDP) | ✅ | -| Dataset packing & padding | ✅ | -| BF16 Optimizer ( Pure BF16) | ✅ | -| Profiling & MFU tracking | ✅ | -| Gradient accumulation | ✅ | -| CPU offloading | ✅ | -| FSDP checkpoint conversion to HF for inference | ✅ | -| W&B experiment tracker | ✅ | +* The [RAG](./RAG/) folder contains a simple Retrieval-Augmented Generation application using Llama. +* The [finetuning](./finetuning/) folder contains resources to help you finetune Llama on your custom datasets, for both single- and multi-GPU setups. The scripts use the native llama-recipes finetuning code found in [finetuning.py](../../src/llama_recipes/finetuning.py) which supports these features: diff --git a/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb b/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb index 1935c4c327f8bc0a4ba3dc5e52ffe62f38f06b83..06f0e4094afaac2114c9a9ebdffd3b24026cb801 100644 --- a/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb +++ b/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb @@ -4,8 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Running Meta Llama 3 on Google Colab using Hugging Face transformers library\n", - "This notebook goes over how you can set up and run Llama 3 using Hugging Face transformers library\n", + "## Running Meta Llama 3.1 on Google Colab using Hugging Face transformers library\n", + "This notebook goes over how you can set up and run Llama 3.1 using Hugging Face transformers library\n", "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/Running_Llama2_Anywhere/Running_Llama_on_HF_transformers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" ] }, @@ -14,7 +14,7 @@ "metadata": {}, "source": [ "### Steps at a glance:\n", - "This demo showcases how to run the example with already converted Llama 3 weights on [Hugging Face](https://huggingface.co/meta-llama). 
Please Note: To use the downloads on Hugging Face, you must first request a download as shown in the steps below making sure that you are using the same email address as your Hugging Face account.\n", + "This demo showcases how to run the example with already converted Llama 3.1 weights on [Hugging Face](https://huggingface.co/meta-llama). Please Note: To use the downloads on Hugging Face, you must first request a download as shown in the steps below making sure that you are using the same email address as your Hugging Face account.\n", "\n", "To use already converted weights, start here:\n", "1. Request download of model weights from the Llama website\n", @@ -45,7 +45,7 @@ "Request download of model weights from the Llama website\n", "Before you can run the model locally, you will need to get the model weights. To get the model weights, visit the [Llama website](https://llama.meta.com/) and click on “download modelsâ€. \n", "\n", - "Fill the required information, select the models “Meta Llama 3†and accept the terms & conditions. You will receive a URL in your email in a short time." + "Fill the required information, select the models “Meta Llama 3.1†and accept the terms & conditions. You will receive a URL in your email in a short time." ] }, { @@ -94,7 +94,7 @@ "source": [ "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3.1-8B-Instruct`. Using Meta models from Hugging Face requires you to\n", "\n", - "1. Accept Terms of Service for Meta Llama 3 on Meta [website](https://llama.meta.com/llama-downloads).\n", + "1. Accept Terms of Service for Meta Llama 3.1 on Meta [website](https://llama.meta.com/llama-downloads).\n", "2. Use the same email address from Step (1) to login into Hugging Face.\n", "\n", "Follow the instructions on this Hugging Face page to login from your [terminal](https://huggingface.co/docs/huggingface_hub/en/quick-start). " @@ -208,7 +208,7 @@ "#### 2. Clone the llama repo and get the weights\n", "Git clone the [Meta Llama 3 repo](https://github.com/meta-llama/llama3). Run the `download.sh` script and follow the instructions. This will download the model checkpoints and tokenizer.\n", "\n", - "This example demonstrates a Meta Llama 3 model with 8B-instruct parameters, but the steps we follow would be similar for other llama models, as well as for other parameter models." + "This example demonstrates a Meta Llama 3.1 model with 8B-instruct parameters, but the steps we follow would be similar for other llama models, as well as for other parameter models." ] }, { @@ -223,7 +223,7 @@ "* `cd transformers`\n", "* `pip install -e .`\n", "* `pip install torch tiktoken blobfile accelerate`\n", - "* `python3 src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir ${path_to_meta_downloaded_model} --output_dir ${path_to_save_converted_hf_model} --model_size 8B --llama_version 3`" + "* `python3 src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir ${path_to_meta_downloaded_model} --output_dir ${path_to_save_converted_hf_model} --model_size 8B --llama_version 3.1`" ] }, { @@ -233,7 +233,7 @@ "\n", "#### 4. 
Prepare the script\n", "Import the following necessary modules in your script: \n", - "* `AutoModel` is the Llama 2 model class\n", + "* `AutoModel` is the Llama 3 model class\n", "* `AutoTokenizer` prepares your prompt for the model to process\n", "* `pipeline` is an abstraction to generate model outputs" ] diff --git a/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_Mac_Windows_Linux.ipynb b/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_Mac_Windows_Linux.ipynb index 48a7cab70dfc3e9c356fc9f9cef7865af994bd98..0a5f43059bc4af06884a319c21134bf3ce014d3b 100644 --- a/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_Mac_Windows_Linux.ipynb +++ b/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_Mac_Windows_Linux.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "## Running Llama 3 on Mac, Windows or Linux\n", - "This notebook goes over how you can set up and run Llama 3 locally on a Mac, Windows or Linux using [Ollama](https://ollama.com/)." + "This notebook goes over how you can set up and run Llama 3.1 locally on a Mac, Windows or Linux using [Ollama](https://ollama.com/)." ] }, { @@ -14,9 +14,9 @@ "source": [ "### Steps at a glance:\n", "1. Download and install Ollama.\n", - "2. Download and test run Llama 3.\n", - "3. Use local Llama 3 via Python.\n", - "4. Use local Llama 3 via LangChain.\n" + "2. Download and test run Llama 3.1\n", + "3. Use local Llama 3.1 via Python.\n", + "4. Use local Llama 3.1 via LangChain.\n" ] }, { @@ -36,16 +36,16 @@ "source": [ "#### 2. Download and test run Llama 3\n", "\n", - "On a terminal or console, run `ollama pull llama3` to download the Llama 3 8b chat model, in the 4-bit quantized format with size about 4.7 GB.\n", + "On a terminal or console, run `ollama pull llama3.1` to download the Llama 3.1 8b chat model, in the 4-bit quantized format with size about 4.7 GB.\n", "\n", - "Run `ollama pull llama3:70b` to download the Llama 3 70b chat model, also in the 4-bit quantized format with size 39GB.\n", + "Run `ollama pull llama3.1:70b` to download the Llama 3.1 70b chat model, also in the 4-bit quantized format with size 39GB.\n", "\n", - "Then you can run `ollama run llama3` and ask Llama 3 questions such as \"who wrote the book godfather?\" or \"who wrote the book godfather? answer in one sentence.\" You can also try `ollama run llama3:70b`, but the inference speed will most likely be too slow - for example, on an Apple M1 Pro with 32GB RAM, it takes over 10 seconds to generate one token using Llama 3 70b chat (vs over 10 tokens per second with Llama 3 8b chat).\n", + "Then you can run `ollama run llama3.1` and ask Llama 3.1 questions such as \"who wrote the book godfather?\" or \"who wrote the book godfather? answer in one sentence.\" You can also try `ollama run llama3.1:70b`, but the inference speed will most likely be too slow - for example, on an Apple M1 Pro with 32GB RAM, it takes over 10 seconds to generate one token using Llama 3.1 70b chat (vs over 10 tokens per second with Llama 3.1 8b chat).\n", "\n", - "You can also run the following command to test Llama 3 8b chat:\n", + "You can also run the following command to test Llama 3.1 8b chat:\n", "```\n", " curl http://localhost:11434/api/chat -d '{\n", - " \"model\": \"llama3\",\n", + " \"model\": \"llama3.1\",\n", " \"messages\": [\n", " {\n", " \"role\": \"user\",\n", @@ -63,7 +63,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3. Use local Llama 3 via Python\n", + "#### 3. 
Use local Llama 3.1 via Python\n", "\n", "The Python code below is the port of the curl command above." ] @@ -81,7 +81,7 @@ "\n", "def llama3(prompt):\n", " data = {\n", - " \"model\": \"llama3\",\n", + " \"model\": \"llama3.1\",\n", " \"messages\": [\n", " {\n", " \"role\": \"user\",\n", @@ -114,7 +114,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 4. Use local Llama 3 via LangChain\n", + "#### 4. Use local Llama 3.1 via LangChain\n", "\n", "Code below use LangChain with Ollama to query Llama 3 running locally. For a more advanced example of using local Llama 3 with LangChain and agent-powered RAG, see [this](https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_rag_agent_llama3_local.ipynb)." ] @@ -136,7 +136,7 @@ "source": [ "from langchain_community.chat_models import ChatOllama\n", "\n", - "llm = ChatOllama(model=\"llama3\", temperature=0)\n", + "llm = ChatOllama(model=\"llama3.1\", temperature=0)\n", "response = llm.invoke(\"who wrote the book godfather?\")\n", "print(response.content)\n" ] diff --git a/recipes/quickstart/finetuning/README.md b/recipes/quickstart/finetuning/README.md index 102733d64a3686541b40de91eade017204d1d822..aea8cbc497efe84dd7bb2dde0310aae3e86a751f 100644 --- a/recipes/quickstart/finetuning/README.md +++ b/recipes/quickstart/finetuning/README.md @@ -27,8 +27,8 @@ It lets us specify the training settings for everything from `model_name` to `da ```python model_name: str="PATH/to/Model" tokenizer_name: str=None - enable_fsdp: bool=False - low_cpu_fsdp: bool=False + enable_fsdp: bool=False # shards model parameters, optimizer states and gradients across DDP ranks + low_cpu_fsdp: bool=False # saves cpu memory by loading pretrained model on rank0 only run_validation: bool=True batch_size_training: int=4 batching_strategy: str="packing" #alternative: padding @@ -42,14 +42,14 @@ It lets us specify the training settings for everything from `model_name` to `da num_workers_dataloader: int=1 lr: float=1e-4 weight_decay: float=0.0 - gamma: float= 0.85 + gamma: float= 0.85 # multiplicatively decay the learning rate by gamma after each epoch seed: int=42 use_fp16: bool=False mixed_precision: bool=True val_batch_size: int=1 dataset = "samsum_dataset" peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP) - use_peft: bool=False + use_peft: bool=False # use parameter efficient fine tuning from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint output_dir: str = "PATH/to/save/PEFT/model" freeze_layers: bool = False diff --git a/recipes/quickstart/finetuning/datasets/raft_dataset.py b/recipes/quickstart/finetuning/datasets/raft_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9341dd317c0869b30eecbc30bdfc4e26ab819bdf --- /dev/null +++ b/recipes/quickstart/finetuning/datasets/raft_dataset.py @@ -0,0 +1,97 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement. 
+ + +import copy +from datasets import load_dataset +import itertools + +# check system prompt token seq or user prompt token seq is in the current token list +def check_header(targets,seq): + for i in range(len(seq)-3): + if seq[i:i+3] in targets: + return True + return False +def replace_target(target,seq): + for i in range(len(seq)-3): + if seq[i:i+3] == target: + seq[i],seq[i+1],seq[i+2] = -100,-100,-100 + return seq +def tokenize_dialog(dialog, tokenizer): + # If vocab size is above 128000, use the chat template to generate the tokens as it is from Llama 3 family models + if tokenizer.vocab_size >= 128000: + dialog_tokens = tokenizer.apply_chat_template(dialog) + eot_indices = [i for i,n in enumerate(dialog_tokens) if n == 128009] + labels = copy.copy(dialog_tokens) + last_idx = 0 + # system prompt header "<|start_header_id|>system<|end_header_id|>" has been tokenized to [128006, 9125, 128007] + # user prompt header "<|start_header_id|>user<|end_header_id|>" has been tokenized to [128006, 882, 128007] + prompt_header_seqs = [[128006, 9125, 128007],[128006, 882, 128007]] + for n, idx in enumerate(eot_indices): + current_seq = labels[last_idx:idx+1] + if check_header(prompt_header_seqs,current_seq): + # found prompt header, indicating that this seq should be masked + labels[last_idx:idx+1] = [-100] * (idx-last_idx+1) + else: + last_idx = idx + # Lastly mask all the assistant header prompt <|start_header_id|>assistant<|end_header_id|>, which has been tokenized to [128006, 78191, 128007] + assistant_header_seq = [128006, 78191, 128007] + labels = replace_target(assistant_header_seq,labels) + dialog_tokens = [dialog_tokens] + labels_tokens = [labels] + else: + raise Exception("This raft_dataset only supports Llama 3 family models, please make sure the tokenizer is from Llama 3 family models.") + + combined_tokens = { + "input_ids": list(itertools.chain(*(t for t in dialog_tokens))), + "labels": list(itertools.chain(*(t for t in labels_tokens))), + } + + return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"])) +def raft_tokenize(q_a_pair, tokenizer): + end_tag = "</DOCUMENT>" + # find the last end_tag in the instruction, the rest is the question + try: + index =q_a_pair["instruction"].rindex(end_tag)+len(end_tag) + except ValueError: + print(q_a_pair["instruction"]) + raise Exception("The instruction does not contain the end tag <\/DOCUMENT>") + # all the lines after end_tag are the question + question = q_a_pair["instruction"][index:].strip() + # all the lines before end_tag are the context + documents = q_a_pair["instruction"][:index].strip() + # output is the label + answer = q_a_pair["output"] + system_prompt = "You are a helpful chatbot who can provide an answer to every questions from the user given a relevant context." + user_prompt = """ + Question: {question}\nContext: {context}\n + Answer this question using the information given by multiple documents in the context above. Here are the things to pay attention to: + - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>. + - First provide step-by-step reasoning on how to answer the question. + - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context. + - End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words. 
+ You MUST begin your final answer with the tag "<ANSWER>:". + """.format(question=question, context=documents) + + chat = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + {"role": "assistant", "content": answer} + ] + return tokenize_dialog(chat, tokenizer) + + +def get_custom_dataset(dataset_config, tokenizer, split, split_ratio=0.9): + # load_dataset will return DatasetDict that contains all the data in the train set + dataset_dict = load_dataset('json', data_files=dataset_config.data_path) + dataset = dataset_dict['train'] + dataset = dataset.train_test_split(test_size=1-split_ratio, shuffle=True, seed=42) + + dataset = dataset[split].map(lambda sample: { + "instruction": sample["instruction"], + "output": sample["cot_answer"], + }, + batched=True, + ) + dataset = dataset.map(lambda x: raft_tokenize(x, tokenizer)) + return dataset diff --git a/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb b/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb index 999a51837475dfe9b46bf34f6c6db0631988edd9..e26a10bd5ddfe1b677fc3b957cdf2cea0ec464b7 100644 --- a/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb +++ b/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb @@ -65,7 +65,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c7963d43806d432aaa3d00e2055e355c", + "model_id": "68838a4f42f84545912e95b339a31034", "version_major": 2, "version_minor": 0 }, @@ -75,13 +75,6 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" - ] } ], "source": [ @@ -101,6 +94,7 @@ "train_config.context_length = 1024 if torch.cuda.get_device_properties(0).total_memory < 16e9 else 2048 # T4 16GB or A10 24GB\n", "train_config.batching_strategy = \"packing\"\n", "train_config.output_dir = \"meta-llama-samsum\"\n", + "train_config.use_peft = True\n", "\n", "from transformers import BitsAndBytesConfig\n", "config = BitsAndBytesConfig(\n", @@ -205,7 +199,7 @@ "model_input = tokenizer(eval_prompt, return_tensors=\"pt\").to(\"cuda\")\n", "\n", "model.eval()\n", - "with torch.no_grad():\n", + "with torch.inference_mode():\n", " print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))" ] }, @@ -230,34 +224,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/datasets/load.py:1486: FutureWarning: The repository for samsum contains custom code which must be executed to correctly load the dataset. 
You can inspect the repository content at https://hf.co/datasets/samsum\n", - "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", - "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", - " warnings.warn(\n", - "Preprocessing dataset: 100%|██████████| 14732/14732 [00:02<00:00, 6124.69it/s]\n" + "/home/ubuntu/llama-recipes/src/llama_recipes/model_checkpointing/checkpoint_handler.py:17: DeprecationWarning: `torch.distributed._shard.checkpoint` will be deprecated, use `torch.distributed.checkpoint` instead\n", + " from torch.distributed._shard.checkpoint import (\n", + "Preprocessing dataset: 100%|██████████| 14732/14732 [00:02<00:00, 5872.02it/s]\n" ] } ], "source": [ "from llama_recipes.configs.datasets import samsum_dataset\n", - "from llama_recipes.data.concatenator import ConcatDataset\n", - "from llama_recipes.utils.config_utils import get_dataloader_kwargs\n", - "from llama_recipes.utils.dataset_utils import get_preprocessed_dataset\n", - "\n", - "train_dataset = get_preprocessed_dataset(tokenizer, samsum_dataset, 'train')\n", - "\n", - "train_dl_kwargs = get_dataloader_kwargs(train_config, train_dataset, tokenizer, \"train\")\n", + "from llama_recipes.utils.dataset_utils import get_dataloader\n", "\n", - "if train_config.batching_strategy == \"packing\":\n", - " train_dataset = ConcatDataset(train_dataset, chunk_size=train_config.context_length)\n", + "samsum_dataset.trust_remote_code = True\n", "\n", - "# Create DataLoaders for the training and validation dataset\n", - "train_dataloader = torch.utils.data.DataLoader(\n", - " train_dataset,\n", - " num_workers=train_config.num_workers_dataloader,\n", - " pin_memory=True,\n", - " **train_dl_kwargs,\n", - ")" + "train_dataloader = get_dataloader(tokenizer, samsum_dataset, train_config)\n", + "eval_dataloader = get_dataloader(tokenizer, samsum_dataset, train_config, \"val\")" ] }, { @@ -310,17 +290,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/torch/cuda/memory.py:330: FutureWarning: torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, which resets /all/ peak memory stats.\n", + "/home/ubuntu/llama-recipes/src/llama_recipes/utils/train_utils.py:92: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.\n", + " scaler = torch.cuda.amp.GradScaler()\n", + "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/torch/cuda/memory.py:343: FutureWarning: torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, which resets /all/ peak memory stats.\n", " warnings.warn(\n", "Training Epoch: 1: 0%|\u001b[34m \u001b[0m| 0/319 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. 
use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", - " warnings.warn(\n", + "/home/ubuntu/llama-recipes/src/llama_recipes/utils/train_utils.py:151: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", + " with autocast():\n", + "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py:600: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", + " return fn(*args, **kwargs)\n", "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/bitsandbytes/autograd/_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", - "Training Epoch: 1/1, step 1278/1279 completed (loss: 0.27870458364486694): : 320it [2:07:09, 23.84s/it] 3.94s/it] \n" + "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n", + " with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]\n", + "Training Epoch: 1/1, step 1278/1279 completed (loss: 0.28094857931137085): : 320it [2:08:50, 24.16s/it] 4.21s/it] \n" ] }, { @@ -332,7 +318,7 @@ "Peak active CUDA memory was 15 GB\n", "CUDA Malloc retries : 0\n", "CPU Total Peak Memory consumed during the train (max): 2 GB\n", - "Epoch 1: train_perplexity=1.3403, train_epoch_loss=0.2929, epoch time 7630.169942979002s\n" + "Epoch 1: train_perplexity=1.3404, train_epoch_loss=0.2930, epoch time 7730.981359725998s\n" ] } ], @@ -354,7 +340,7 @@ "results = train(\n", " model,\n", " train_dataloader,\n", - " None,\n", + " eval_dataloader,\n", " tokenizer,\n", " optimizer,\n", " scheduler,\n", @@ -380,16 +366,7 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ubuntu/miniconda3/envs/llama/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "model.save_pretrained(train_config.output_dir)" ] @@ -440,13 +417,13 @@ "A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))\n", "---\n", "Summary:\n", - "A wants to get a puppy for her son. She will take him to the animal shelter tomorrow. B is not sure if he can go with her, but he's willing to.\n" + "A wants to get a puppy for his son. A took him to the animal shelter last Monday and he showed A one he really liked. A wants to get him one of those little dogs. 
A and B agree that raising a dog is a tough issue.\n" ] } ], "source": [ "model.eval()\n", - "with torch.no_grad():\n", + "with torch.inference_mode():\n", " print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))\n" ] } @@ -467,7 +444,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.11.9" }, "vscode": { "interpreter": { diff --git a/recipes/quickstart/inference/local_inference/inference.py b/recipes/quickstart/inference/local_inference/inference.py index 22dd9345af090a302cef52939bf125dd94732a3f..bf2f824a9db9d91d8b456d50dd06d4b0b0be8875 100644 --- a/recipes/quickstart/inference/local_inference/inference.py +++ b/recipes/quickstart/inference/local_inference/inference.py @@ -89,7 +89,6 @@ def main( batch = tokenizer( user_prompt, - padding="max_length", truncation=True, max_length=max_padding_length, return_tensors="pt", diff --git a/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb b/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb index 5253dfb35f5c2742949ed1a0ce819bc87c96543d..6517d88c936336378e58278a4121bca6f811927a 100644 --- a/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb +++ b/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb @@ -15,7 +15,7 @@ "source": [ "# Llama Guard 3 Customization: Taxonomy Customization, Zero/Few-shot prompting, Evaluation and Fine Tuning \n", "\n", - "<a target=\"_blank\" href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_changes_and_fine_tuning.ipynb\">\n", + "<a target=\"_blank\" href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb\">\n", " <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n", "</a>\n", "\n", @@ -499,12 +499,12 @@ "\n", "def llm_eval(prompts: List[Tuple[List[str], AgentType]],\n", " model_id: str = \"meta-llama/Llama-Guard-3-8B\",\n", - " llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_2.name, \n", + " llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_3.name, \n", " load_in_8bit: bool = True, \n", " load_in_4bit: bool = False, \n", " logprobs: bool = False) -> Tuple[List[str], Optional[List[List[Tuple[int, float]]]]]:\n", " \"\"\"\n", - " Runs Llama Guard inference with HF transformers. Works with Llama Guard 1 or 2\n", + " Runs Llama Guard inference with HF transformers.\n", "\n", " This function loads Llama Guard from Hugging Face or a local model and \n", " executes the predefined prompts in the script to showcase how to do inference with Llama Guard.\n", @@ -515,9 +515,9 @@ " List of Tuples containing all the conversations to evaluate. The tuple contains a list of messages that configure a conversation and a role.\n", " model_id : str \n", " The ID of the pretrained model to use for generation. This can be either the path to a local folder containing the model files,\n", - " or the repository ID of a model hosted on the Hugging Face Hub. Defaults to 'meta-llama/Meta-Llama-Guard-2-8B'.\n", + " or the repository ID of a model hosted on the Hugging Face Hub. 
Defaults to 'meta-llama/Meta-Llama-Guard-3-8B'.\n", " llama_guard_version : LlamaGuardVersion\n", - " The version of the Llama Guard model to use for formatting prompts. Defaults to LLAMA_GUARD_2.\n", + " The version of the Llama Guard model to use for formatting prompts. Defaults to 3.\n", " load_in_8bit : bool\n", " defines if the model should be loaded in 8 bit. Uses BitsAndBytes. Default True \n", " load_in_4bit : bool\n", diff --git a/recipes/responsible_ai/prompt_guard/inference.py b/recipes/responsible_ai/prompt_guard/inference.py index 89001aa5a5113ce8d90e5be638faa2da17145dce..4e41dd4e00e4f0c87834311b042ad34a1de38cb6 100644 --- a/recipes/responsible_ai/prompt_guard/inference.py +++ b/recipes/responsible_ai/prompt_guard/inference.py @@ -31,7 +31,45 @@ def load_model_and_tokenizer(model_name='meta-llama/Prompt-Guard-86M'): return model, tokenizer -def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu'): +def preprocess_text_for_promptguard(text: str, tokenizer) -> str: + """ + Preprocess the text by removing spaces that break apart larger tokens. + This hotfixes a workaround to PromptGuard, where spaces can be inserted into a string + to allow the string to be classified as benign. + + Args: + text (str): The input text to preprocess. + tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model. + + Returns: + str: The preprocessed text. + """ + + try: + cleaned_text = '' + index_map = [] + for i, char in enumerate(text): + if not char.isspace(): + cleaned_text += char + index_map.append(i) + tokens = tokenizer.tokenize(cleaned_text) + result = [] + last_end = 0 + for token in tokens: + token_str = tokenizer.convert_tokens_to_string([token]) + start = cleaned_text.index(token_str, last_end) + end = start + len(token_str) + original_start = index_map[start] + if original_start > 0 and text[original_start - 1].isspace(): + result.append(' ') + result.append(token_str) + last_end = end + return ''.join(result) + except Exception: + return text + + +def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu', preprocess=True): """ Evaluate the model on the given text with temperature-adjusted softmax. Note, as this is a DeBERTa model, the input text should have a maximum length of 512. @@ -44,6 +82,8 @@ def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu Returns: torch.Tensor: The probability of each class adjusted by the temperature. """ + if preprocess: + text = preprocess_text_for_promptguard(text, tokenizer) # Encode the text inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512) inputs = inputs.to(device) @@ -57,7 +97,7 @@ def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu return probabilities -def get_jailbreak_score(model, tokenizer, text, temperature=1.0, device='cpu'): +def get_jailbreak_score(model, tokenizer, text, temperature=1.0, device='cpu', preprocess=True): """ Evaluate the probability that a given string contains malicious jailbreak or prompt injection. Appropriate for filtering dialogue between a user and an LLM. @@ -70,11 +110,11 @@ def get_jailbreak_score(model, tokenizer, text, temperature=1.0, device='cpu'): Returns: float: The probability of the text containing malicious content. 
""" - probabilities = get_class_probabilities(model, tokenizer, text, temperature, device) + probabilities = get_class_probabilities(model, tokenizer, text, temperature, device, preprocess) return probabilities[0, 2].item() -def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device='cpu'): +def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device='cpu', preprocess=True): """ Evaluate the probability that a given string contains any embedded instructions (malicious or benign). Appropriate for filtering third party inputs (e.g. web searches, tool outputs) into an LLM. @@ -87,11 +127,11 @@ def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device Returns: float: The combined probability of the text containing malicious or embedded instructions. """ - probabilities = get_class_probabilities(model, tokenizer, text, temperature, device) + probabilities = get_class_probabilities(model, tokenizer, text, temperature, device, preprocess) return (probabilities[0, 1] + probabilities[0, 2]).item() -def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'): +def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu', preprocess=True): """ Process a batch of texts and return their class probabilities. Args: @@ -104,6 +144,8 @@ def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'): Returns: torch.Tensor: A tensor containing the class probabilities for each text in the batch. """ + if preprocess: + texts = [preprocess_text_for_promptguard(text, tokenizer) for text in texts] inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512) inputs = inputs.to(device) with torch.no_grad(): @@ -113,7 +155,7 @@ def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'): return probabilities -def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0, device='cpu', max_batch_size=16): +def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0, device='cpu', max_batch_size=16, preprocess=True): """ Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel. Args: @@ -138,7 +180,7 @@ def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0 for i in range(0, len(all_chunks), max_batch_size): batch_chunks = all_chunks[i:i+max_batch_size] batch_indices = text_indices[i:i+max_batch_size] - probabilities = process_text_batch(model, tokenizer, batch_chunks, temperature, device) + probabilities = process_text_batch(model, tokenizer, batch_chunks, temperature, device, preprocess) scores = probabilities[:, score_indices].sum(dim=1).tolist() for idx, score in zip(batch_indices, scores): @@ -146,7 +188,7 @@ def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0 return all_scores -def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16): +def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16, preprocess=True): """ Compute jailbreak scores for a list of texts. Args: @@ -160,10 +202,10 @@ def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, dev Returns: list[float]: A list of jailbreak scores for each text. 
""" - return get_scores_for_texts(model, tokenizer, texts, [2], temperature, device, max_batch_size) + return get_scores_for_texts(model, tokenizer, texts, [2], temperature, device, max_batch_size, preprocess) -def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16): +def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16, preprocess=True): """ Compute indirect injection scores for a list of texts. Args: @@ -177,4 +219,4 @@ def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature Returns: list[float]: A list of indirect injection scores for each text. """ - return get_scores_for_texts(model, tokenizer, texts, [1, 2], temperature, device, max_batch_size) + return get_scores_for_texts(model, tokenizer, texts, [1, 2], temperature, device, max_batch_size, preprocess) diff --git a/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb b/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb index fa0013dd17c7735c29da56bac9222536da7a7593..dc070a2953a33332a2a4bfd557b8e62e22c0e2d3 100644 --- a/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb +++ b/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb @@ -789,7 +789,7 @@ "metadata": {}, "source": [ "\n", - "One good way to quickly obtain labeled training data for a use case is to use the original, non-fine tuned model itself to highlight risky examples to label, while drawing random negatives from below a score threshold. This helps address the class imbalance (attacks and risky prompts can be a very small percentage of all prompts) and includes false positive examples (which tend to be very valuable to train on) in the dataset. The use of synthetic data for specific " + "One good way to quickly obtain labeled training data for a use case is to use the original, non-fine tuned model itself to highlight risky examples to label, while drawing random negatives from below a score threshold. This helps address the class imbalance (attacks and risky prompts can be a very small percentage of all prompts) and includes false positive examples (which tend to be very valuable to train on) in the dataset. Generating synthetic fine-tuning data for specific use cases can also be an effective strategy." ] } ], diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/README.md b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/README.md new file mode 100644 index 0000000000000000000000000000000000000000..50356d50950bb92265abc54c0d056d9afa390f24 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/README.md @@ -0,0 +1,243 @@ + +## Chatbot Recipe: +As the popularity of our Meta Llama 3 models grows, we've seen a surge in demand to adapt them to specific domains, enabling businesses to better serve their customers. For example, a company might have a vast collection of plain text documents related to their custom domain and want to create a chatbot that can answer client questions. + +In response to this demand, we're exploring the possibility of building a Llama chatbot that can answer Llama-related questions using our Meta Llama 3 models. In this tutorial, we'll demonstrate how to do just that. While our Meta Llama 3 70B Instruct model is an excellent candidate, its production costs are relatively high. 
To reduce these costs, we'll focus on creating a Llama chatbot based on the Meta Llama 3 8B Instruct model, aiming to achieve similar accuracy while minimizing inference costs. + +One common approach to produce a model based on new domain data is **fine-tuning**. The idea is to start from a pre-trained model that already has some knowledge of language from its pre-training and adapt it to a new domain. However, a [recent paper](https://arxiv.org/pdf/2405.05904) highlights the risk of using supervised fine-tuning to update LLMs' knowledge, as it presents empirical evidence that acquiring new knowledge through fine-tuning is correlated with hallucinations w.r.t. preexisting knowledge. Fine-tuning can also be costly if the domain knowledge has to be updated frequently. + +Another solution is to use **RAG (Retrieval-Augmented Generation)**, which combines the strengths of traditional information retrieval systems (such as databases) with the capabilities of generative large language models (LLMs). RAG operates by first retrieving relevant information from a database using a query generated by the LLM. This retrieved information is then integrated into the LLM's query input, enabling it to generate more accurate and contextually relevant text. This helps to reduce LLM hallucination, since the relevant documents are provided to the LLM, and makes updating the domain knowledge cheaper. + +In this tutorial, we'll use **Retrieval Augmented Fine Tuning (RAFT)**, a technique that combines fine-tuning with RAG to better utilize custom domain text data. RAFT is a general recipe for fine-tuning a pre-trained Large Language Model (LLM) to a domain-specific RAG setting. It teaches the LLM to ignore retrieved documents that don’t help in answering the question. This approach can create a more factual model and reduce LLM hallucinations during inference. + +The process involves preparing training data with each data point containing: + +* A question (Q) +* A set of documents (D) +* A corresponding Chain-of-thought style answer (A*) generated from one of the documents (D*) + +RAFT teaches the model to differentiate between two types of documents: + +* Oracle documents (D*): documents from which the answer to the question can be deduced +* Distractor documents (Di): documents that do not contain answer-relevant information + +The following graph illustrates the main RAFT concepts: + + +For more information on RAFT, please refer to their [blog post](https://gorilla.cs.berkeley.edu/blogs/9_raft.html). + +## Fine-tuning Llama + +To build a Llama bot, we need to collect relevant text data. Ideally, we would include a vast range of Llama-related web documents, but for demo purposes, we'll focus on official documents. For example, we can use the raw text from official web pages listed in [Getting started with Meta Llama](https://llama.meta.com/get-started/), excluding the FAQ page since some evaluation questions will come from there. + +We have two options to obtain the text data: using a local folder or web crawling. For the local folder option, we can download the desired documents in PDF, Text, or Markdown format to the "data" folder specified in the [raft.yaml](./raft.yaml) file. LangChain's DirectoryLoader will load the files in that folder, but it may also ask us to install additional package dependencies if some file formats are not supported natively. 
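As a minimal sketch of the local-folder option described above (the real loading is driven by `raft.py` using the settings in [raft.yaml](./raft.yaml)), the snippet below shows roughly what DirectoryLoader does; the folder name and glob pattern are illustrative assumptions.

```python
# Minimal sketch of the local-folder option; folder name and glob are assumptions,
# and the actual loading logic lives in raft.py / raft.yaml.
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader("data", glob="**/*.md")  # PDF/Text formats may need extra dependencies
documents = loader.load()
print(f"Loaded {len(documents)} documents")
```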
+ +Alternatively, we can create a sitemap XML file, similar to the example below, and put the file path in the [raft.yaml](./raft.yaml) file, so eventually a Langchain SitemapLoader can retrieve all the text from the web pages. + +```xml +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>http://llama.meta.com/responsible-use-guide/</loc> + </url> + <!-- more URLs --> +</urlset> +``` + +## Create RAFT Dataset + +To create a RAFT dataset from the prepared documents, we can use the Meta Llama 3 70B Instruct model either through APIs from LLM cloud providers or by hosting a local VLLM server. + +For this example, we'll demonstrate how to create a VLLM OpenAI-compatible server that hosts Meta Llama 3 70B Instruct locally and generates the RAFT dataset. + +**Local Server Setup** + +First, ensure VLLM is installed. Then, run the following command to start the VLLM server: +```bash +CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server --model meta-Llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 2 --disable-log-requests --port 8001 +``` +**Note**: Make sure the port is available, and the server requires at least 135GB GPU memory, so we need to use multiple GPUs in a tensor parallel way. + +**Querying the Server** + +Once the server is ready, query it using the following command in another terminal: +```bash +python raft.py -u "http://localhost:8001/v1" -k "EMPTY" -t 4 +``` +If you prefer to use a cloud API, replace the endpoint URL with the cloud provider's URL and set the API key using the `-k` flag or environment variables. + +**RAFT Dataset Generation** + +The [raft.py](raft.py) script reads all documents from local or web sources, depending on the settings, and splits the data into text chunks of 1000 characters using RecursiveCharacterTextSplitter. + +Then, it applies the `question_prompt_template` defined in [raft.yaml](raft.yaml) to each chunk to generate queries to Meta Llama 3 70B model, and the model will generate a question list (By default 4 questions in that list) for each text chunk. For each question and corresponding text chunk, we generate a Chain-of-Thought (COT) style answer using Meta Llama 3 70B Instruct APIs. + +Once we have the COT answers, we can create a dataset where each sample contains an "instruction" section. This section includes some unrelated chunks called distractors (by default, we add 4 distractors). In the original RAFT method, there is an oracle probability P (by default, 80%) that a related document will be included. This means that there is a 1-P (by default, 20%) chance that no related documents are provided, and the RAFT model should still try to predict the COT answer label, as stated in the blog, "By removing the oracle documents in some instances of the training data, we are compelling the model to memorize domain-knowledge." + +**Modification to Add Refusal Examples** + +In this tutorial, we made an important modification by adding additional refusal examples (by default, this refusal probability is 5%). When the related documents are not presented, we set the COT answer label to "Sorry, I don't know the answer to this question because related documents are not found. Please try again." Our hypothesis is that this will increase answer precision and reduce chatbot hallucination. 
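To make the assembly logic above concrete, here is a simplified, hypothetical sketch of how a single training example could be put together with 4 distractors, an 80% oracle probability, and a 5% refusal probability. The function name, the data layout, and the exact way the two probabilities interact are assumptions for illustration only; the actual implementation lives in [raft.py](raft.py).

```python
# Simplified, illustrative sketch of RAFT example assembly; names and the exact
# interaction of the oracle/refusal probabilities are assumptions (see raft.py).
import random

REFUSAL_ANSWER = ("Sorry, I don't know the answer to this question because "
                  "related documents are not found. Please try again.")

def build_raft_example(question, cot_answer, oracle_doc, distractor_pool,
                       num_distractors=4, oracle_prob=0.8, refusal_prob=0.05):
    docs = random.sample(distractor_pool, num_distractors)
    if random.random() < refusal_prob:
        # refusal example: no oracle document, and the label becomes the refusal message
        answer = REFUSAL_ANSWER
    else:
        answer = cot_answer
        if random.random() < oracle_prob:
            # include the oracle document somewhere among the distractors
            docs.insert(random.randrange(num_distractors + 1), oracle_doc)
    context = "".join(f"<DOCUMENT> {d} </DOCUMENT>" for d in docs)
    return {"instruction": context + "\n" + question, "cot_answer": answer}
```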
In real-world production scenarios, we prefer that the chatbot refuses to answer when not enough context is provided, so that we can detect this refusal signal and mitigate the risk of producing wrong or misleading answers (e.g., we can ask a human agent to take over the conversation to better serve customers). + +**RAFT Format JSON Example** + +Here is a RAFT format JSON example from our saved `raft.jsonl` file: +```json +{ + "id":"seed_task_228", + "type":"general", + "question":"What is the context length supported by Llama 3 models?", + "context":{ + "sentences":[ + [ + "DISTRACT_DOCS 1" + "DISTRACT_DOCS 2" + "We hope that Code Llama will inspire others to leverage Llama 2 to create new innovative tools for research and commercial products. Download the model Explore more on Code Llama Discover more about Code Llama here \u2014 visit our resources, ranging from our research paper, getting started guide and more. Code Llama GitHub repository Research paper Download the model Getting started guide Meta Llama 3 Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Get Started Experience Llama 3 on Meta AI Experience Llama 3 with Meta AI We\u2019ve integrated Llama 3 into Meta AI, our intelligent assistant, that expands the ways people can get things done, create and connect with Meta AI. You can see first-hand the performance of Llama 3 by using Meta AI for coding tasks and problem solving. Whether you're developing agents, or other AI-powered applications, Llama 3 in both 8B and 70B will offer the capabilities and flexibility you need to develop your ideas. Experience Llama 3 on Meta AI Enhanced performance Experience the state-of-the-art performance of Llama 3, an openly accessible model that excels at language nuances, contextual understanding, and complex tasks like translation and dialogue generation. With enhanced scalability and performance, Llama 3 can handle multi-step tasks effortlessly, while our refined post-training processes significantly lower false refusal rates, improve response alignment, and boost diversity in model answers. Additionally, it drastically elevates capabilities like reasoning, code generation, and instruction following. Build the future of AI with Llama 3. Download Llama 3 Getting Started Guide With each Meta Llama request, you will receive: Meta Llama Guard 2 Getting started guide Responsible Use Guide Acceptable use policy Model card Community license agreement Benchmarks Llama 3 models take data and scale to new heights. It\u2019s been trained on our two recently announced custom-built 24K GPU clusters on over 15T token of data \u2013 a training dataset 7x larger than that used for Llama 2, including 4x more code. This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2. Model card Trust & safety A comprehensive approach to responsibility With the release of Llama 3, we\u2019ve updated the Responsible Use Guide (RUG) to provide the most comprehensive information on responsible development with LLMs. 
Our system-centric approach includes updates to our trust and safety tools with Llama Guard 2, optimized to support the newly announced taxonomy published by MLCommons expanding its coverage to a more comprehensive set of safety categories, Code Shield, and Cybersec Eval 2. In line with the principles outlined in our RUG , we recommend thorough checking and filtering of all inputs to and outputs from LLMs based on your unique content guidelines for your intended use case and audience. Meta Llama Guard 2 Explore more on Meta Llama 3 Introducing Meta Llama 3: The most capable openly available LLM to date Read the blog Meet Your New Assistant: Meta AI, Built With Llama 3 Learn more Meta Llama 3 repository View repository Model card Explore Meta Llama 3 License META LLAMA 3 COMMUNITY LICENSE AGREEMENT Meta Llama 3 Version Release Date: April 18, 2024 \u201c Agreement \u201d means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein. \u201c Documentation \u201d means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https:\/\/llama.meta.com\/get-started\/ .", + "DISTRACT_DOCS 3" + "DISTRACT_DOCS 4" + ] + ], + "title":[ + [ + "placeholder_title", + "placeholder_title", + "placeholder_title", + "placeholder_title", + "placeholder_title", + ] + ] + }, + "oracle_context":"We hope that Code Llama will inspire others to leverage Llama 2 to create new innovative tools for research and commercial products. Download the model Explore more on Code Llama Discover more about Code Llama here \u2014 visit our resources, ranging from our research paper, getting started guide and more. Code Llama GitHub repository Research paper Download the model Getting started guide Meta Llama 3 Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Get Started Experience Llama 3 on Meta AI Experience Llama 3 with Meta AI We\u2019ve integrated Llama 3 into Meta AI, our intelligent assistant, that expands the ways people can get things done, create and connect with Meta AI. You can see first-hand the performance of Llama 3 by using Meta AI for coding tasks and problem solving. Whether you're developing agents, or other AI-powered applications, Llama 3 in both 8B and 70B will offer the capabilities and flexibility you need to develop your ideas. Experience Llama 3 on Meta AI Enhanced performance Experience the state-of-the-art performance of Llama 3, an openly accessible model that excels at language nuances, contextual understanding, and complex tasks like translation and dialogue generation. With enhanced scalability and performance, Llama 3 can handle multi-step tasks effortlessly, while our refined post-training processes significantly lower false refusal rates, improve response alignment, and boost diversity in model answers. Additionally, it drastically elevates capabilities like reasoning, code generation, and instruction following. Build the future of AI with Llama 3. Download Llama 3 Getting Started Guide With each Meta Llama request, you will receive: Meta Llama Guard 2 Getting started guide Responsible Use Guide Acceptable use policy Model card Community license agreement Benchmarks Llama 3 models take data and scale to new heights. 
It\u2019s been trained on our two recently announced custom-built 24K GPU clusters on over 15T token of data \u2013 a training dataset 7x larger than that used for Llama 2, including 4x more code. This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2. Model card Trust & safety A comprehensive approach to responsibility With the release of Llama 3, we\u2019ve updated the Responsible Use Guide (RUG) to provide the most comprehensive information on responsible development with LLMs. Our system-centric approach includes updates to our trust and safety tools with Llama Guard 2, optimized to support the newly announced taxonomy published by MLCommons expanding its coverage to a more comprehensive set of safety categories, Code Shield, and Cybersec Eval 2. In line with the principles outlined in our RUG , we recommend thorough checking and filtering of all inputs to and outputs from LLMs based on your unique content guidelines for your intended use case and audience. Meta Llama Guard 2 Explore more on Meta Llama 3 Introducing Meta Llama 3: The most capable openly available LLM to date Read the blog Meet Your New Assistant: Meta AI, Built With Llama 3 Learn more Meta Llama 3 repository View repository Model card Explore Meta Llama 3 License META LLAMA 3 COMMUNITY LICENSE AGREEMENT Meta Llama 3 Version Release Date: April 18, 2024 \u201c Agreement \u201d means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein. \u201c Documentation \u201d means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https:\/\/llama.meta.com\/get-started\/ .", + "cot_answer":"Here's the step-by-step reasoning to answer the question:\n\n1. The question asks about the context length supported by Llama 3 models.\n2. In the context, we need to find the relevant information about Llama 3 models and their context length.\n3. The relevant sentence is: \"This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2.\"\n##begin_quote## This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2. ##end_quote##\n4. From this sentence, we can see that Llama 3 models support a context length of 8K.\n\n<ANSWER>: 8K", + "instruction":"<DOCUMENT> DISTRACT_DOCS 1 <\/DOCUMENT>...<DOCUMENT> DISTRACT_DOCS 4 <\/DOCUMENT>\nWhat is the context length supported by Llama 3 models?" +} +``` +As shown in the above example, we have a "question" section for the generated question, a "cot_answer" section for the generated COT answers (where the final answer will be added after the "<ANSWER>" token), and an "instruction" section that has all the documents included (each document split by `<DOCUMENT>` and `</DOCUMENT>` tags) and finally the generated question appended at the end. This "instruction" section will be the input during fine-tuning, and the "cot_answer" will be the output label that the loss will be calculated on. + +## Creating an Evaluation Set +To create a reliable evaluation set, it's ideal to use human-annotated question and answer pairs. This ensures that the questions are relevant and the answers are accurate. However, human annotation is time-consuming and costly. For demonstration purposes, we'll use a subset of the validation set, which will never be used in the fine-tuning. 
We only need to keep the "question" section and the final answer section, marked by the `<ANSWER>` tag in "cot_answer". We'll manually check each example and select only the good ones. We want to ensure that the questions are general enough to be used for web search engine queries and are related to Llama. We'll also use some QA pairs from our FAQ page, with modifications. This will result in 72 question and answer pairs as our evaluation set, saved as `eval_llama.json`. + +## Fine-Tuning Steps +Once the RAFT dataset is ready in JSON format, we can start fine-tuning. Unfortunately, the LoRA method didn't produce good results, so we'll use the full fine-tuning method. We can use the following commands as an example in the llama-recipes main folder: + +```bash +export PATH_TO_ROOT_FOLDER=./raft-8b +export PATH_TO_RAFT_JSON=recipes/use_cases/end2end-recipes/raft/output/raft.jsonl +torchrun --nnodes 1 --nproc_per_node 4 recipes/quickstart/finetuning/finetuning.py --enable_fsdp --lr 1e-5 --context_length 8192 --num_epochs 1 --batch_size_training 1 --model_name meta-Llama/Meta-Llama-3-8B-Instruct --dist_checkpoint_root_folder $PATH_TO_ROOT_FOLDER --dist_checkpoint_folder fine-tuned --use_fast_kernels --dataset "custom_dataset" --custom_dataset.test_split "test" --custom_dataset.file "recipes/finetuning/datasets/raft_dataset.py" --use-wandb --run_validation True --custom_dataset.data_path $PATH_TO_RAFT_JSON +``` + +For more details on multi-GPU fine-tuning, please refer to [multigpu_finetuning.md](../../../quickstart/finetuning/multigpu_finetuning.md) in the finetuning recipe. + +Next, we need to convert the FSDP checkpoint to a HuggingFace checkpoint using the following command: + +```bash +python src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py --fsdp_checkpoint_path "$PATH_TO_ROOT_FOLDER/fine-tuned-meta-Llama/Meta-Llama-3-8B-Instruct" --consolidated_model_path "$PATH_TO_ROOT_FOLDER" +``` + +For more details on FSDP to HuggingFace checkpoint conversion, please refer to the [readme](../../../quickstart/inference/local_inference/README.md) in the inference/local_inference recipe. + +## Evaluation Steps +Once we have the RAFT model, we need to evaluate its performance. In this tutorial, we'll not only use traditional evaluation methods (e.g., calculating the exact match rate or ROUGE score) but also use an LLM as a judge to score model-generated answers. + +We'll launch a VLLM server to host our converted model from `PATH_TO_ROOT_FOLDER`. To make things easier, we can rename the model folder to `raft-8b`. + +```bash +CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --model raft-8b --port 8000 --disable-log-requests +``` + +Similarly, if we want to get the 8B Instruct baseline, we can launch an 8B model VLLM server instead: + +```bash +CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --model meta-Llama/Meta-Llama-3-8B-Instruct --port 8000 --disable-log-requests +``` + +In another terminal, we can use another Meta Llama 3 70B Instruct model as a judge to compare the answers from the RAFT 8B model with the ground truth and get a score.
To do this, we need to host another Meta Llama 3 70B Instruct VLLM server locally with the following command, making sure the port is not already in use: +```bash +CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server --model meta-Llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 2 --disable-log-requests --port 8001 +``` + +Then, we can pass the ports to the eval script to evaluate our RAFT model once our `raft-8b` VLLM server is running: +```bash +CUDA_VISIBLE_DEVICES=4 python raft_eval.py -m raft-8b -u "http://localhost:8000/v1" -j "http://localhost:8001/v1" -r 5 +``` + +To evaluate the 8B baseline, we can use the following command once our 8B VLLM server is running: +```bash +CUDA_VISIBLE_DEVICES=4 python raft_eval.py -m meta-Llama/Meta-Llama-3-8B-Instruct -u "http://localhost:8000/v1" -j "http://localhost:8001/v1" -r 5 +``` + +**NOTE**: Please ensure that the `--model` name used when creating the VLLM server matches the `-m` argument passed to raft_eval.py. Otherwise, VLLM will raise a `model not found` error. By default, the RAFT model is called "raft-8b". Here, `-u` specifies the RAFT model endpoint URL, `-j` specifies the judge model endpoint URL, and `-r` defines how many top-k documents the RAG should retrieve. + +This [raft_eval.py](./raft_eval.py) script will load questions from the evaluation set, generate answers from the models and the models+RAG, and compare the generated answers with the ground truth to get the evaluation metrics, such as the ROUGE score and the LLM-as-judge score. It will then save those metrics and the evaluation details to eval logs. + +## Experiment Results + +**Overview** + +During our experiments, we encountered issues with using only the Llama website data, which consisted of 1980+ RAFT examples generated from 327K characters of text. We believed that this initial data was insufficient, so we created an additional PyTorch RAFT dataset using text from the official [Pytorch blogs](https://pytorch.org/blog/) and [Pytorch tutorials](https://pytorch.org/tutorials/). This new dataset contains 20K+ RAFT examples generated from 4.7 million characters. We combined both datasets to create an `all_data` dataset. We then fine-tuned the 8B model on each dataset separately for 1 epoch with a learning rate of 1e-5, resulting in three RAFT models: `llama_only`, `pytorch_only`, and `all_data`. + +**Evaluation on non-RAG baseline** + +First, we ran a non-RAG baseline, using just the Meta Llama 3 8B Instruct and Meta Llama 3 70B Instruct models, to see whether they could already answer some questions without any fine-tuning or external knowledge base. The LLM score, i.e., the percentage of answers marked correct by LLM_as_judge, is 47.9% for 8B and 59.2% for 70B. Clearly, some of this information was already learned by our Meta Llama 3 models during pretraining. + +**Evaluation on RAG baseline** + +Then we tested these 3 RAFT models with Langchain RAG, along with the Meta Llama 3 8B Instruct and Meta Llama 3 70B Instruct RAG baselines, using RAG document top-k retrieval values of 3, 5, and 7. We deployed a Meta Llama 3 70B Instruct model as the judge to score our model-generated answers against the ground truth in our evaluation set. The LLM scores are shown below: + + + + +Our results showed that the RAFT models performed similarly to the 8B RAG baseline, but noticeably worse than the 70B RAG baseline when context documents were limited (top_k <= 5). However, when top_k = 7, the RAFT models' performance increases sharply, with the `all_data` 8B model achieving a score of 76.06%, which beats the 70B baseline's 74.65%.
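+
+For reference, the LLM-judge scores reported above can be reproduced with a minimal sketch along the following lines. This mirrors the `compute_judge_score` logic in [raft_eval.py](./raft_eval.py); the short YES/NO judge prompt shown here is a simplified stand-in for the full template in `raft_eval_config.yaml`, and the endpoint URL assumes the judge VLLM server started earlier on port 8001:
+
+```python
+# Minimal LLM-as-judge scoring sketch (assumes a 70B judge served via an OpenAI-compatible VLLM endpoint).
+from langchain_openai import ChatOpenAI
+
+# Simplified stand-in for the judge prompt template in raft_eval_config.yaml.
+JUDGE_PROMPT = (
+    "Question: {question}\nTeacher's Answer: {gold}\nStudent's Answer: {prediction}\n"
+    "Respond with YES if the student's answer is correct based on the teacher's answer, otherwise NO."
+)
+
+def llm_judge_score(questions, predictions, references,
+                    api_url="http://localhost:8001/v1", key="EMPTY"):
+    judge = ChatOpenAI(
+        openai_api_key=key,
+        openai_api_base=api_url,
+        model_name="meta-llama/Meta-Llama-3-70B-Instruct",
+        temperature=0.0,
+        max_tokens=1000,
+    )
+    prompts = [
+        JUDGE_PROMPT.format(question=q, gold=g, prediction=p)
+        for q, p, g in zip(questions, predictions, references)
+    ]
+    # The judge is asked for a YES/NO verdict; the LLM score is the fraction marked YES.
+    verdicts = ["YES" in r.content for r in judge.batch(prompts)]
+    return sum(verdicts) / len(verdicts)
+```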
+ +**Refusal Examples** + +We also analyzed the number of refusal examples, where the model responded with "Sorry, I do not know." The `all_data` model was more cautious and tended to refuse to answer, whereas the `llama_only` RAFT model did not learn to refuse at all, likely due to the limited dataset size. + + + +**Precision Analysis** + +We calculated the precision of our model answers, which represents the likelihood of producing a correct answer when the model decides to respond. The formula used was $\frac{LLMScore}{1-\frac{numRefusal}{totalQA}}$. + + + +Note that the 8B and 70B RAG baselines never refused to answer, so their precision was equivalent to their LLM_score. Our `all_data` and `pytorch_only` models tended to refuse to answer when the provided documents were limited (top_k < 5), but when they did generate an answer, the likelihood of it being correct was higher. Specifically, when top_k = 7, the `all_data` RAFT model had an 82.97% likelihood of producing a correct answer when it decided to respond, outperforming the 70B baseline. + +**Example Comparisons** + +Here are some examples where our `all_data` RAFT model correctly answered questions that the 70B baseline failed to answer: + +``` +Comparing interested question: What tokenizer is used as the basis for the special tokens in Meta Llama +ground_truth: tiktoken +True all_data_RAG_answers: <ANSWER>: The tokenizer used as the basis for the special tokens in Meta Llama is tiktoken. +False 70B_RAG_answers: <ANSWER>: The tokenizer used as the basis for the special tokens in Meta Llama is SentencePiece. +``` + +``` +Comparing interested question: What is the license under which the Llama Guard model and its weights are released? +groud_truth: The license is the same as Llama 3, which can be found in the LICENSE file and is accompanied by the Acceptable Use Policy. +True all_data_RAG_answers: <ANSWER>: The license under which the Llama Guard model and its weights are released is the same as Llama 3, and the [LICENSE](../LICENSE) file contains more information about the license. +False 70B_RAG_answers: <ANSWER>: The Llama Guard model and its weights are licensed under the Llama 2 Community license. +``` + +**Key Takeaways** + +From our experiments, we learned: + +1. A few thousand RAFT examples are insufficient; at least 10K examples are recommended. +2. The LLM_as_judge is not always reliable; we noticed cases where answers were scored incorrectly. +3. The chunk_size for RAFT documents and RAG documents should be the same. +4. The RAFT method appears to help the LLM differentiate related documents from distractors rather than forcing it to memorize the training data, since we used PyTorch data as additional data to help our Llama chatbot answer Llama questions. More experiments are needed to understand this better. + +## Local Inference Steps + +Once we have evaluated and refined our RAFT model, we can deploy it locally and interact with it by asking questions manually. To do this, run the following command: + +```bash +python recipes/inference/local_inference/inference.py --model_name raft-8b +``` + +For more details, please check the [local_inference recipe](../../../quickstart/inference/local_inference/README.md). + +## Acknowledgement + +Finally, we would like to extend special thanks to Tianjun Zhang, the first author of the [RAFT paper](https://arxiv.org/pdf/2403.10131), for collaborating with us on this tutorial and providing valuable guidance throughout our experiments.
Our code is also partially inspired by the [RAFT section in Gorilla github](https://github.com/ShishirPatil/gorilla/tree/main/raft). diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/config.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/config.py new file mode 100644 index 0000000000000000000000000000000000000000..8b9115f7d1b41ab40ddc5b3a61e3c934e9042029 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/config.py @@ -0,0 +1,10 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. + +import yaml + +def load_config(config_path: str = "./config.yaml"): + # Read the YAML configuration file + with open(config_path, "r") as file: + config = yaml.safe_load(file) + return config diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/eval_llama.json b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/eval_llama.json new file mode 100644 index 0000000000000000000000000000000000000000..1fd66af9be4563dd3093cef5ef8c564a8015cef6 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/eval_llama.json @@ -0,0 +1,287 @@ +[ + { + "question":"What is the role of Llama2 70B in generating hard samples?", + "answer":" Llama2 70B generates hard samples by producing alternate policy descriptions that flip the label of existing samples." + }, + { + "question":"What is the purpose of quantization in machine learning?", + "answer":" The purpose of quantization in machine learning is to reduce computational and memory requirements, making models more efficient for deployment." + }, + { + "question":"What policy must your use of the Llama Materials adhere to, as specified in this Agreement?", + "answer":" The Acceptable Use Policy for the Llama Materials." + }, + { + "question":"How is perplexity calculated in the context of fine-tuning a language model?", + "answer":" Perplexity is calculated as an exponentiation of the loss value." + }, + { + "question":"How can the Memory API be used to enhance the conversational capabilities of an LLM?", + "answer":" The Memory API can be used to enhance the conversational capabilities of an LLM by saving conversation history and feeding it along with new questions to the LLM, enabling multi-turn natural conversation chat." + }, + { + "question":"What token is used to signify the end of a message in a turn?", + "answer":" <|eot_id|>" + }, + { + "question":"Where can I find more information about the research behind the Llama-2 model?", + "answer":" https:\/\/ai.meta.com\/research\/publications\/llama-2-open-foundation-and-fine-tuned-chat-models\/" + }, + { + "question":"What tokenizer is used as the basis for the special tokens in Meta Llama ", + "answer":" tiktoken" + }, + { + "question":"What does the model do with the probability of the first token to determine safety?", + "answer":" The model turns the probability of the first token into an \"unsafe\" class probability to determine safety." + }, + { + "question":"Are Meta user data included in the pretraining dataset?", + "answer":" No" + }, + { + "question":"What are the benefits of quantization in neural networks?", + "answer":" The benefits of quantization in neural networks are smaller model sizes, faster fine-tuning, and faster inference." + }, + { + "question":"How does the GPTQ algorithm quantize the weight matrix during post-training?", + "answer":" The GPTQ algorithm quantizes the weight matrix by quantizing each row independently during post-training." 
+ }, + { + "question":"What is the capability of large language models like Meta Llama in terms of following instructions?", + "answer":" They can follow instructions without having previously seen an example of a task." + }, + { + "question":"What trade-off do developers need to consider when deploying LLM systems, according to the Responsible Use Guide?", + "answer":" The trade-off is between model helpfulness and model alignment." + }, + { + "question":"What is the purpose of red-teaming in your organization?", + "answer":" The purpose of red-teaming is to enhance safety and performance." + }, + { + "question":"What is the purpose of the llama-recipes GitHub repo?", + "answer":" The purpose of the llama-recipes GitHub repo is to provide examples, demos, and guidance for using Llama models." + }, + { + "question":"What is the purpose of Meta's Responsible Use Guide for developers using Llama ", + "answer":" The purpose of Meta's Responsible Use Guide is to provide guidance to developers on how to build products powered by LLMs in a responsible manner." + }, + { + "question":"What should be defined to rate the results of the fine-tuned model?", + "answer":" A clear evaluation criteria." + }, + { + "question":"What steps did the developers take to mitigate safety risks in their instruction-tuned Llama model?", + "answer":" The developers took the following steps to mitigate safety risks in their instruction-tuned Llama model: conducting extensive red teaming exercises, performing adversarial evaluations, and implementing safety mitigations techniques." + }, + { + "question":"What behaviors are prohibited in the context of employment and economic benefits?", + "answer":" discrimination, other unlawful conduct, and harmful conduct" + }, + { + "question":"Are there any fees or royalties required to use the Llama Materials under this license?", + "answer":" No, there are no fees or royalties required to use the Llama Materials under this license." + }, + { + "question":"What is the precision in which LLM models can run without performance degradation using AWQ?", + "answer":" 4-bit" + }, + { + "question":"What type of professional practices are not allowed without proper authorization or licensure?", + "answer":" Financial, legal, medical\/health, or related professional practices." + }, + { + "question":"What is the F1 score of Llama Guard 2 when trained on the BeaverTails dataset?", + "answer":" 0.736" + }, + { + "question":"What is the recommended step for developers before deploying applications of Llama ", + "answer":" Perform safety testing and tuning tailored to their specific applications of the model." + }, + { + "question":"What is the license used for the Llama Guard model in the Purple Llama project?", + "answer":" Llama 2 Community License" + }, + { + "question":"What is the first step in developing downstream models responsibly according to the updated guide?", + "answer":" Defining content policies and mitigations." 
+ }, + { + "question":"What data type is used for weights initialized from a normal distribution in 4-bit models?", + "answer":" NF4 (Normal Float 4)" + }, + { + "question":"Where can I find examples of using Llama Guard in recipes?", + "answer":" https:\/\/github.com\/facebookresearch\/llama-recipes" + }, + { + "question":"What is the recommended model-parallel value for the 70B model?", + "answer":" 8" + }, + { + "question":"Where can you find more information about the Meta Llama 70B Model?", + "answer":" The model card," + }, + { + "question":"What percentage of the dataset typically makes up the test and validation sets when using a holdout method?", + "answer":" 10% - 30%," + }, + { + "question":"What are some hosting providers that support running Llama models?", + "answer":" OpenAI, Together AI, Anyscale, Replicate, Groq, etc." + }, + { + "question":"According to the Llama Guard paper, why is it challenging to compare model performance across different models?", + "answer":" Because each model is built on its own policy and performs better on an evaluation dataset with a policy aligned to the model." + }, + { + "question":"What is the advantage of having three partitions of data in the fine-tuning process?", + "answer":" The advantage is to get an unbiased evaluation of the model's performance." + }, + { + "question":"What is included in the Llama 2 model download?", + "answer":" Model code, Model weights, README, Responsible Use Guide, License, Acceptable use policy, Model card, and Technical specifications." + }, + { + "question":"What is the advantage of integrating with custom kernels?", + "answer":" The advantage of integrating with custom kernels is that it allows for support on specific devices." + }, + { + "question":"What is the purpose of the GPTQ algorithm implemented in the AutoGPTQ library?", + "answer":" The purpose of the GPTQ algorithm is post-training quantization." + }, + { + "question":"What advantage does AQLM take of when quantizing multiple weights together?", + "answer":" It takes advantage of interdependencies between the weights." + }, + { + "question":"What is the primary advantage of using lower precision data in resource-constrained environments?", + "answer":" Faster inference and fine-tuning." + }, + { + "question":"How can Meta Llama models be accessed on Microsoft Azure?", + "answer":" Meta Llama models can be accessed on Microsoft Azure through Models as a Service (MaaS) using Azure AI Studio and Model as a Platform (MaaP) using Azure Machine Learning Studio." + }, + { + "question":"What is the purpose of aligning Llama Guard 2 with the Proof of Concept MLCommons taxonomy?", + "answer":" The purpose of aligning Llama Guard 2 with the Proof of Concept MLCommons taxonomy is to drive adoption of industry standards and facilitate collaboration and transparency in the LLM safety and content evaluation space." + }, + { + "question":"What is the name of the repository that provides more examples of Llama recipes?", + "answer":" llama-recipes" + }, + { + "question":"How will I receive the signed URL after my request is approved?", + "answer":" over email" + }, + { + "question":"What is the purpose of the restriction on using Llama Materials?", + "answer":" To prevent the unauthorized use of Llama Materials to enhance competing language models." 
+ }, + { + "question":"What is the format of the prefix-suffix-middle method of infilling?", + "answer":" prefix-suffix-middle" + }, + + { + "question":"What is the license under which the Llama Guard model and its weights are released?", + "answer":" The license is the same as Llama 3, which can be found in the LICENSE file and is accompanied by the Acceptable Use Policy." + }, + { + "question":"How do I download the 4-bit quantized Meta Llama 3 8B chat model using Ollama?", + "answer":" To download the 4-bit quantized Meta Llama 3 8B chat model using Ollama, run the command \"ollama pull llama3\" in your terminal." + }, + { + "question":"How long are the download links for Llama valid for?", + "answer":" 24 hours" + }, + { + "question":"What is the primary purpose of the suite of tools provided?", + "answer":" To support the AI lifecycle, specifically tuning models with enterprise data." + }, + { + "question":"How does Llama Guard 2's classification performance compare to Llama Guard ", + "answer":" Llama Guard 2 has better classification performance than Llama Guard 1." + }, + { + "question":"What data type is used for computations in Quantization Aware Training despite mimicking int8 values?", + "answer":" floating point numbers" + }, + { + "question":"What is the purpose of providing specific examples in a prompt?", + "answer":" The purpose of providing specific examples in a prompt is to help the model better understand what kind of output is expected." + }, + { + "question":"Why is Meta not sharing the training datasets for Llama?", + "answer":"We believe developers will have plenty to work with as we release our model weights and starting code for pre-trained and conversational fine-tuned versions as well as responsible use resources. While data mixes are intentionally withheld for competitive reasons, all models have gone through Meta’s internal Privacy Review process to ensure responsible data usage in building our products. We are dedicated to the responsible and ethical development of our GenAI products, ensuring our policies reflect diverse contexts and meet evolving societal expectations." + }, + { + "question":"Did Meta use human annotators to develop the data for Llama models?", + "answer":"Yes. There are more details, for example, about our use of human annotators in the Llama 2 research paper." + }, + { + "question":"Can I use the output of the models to improve the Llama family of models, even though I cannot use them for other LLMs?", + "answer":"It's correct that the license restricts using any part of the Llama models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed." + }, + { + "question":"What operating systems (OS) are officially supported if I want to use Llama model?", + "answer":"For the core Llama GitHub repos (Llama and Llama3) Linux is the only OS currently supported by this repo. Additional OS support is available through the Llama-Recipes repo." + }, + { + "question":"Do Llama models provide traditional autoregressive text completion?", + "answer":"Llama models are auto-regressive language models, built on the transformer architecture. The core language models function by taking a sequence of words as input and predicting the next word, recursively generating text." 
+ }, + { + "question":"Do Llama models support logit biases as a request parameter to control token probabilities during sampling?", + "answer":"This is implementation dependent (i.e. the code used to run the model)." + }, + { + "question":"Do Llama models support adjusting sampling temperature or top-p threshold via request parameters?", + "answer":"The model itself supports these parameters, but whether they are exposed or not depends on implementation." + }, + { + "question":"What is llama-recipes?", + "answer":"The llama-recipes repository is a companion to the Meta Llama 3 models. The goal of this repository is to provide a scalable library for fine-tuning Meta Llama models, along with some example scripts and notebooks to quickly get started with using the models in a variety of use-cases, including fine-tuning for domain adaptation and building LLM-based applications with Meta Llama and other tools in the LLM ecosystem." + }, + { + "question":"What is the difference on the tokenization techniques that Meta Llama 3 uses compare Llama 2?", + "answer":"Llama 2 uses SentencePiece for tokenization, whereas Llama 3 has transitioned to OpenAI’s Tiktoken." + }, + { + "question":"How many tokens were used in Meta Llama 3 pretrain?", + "answer":"Meta Llama 3 is pretrained on over 15 trillion tokens that were all collected from publicly available sources." + }, + { + "question":"How many tokens were used in Llama 2 pretrain?", + "answer":"Llama 2 was pretrained on 2 trillion tokens of data from publicly available sources." + }, + { + "question":"What is the name of the license agreement that Meta Llama 3 is under?", + "answer":"Meta LLAMA 3 COMMUNITY LICENSE AGREEMENT." + }, + { + "question":"What is the name of the license agreement that Llama 2 is under?", + "answer":"LLAMA 2 COMMUNITY LICENSE AGREEMENT." + }, + { + "question":"What is the context length of Llama 2 models?", + "answer":"Llama 2's context is 4k" + }, + { + "question":"What is the context length of Meta Llama 3 models?", + "answer":"Meta Llama 3's context is 8k" + }, + { + "question":"When is Llama 2 trained?", + "answer":"Llama 2 was trained between January 2023 and July 2023." + }, + { + "question":"What is the name of the Llama 2 model that uses Grouped-Query Attention (GQA) ", + "answer":"Llama 2 70B" + }, + { + "question":"What are the names of the Meta Llama 3 model that use Grouped-Query Attention (GQA) ", + "answer":"Meta Llama 3 8B and Meta Llama 3 70B" + } + ] diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/format.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/format.py new file mode 100644 index 0000000000000000000000000000000000000000..c1bbfb45848db20cf4995e33857024d85fc47adb --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/format.py @@ -0,0 +1,174 @@ +# file copied from https://github.com/ShishirPatil/gorilla/blob/main/raft/format.py +from abc import ABC, abstractmethod +import argparse +from datasets import Dataset, load_dataset +from typing import Dict, Literal, Any, get_args + +""" +This file allows to convert raw HuggingFace Datasets into files suitable to fine tune completion and chat models. 
+""" + +OutputDatasetType = Literal["parquet", "jsonl"] +outputDatasetTypes = list(get_args(OutputDatasetType)) + +InputDatasetType = Literal["arrow", "jsonl"] +inputDatasetTypes = list(get_args(InputDatasetType)) + +DatasetFormat = Literal["hf", "completion", "chat"] +datasetFormats = list(get_args(DatasetFormat)) + +def get_args() -> argparse.Namespace: + """ + Parses and returns the arguments specified by the user's command + """ + parser = argparse.ArgumentParser() + + parser.add_argument("--input", type=str, required=True, help="Input HuggingFace dataset file") + parser.add_argument("--input-type", type=str, default="arrow", help="Format of the input dataset. Defaults to arrow.", choices=inputDatasetTypes) + parser.add_argument("--output", type=str, required=True, help="Output file") + parser.add_argument("--output-format", type=str, required=True, help="Format to convert the dataset to", choices=datasetFormats) + parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes) + parser.add_argument("--output-chat-system-prompt", type=str, help="The system prompt to use when the output format is chat") + + args = parser.parse_args() + return args + +class DatasetFormatter(ABC): + """ + Base class for dataset formatters. Formatters rename columns, remove and add + columns to match the expected target format structure. HF, Chat or Completion models file formats. + https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset + """ + @abstractmethod + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + pass + +class DatasetExporter(ABC): + """ + Base class for dataset exporters. Exporters export dataset to different file types, JSONL, Parquet, ... + """ + @abstractmethod + def export(self, ds: Dataset, output_path: str): + pass + +class DatasetConverter(): + """ + Entry point class. It resolves which DatasetFormatter and which DatasetExporter to use and runs them. + """ + formats: Dict[DatasetFormat, DatasetFormatter] + exporters: Dict[OutputDatasetType, Any] + + def __init__(self) -> None: + self.formats = { + "hf": HuggingFaceDatasetFormatter(), + "completion": OpenAiCompletionDatasetFormatter(), + "chat": OpenAiChatDatasetFormatter() + } + self.exporters = { + "parquet": ParquetDatasetExporter(), + "jsonl": JsonlDatasetExporter() + } + + def convert(self, ds: Dataset, format: DatasetFormat, output_path: str, output_type: OutputDatasetType, params: Dict[str, str]): + if not format in self.formats: + raise Exception(f"Output Format {format} is not supported, pleased select one of {self.formats.keys()}") + + if not output_type in self.exporters: + raise Exception(f"Output Type {output_type} is not supported, pleased select one of {self.exporters.keys()}") + + formatter = self.formats[format] + newds = formatter.format(ds, params) + exporter = self.exporters[output_type] + exporter.export(newds, output_path) + +class HuggingFaceDatasetFormatter(DatasetFormatter): + """ + Returns the HuggingFace Dataset as is + """ + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + return ds + +def _remove_all_columns_but(ds: Dataset, keep_columns) -> Dataset: + """ + HF Dataset doesn't have a way to copy only specific columns of a Dataset so this help + removes all columns but the ones specified. 
+ """ + remove_columns = list(ds.column_names) + for keep in keep_columns: + remove_columns.remove(keep) + ds = ds.remove_columns(remove_columns) + return ds + +class OpenAiCompletionDatasetFormatter(DatasetFormatter): + """ + Returns the Dataset in the OpenAI Completion Fine-tuning file format with two fields "prompt" and "completion". + https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset + """ + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + newds = ds.rename_columns({'question': 'prompt', 'cot_answer': 'completion'}) + return _remove_all_columns_but(newds, ['prompt', 'completion']) + +class OpenAiChatDatasetFormatter(OpenAiCompletionDatasetFormatter): + """ + Returns the Dataset in the OpenAI Chat Fine-tuning file format with one field "messages". + https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset + """ + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + newds = super().format(ds, params) + + def format_messages(row): + messages = [] + if 'system_prompt' in params: + system_prompt = params['system_prompt'] + messages.append({ "role": "system", "content": system_prompt}) + messages.extend([{ "role": "user", "content": row['prompt']}, { "role": "assistant", "content": row['completion']}]) + chat_row = {"messages": messages} + return chat_row + + newds = newds.map(format_messages) + return _remove_all_columns_but(newds, ['messages']) + +def append_extension(path: str, extension: str) -> str: + suffix = "." + extension + if not path.endswith(suffix): + path = path + suffix + return path + + +class JsonlDatasetExporter(DatasetExporter): + """ + Exports the Dataset to a JSONL file + """ + + def export(self, ds: Dataset, output_path: str): + ds.to_json(append_extension(output_path, "jsonl")) + + +class ParquetDatasetExporter(DatasetExporter): + """ + Exports the Dataset to a Parquet file + """ + + def export(self, ds: Dataset, output_path: str): + ds.to_parquet(append_extension(output_path, "parquet")) + + +def main(): + """ + When raft.py is executed from the command line. 
+ """ + args = get_args() + ds = load_dataset(args.input_type, data_files={"train": args.input})['train'] + formatter = DatasetConverter() + + if args.output_chat_system_prompt and args.output_format != "chat": + raise Exception("Parameter --output-chat-system-prompt can only be used with --output-format chat") + + format_params = {} + if args.output_chat_system_prompt: + format_params['system_prompt'] = args.output_chat_system_prompt + + formatter.convert(ds=ds, format=args.output_format, output_path=args.output, output_type=args.output_type, params=format_params) + +if __name__ == "__main__": + main() diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Answers_Precision.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Answers_Precision.png new file mode 100644 index 0000000000000000000000000000000000000000..e5d76e526b0d5ae4ba6e5f16774340fb37391819 Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Answers_Precision.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/LLM_score_comparison.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/LLM_score_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..84027b0daf0421f71af883333d582261e69cda5a Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/LLM_score_comparison.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Num_of_refusal_comparison.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Num_of_refusal_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..a860e5e078ec60727f5511aa343e1f3e9049e0bf Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Num_of_refusal_comparison.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/RAFT.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/RAFT.png new file mode 100644 index 0000000000000000000000000000000000000000..a2e56b56190f098d423232f246451921e2fcb23a Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/RAFT.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.py new file mode 100644 index 0000000000000000000000000000000000000000..a216f09eb79850b82ed2228b368eca7f6adfad27 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.py @@ -0,0 +1,89 @@ +import logging +import os +import argparse +from raft_utils import generate_questions, add_chunk_to_dataset +from format import DatasetConverter, datasetFormats, outputDatasetTypes +from config import load_config + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def main(api_config): + ds = None + try: + logging.info("Starting to generate question pair.") + # Generate questions as list for each chunk + chunk_questions_zip = generate_questions(api_config) + if not chunk_questions_zip: + logging.warning("No questions generated from text. Please check the api_config or model configuration.") + return + logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.") + ds = add_chunk_to_dataset(chunk_questions_zip,api_config) + ds.save_to_disk(args.output) + logging.info(f"Data successfully written to {api_config['output']}. 
Process completed.") + formatter = DatasetConverter() + + # Extract format specific params + format_params = {} + formatter.convert(ds=ds, format=args.output_format, output_path=args.output+"raft", output_type=args.output_type, params=format_params) + except Exception as e: + logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True) + +def parse_arguments(): + # Define command line arguments for the script + parser = argparse.ArgumentParser( + description="Generate RAFT question/answer/context pairs from documentation." + ) + parser.add_argument( + "-t", "--questions_per_chunk", + type=int, + default=4, + help="Specify the number of question pairs to generate per chunk." + ) + parser.add_argument( + "-m", "--model", + default="meta-llama/Meta-Llama-3-70B-Instruct", + help="Select the model to use for generation." + ) + parser.add_argument( + "-c", "--config_path", + default="./raft.yaml", + help="Set the configuration file path that has system prompt along with language, dataset path and number of questions." + ) + parser.add_argument( + "-u", "--endpoint_url", + default="http://localhost:8001/v1", + type=str, + help="LLM API url for generating question/answer pairs." + ) + parser.add_argument( + "-k", "--api_key", + default="EMPTY", + type=str, + help="LLM API key for generating question/answer pairs." + ) + parser.add_argument("--chunk_size", type=int, default=1000, help="The size of each chunk in number of tokens") + parser.add_argument("-o","--output", type=str, default="./output/", help="The path at which to save the dataset") + parser.add_argument("--output-format", type=str, default="hf", help="Format to convert the dataset to. Defaults to hf.", choices=datasetFormats) + parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes) + return parser.parse_args() + +if __name__ == "__main__": + logging.info("Initializing the process and loading configuration...") + args = parse_arguments() + + api_config = load_config(args.config_path) + api_config["questions_per_chunk"] = args.questions_per_chunk + api_config["model"] = args.model + api_config["chunk_size"] = args.chunk_size + api_config["endpoint_url"] = args.endpoint_url + api_config["output"] = args.output + api_config["api_key"] = args.api_key + # if OPENAI_API_KEY is defined in the system environment, use it as the API key + if os.environ.get('API_KEY') is not None: + api_config["api_key"] = os.environ["API_KEY"] + logging.info(f"Configuration loaded. 
Generating {args.questions_per_chunk} question per chunk using model '{args.model}'.") + logging.info(f"Chunk size: {args.chunk_size}.") + logging.info(f"num_distract_docs: {api_config['num_distract_docs']}, refusal_probability: {api_config['refusal_probability']}") + logging.info(f"Will use endpoint_url: {args.endpoint_url}.") + logging.info(f"Output will be written to {args.output}.") + main(api_config) diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.yaml b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b433a4a0fe3927b367bcc076bd988ba0dc31614b --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.yaml @@ -0,0 +1,51 @@ +COT_prompt_template: > + <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every questions from the user given a relevant context.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Question: {question}\nContext: {context}\n + Answer this question using the information given by multiple documents in the context above. Here are the things to pay attention to: + - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>. + - First provide step-by-step reasoning on how to answer the question. + - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context. + - End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words. + You MUST begin your final answer with the tag "<ANSWER>:". <|eot_id|><|start_header_id|>assistant<|end_header_id|> + +question_prompt_template: > + <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a synthetic question-answer pair generator. Given a chunk of context about + some topic(s), generate {num_questions} example questions a user could ask and would be answered + using information from the chunk. For example, if the given context was a Wikipedia + paragraph about the United States, an example question could be 'How many states are + in the United States? + Your questions should be formulated in the same style as questions that users could ask in a search engine. + This means that your questions MUST NOT mention something like "according to the passage" or "context". + The questions should be able to be answered in 60 words or less. Include only the questions in your response.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|> + +# question_prompt_template: > +# <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a language model skilled in creating quiz questions. +# You will be provided with a document, +# read it and please generate factoid question and answer pairs that are most likely be asked by a user of Llama language models +# which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1, Meta Llama Guard 2 +# Your factoid questions should be answerable with a specific, concise piece of factual information from the context. +# Your factoid questions should be formulated in the same style as questions users could ask in a search engine. +# This means that your factoid questions MUST NOT mention something like "according to the passage" or "context". 
+# please make sure you follow those rules: +# 1. Generate {num_questions} question answer pairs, you can generate less answer if there is nothing related to +# model, training, fine-tuning and evaluation details of Llama language models, +# 2. The questions can be answered based *solely* on the given passage. +# 3. Avoid asking questions with similar meaning. +# 4. Never use any abbreviation. +# 5. The questions should be able to be answered in 60 words or less. Include only the questions in your response. <|eot_id|> +# <|start_header_id|>user<|end_header_id|> +# Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|> +data_dir: "./data" + +xml_path: "" + +chunk_size: 1000 + +questions_per_chunk: 5 + +num_distract_docs: 4 # number of distracting documents to add to each chunk + +refusal_probability: 0.05 # probability of related documents to be added to each chunk diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..59dd649a62c57305e9a1c01fb592ca7edf521bc4 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval.py @@ -0,0 +1,336 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement. +import logging +import evaluate +import argparse +from config import load_config +import json +from langchain_openai import ChatOpenAI +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.vectorstores.utils import DistanceStrategy +from datetime import datetime +from langchain_community.document_loaders import DirectoryLoader +import re +import string +import pandas as pd + + +def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"): + # Use langchain to load the documents from data directory + # Load the RAFT model + + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=model_name, + temperature=0.0, + max_tokens=1000 + ) + + all_tasks = [api_config['eval_prompt_template'].format(question=question) for question in question_list] + generated_answers = llm.batch(all_tasks) + generated_answers = [ item.content for item in generated_answers] + if len(generated_answers) == 0: + logging.error("No model answers generated. 
Please check the input context or model configuration in ",model_name) + return [] + return clean_text_list(generated_answers) +def format_docs_raft(docs): + context = "" + for doc in docs: + context += "\n<DOCUMENT>" + str(doc.page_content) + "</DOCUMENT>\n" + return context +def build_retriever(api_config,embedding_model_name,retrieved_docs_num=5): + # Use langchain to load the documents from data directory + loader = DirectoryLoader(api_config['data_dir']) + docs = loader.load() + # Split the document into chunks with a specified chunk size + text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"],chunk_overlap=int(api_config["chunk_size"] / 10),separators= ["----------","\n\n", "\n", " ", ""],strip_whitespace=True) + docs_processed = text_splitter.split_documents(docs) + # Remove duplicates + unique_texts = {} + docs_processed_unique = [] + for doc in docs_processed: + if doc.page_content not in unique_texts: + unique_texts[doc.page_content] = True + docs_processed_unique.append(doc) + logging.info(f"Total number of docs_processed used by vectorstore: {len(docs_processed_unique)}") + # Store the document into a vector store with a specific embedding model + embedding_model = HuggingFaceEmbeddings( + model_name=embedding_model_name, + model_kwargs={"device": "cuda"}, + encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity + ) + vectorstore = FAISS.from_documents(docs_processed_unique, embedding_model, distance_strategy=DistanceStrategy.COSINE) + retriever = vectorstore.as_retriever( + search_kwargs={"k": retrieved_docs_num}, + ) + return retriever +def generate_answers_with_RAG(model_name, question_list,api_config,retriever,api_url_overwrite=None): + api_url = api_config['model_endpoint_url'] + if api_url_overwrite: + api_url = api_url_overwrite + key = api_config['api_key'] + # Load the RAFT model + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=model_name, + temperature=0.0, + max_tokens=1000 + ) + all_tasks = [] + for q in question_list: + # retrive the top K documents + retrieved_docs = retriever.invoke(q) + # format the documents into a string + documents = format_docs_raft(retrieved_docs) + # create a prompt + text = api_config["RAG_prompt_template"].format(context=documents,question=q) + all_tasks.append(text) + generated_answers = llm.batch(all_tasks) + generated_answers = [ item.content for item in generated_answers] + if len(generated_answers) == 0: + logging.error("No RAG answers generated. 
Please check the input context or model configuration in ",model_name) + return [] + return clean_text_list(generated_answers) +def compute_rouge_score(generated : list, reference: list): + rouge_score = evaluate.load('rouge') + return rouge_score.compute( + predictions=generated, + references=reference, + use_stemmer=True, + use_aggregator=True + ) +def clean_text_list(text_list): + result = [] + for text in text_list: + # for raft model, the answer will started with <ANSWER> + index = text.rfind("<ANSWER>") + if index!= -1: + text = text[index:] + text = text.replace("</ANSWER>:","") + text = text.replace("begin_quote","") + text = text.replace("end_quote","") + text = text.replace("##","") + text = text.strip() + result.append(text) + return result + +def normalize_answer(s): + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) +def exact_match_score(prediction, ground_truth): + """Computes EM score for a single prediction and ground truth answer.""" + num_match = 0 + assert len(prediction) == len(ground_truth), "Answer length does not match prediction length." + assert(len(ground_truth) > 0) + for idx, (pred,gold) in enumerate(zip(prediction, ground_truth)): + if (normalize_answer(pred) == normalize_answer(gold)): + num_match += 1 + return num_match/len(ground_truth) +def compute_judge_score(questions: list, generated : list, reference: list, api_config,api_url="http://localhost:8001/v1",key="EMPTY"): + correct_num = 0 + model_name = "meta-llama/Meta-Llama-3-70B-Instruct" + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=model_name, + max_tokens=1000, + temperature=0.0) + all_tasks = [] + for question,prediction,gold in zip(questions, generated,reference): + message = api_config['judge_prompt_template'].format(question=question,prediction=prediction,gold=gold) + all_tasks.append(message) + judge_responses = llm.batch(all_tasks) + judge_responses = ["YES" in item.content for item in judge_responses] + correct_num = sum(judge_responses) + return correct_num/len(questions),judge_responses +def score_single(api_config,generated,reference,questions, run_exact_match=True,run_rouge=True, run_llm_as_judge=True): + # set metric to default -1, means no metric is computed + metric = { + "Rouge_score": -1, + "LLM_judge_score": -1, + "Exact_match": -1 + } + if run_rouge: + rouge_score = compute_rouge_score(generated,reference) + metric["Rouge_score"] = rouge_score + print("Rouge_score:",rouge_score) + if api_config["judge_endpoint_url"] and run_llm_as_judge: + api_url = api_config["judge_endpoint_url"] + LLM_judge_score,judge_responses = compute_judge_score(questions, generated, reference, api_config,api_url=api_url) + metric["LLM_judge_score"] = LLM_judge_score + metric["LLM_judge_responses"] = judge_responses + print(f"LLM_judge_score: {LLM_judge_score}") + if run_exact_match: + exact_match = exact_match_score(generated,reference) + print(f"Exact_match_percentage: {exact_match:.4f}") + metric["Exact_match"] = exact_match + return metric +def main(api_config): + # Since the eval set is small, we can run the eval without async functions + try: + api_url = api_config["model_endpoint_url"] + logging.info("Starting to generate answer given the eval set.") + 
questions,groud_truth = [],[] + if api_config["eval_file"].endswith(".parquet"): + eval_file = pd.read_parquet(api_config["eval_file"],filters=[('source', '=', 'pt_discuss_forum')]) + for index, item in eval_file.iterrows(): + questions.append(item["question"]+"\nDetails:\n"+item["context"]) + groud_truth.append(item["answer"]) + else: + with open(api_config["eval_file"]) as fp: + eval_file = json.load(fp) + for index, item in enumerate(eval_file): + questions.append(item["question"]) + groud_truth.append(item["answer"]) + generated_answers = {} + # build retriver + retriever = build_retriever(api_config,"sentence-transformers/multi-qa-mpnet-base-cos-v1",api_config["rag_topk"]) + # Generate answers for 8B models + model_name = api_config["model_name"] + generated_answers[model_name] = generate_answers_model_only(model_name,questions,api_url) + generated_answers[model_name+"_RAG"] = generate_answers_with_RAG(model_name, questions,api_config,retriever) + print("Finished generating answers for ", model_name) + large_model_name = "meta-llama/Meta-Llama-3-70B-Instruct" + large_api_url = api_config["judge_endpoint_url"] + generated_answers["70B_Base"] = generate_answers_model_only(large_model_name,questions,large_api_url) + generated_answers["70B_RAG"] = generate_answers_with_RAG(large_model_name, questions,api_config,retriever,large_api_url) + print("Finished generating answers for ", large_model_name) + logging.info(f"Successfully generated {len(generated_answers[model_name+'_RAG'])} answers for all models.") + # for generate answer from each model, compute the score metric + all_metrics = [] + output_file = api_config["output_log"]+str(datetime.now().strftime("%Y%m%d_%H%M%S")) + + for model_name,model_answer in generated_answers.items(): + if len(model_answer) != len(groud_truth): + print(f"The length of {model_name} answer is not equal to the length of ground truth.") + continue + metric = score_single(api_config,model_answer,groud_truth,questions) + print(f"The eval result for {model_name} is: {metric}") + with open(output_file,"a") as fp: + fp.write(f"Eval_result for {model_name} \n") + fp.write(f"Rouge_score: {metric['Rouge_score']} \n") + fp.write(f"Exact_match_percentage: {metric['Exact_match']} \n") + judge_responses = ["None"] * len(questions) + if api_config["judge_endpoint_url"]: + fp.write(f"LLM_judge_score: {metric['LLM_judge_score']} \n") + judge_responses = metric["LLM_judge_responses"] + all_metrics.append((model_name,metric['LLM_judge_score'],metric["LLM_judge_responses"])) + fp.write(f"QA details: \n") + for item in zip(questions,model_answer,groud_truth,judge_responses): + fp.write(f"question: {item[0]} \n") + fp.write(f"generated_answers: {item[1]} \n") + fp.write(f"groud_truth: {item[2]} \n") + fp.write(f"LLM_judge_response: {item[3]} \n") + fp.write("\n") + fp.write("\n------------------------------------\n") + # Now we want to take a closer look at the questions that are not answered the same by all the models. 
+ judge_zip = list(zip(*[item[-1] for item in all_metrics])) + model_names = [item[0] for item in all_metrics] + with open(output_file,"a") as fp: + for item in all_metrics: + fp.write(f"Model_Name: {item[0]}, LLM_SCORE: {item[1]} \n") + for idx,item in enumerate(judge_zip): + # if all the responses are "YES", then we skip this question + if sum(item) == len(item): + continue + else: + fp.write(f"Comparing interested question: {questions[idx]} \n") + fp.write(f"groud_truth: {groud_truth[idx]} \n") + for i in range(len(model_names)): + fp.write(f"{item[i]} {model_names[i]}_answers: {generated_answers[model_names[i]][idx]} \n") + fp.write("------------------------\n") + fp.write(json.dumps(all_metrics)) + print("Finished evaluating the model.") + + + logging.info(f"Eval successfully, the eval result is saved to {api_config['output_log']}.") + # Saving the eval result to a log file + except Exception as e: + logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True) + +def parse_arguments(): + # Define command line arguments for the script + parser = argparse.ArgumentParser( + description="Generate question/answer pairs from documentation." + ) + parser.add_argument( + "-m", "--model_name", + default=None, + help="Provide the model_name to use for evaluation. If not specified, the model_path in eval_config.yaml will be used." + ) + parser.add_argument( + "-c", "--config_path", + default="raft_eval_config.yaml", + help="Set the configuration file path that has system prompt along with language, evalset path." + ) + parser.add_argument( + "-d", "--data_dir", + default=None, + help="Provide the data folder path to build RAG for evaluation. If not specified, the data_dir in eval_config.yaml will be used." + ) + parser.add_argument( + "-u", "--model_endpoint_url", + default="http://localhost:8000/v1", + type=str, + help="The raft model endpoint url for eval." + ) + parser.add_argument( + "-j", "--judge_endpoint_url", + default=None, + type=str, + help="The large model endpoint url for judge as LLM." + ) + parser.add_argument( + "-o", "--output_log", + default="./eval_result", + help="save the eval result to a log file. Default is eval_result[timestamp].log" + ) + parser.add_argument( + "-k", "--api_key", + default="EMPTY", + type=str, + help="LLM API key for generating question/answer pairs." + ) + parser.add_argument( + "-r", "--rag_topk", + default=5, + type=int, + help="set the number of top k documents the RAG needs to retrive." 
+    )
+    parser.add_argument("--chunk_size", type=int, default=1000, help="The character size of each chunk used in RAG")
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    logging.info("Initializing the process and loading configuration...")
+    args = parse_arguments()
+    api_config = load_config(args.config_path)
+    api_config["model_endpoint_url"] = args.model_endpoint_url
+    if args.data_dir:
+        api_config["data_dir"] = args.data_dir
+    if args.model_name:
+        api_config["model_name"] = args.model_name
+    api_config["judge_endpoint_url"] = args.judge_endpoint_url
+    api_config["output_log"] = args.output_log
+    api_config["api_key"] = args.api_key
+    api_config["chunk_size"] = args.chunk_size
+    api_config["rag_topk"] = args.rag_topk
+    if api_config["judge_endpoint_url"]:
+        logging.info(f"The judge model url is: '{args.judge_endpoint_url}'.")
+    main(api_config)
diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval_config.yaml b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9cd5baa765c09e0310faa8694b38fc489e2e80b7
--- /dev/null
+++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval_config.yaml
@@ -0,0 +1,37 @@
+eval_prompt_template: >
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an AI assistant skilled in answering questions related to Llama language models,
+  which include Llama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1 and Meta Llama Guard 2.
+  Below is a question from a Llama user; please answer it to the best of your knowledge.
+  The returned answer should be no more than 60 words. Please return the answer directly as text without any special tokens.<|eot_id|>
+  <|start_header_id|>user<|end_header_id|>
+  Question:{question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+judge_prompt_template: >
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|>You have been provided with a question, a teacher's answer and a student's answer below.
+  Given that question, you need to score how good the student's answer is compared to
+  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES, else return NO.
+  Here are the grading criteria to follow:
+  1. Review it carefully to make sure that the keywords and numerical values are exactly the same.
+  2. Ensure that the student answer does not contain any conflicting statements.
+  3. It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer.
+  YES means that the student's answer meets all of the criteria.
+  NO means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.
+  Only respond with "YES" or "NO", do not respond with anything else.<|eot_id|>
+  <|start_header_id|>user<|end_header_id|>
+  Question: {question} \n Teacher's Answer: {gold} \n Student's Answer: {prediction} <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+RAG_prompt_template: >
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every question from the user given a relevant context.<|eot_id|>
+  <|start_header_id|>user<|end_header_id|>
+  Question: {question}\nContext: {context}\n
+  Answer this question using the information given by multiple documents in the context above. Here are the things to pay attention to:
+  - The context contains many documents, each document starts with <DOCUMENT> and ends with </DOCUMENT>.
+  - First provide step-by-step reasoning on how to answer the question.
+  - In the reasoning, if you need to copy-paste some sentences from the context, include them between ##begin_quote## and ##end_quote##. This means that anything outside of ##begin_quote## and ##end_quote## is not directly copied from the context.
+  - End your response with the final answer in the form <ANSWER>: $answer; the answer should be less than 60 words.
+  You MUST begin your final answer with the tag "<ANSWER>:". <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+eval_file: "./eval_llama.json"
+
+model_name: "raft-8b"
+
+data_dir: "./data"
+
+rag_topk: 5
diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_utils.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..73ae187b70f45bd6aab7199b1ad40d71eea7b6b9
--- /dev/null
+++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_utils.py
@@ -0,0 +1,245 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import os
+import logging
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from datasets import Dataset
+import random
+from langchain_community.document_loaders import SitemapLoader,DirectoryLoader
+from bs4 import BeautifulSoup
+from langchain_openai import ChatOpenAI
+import copy
+
+
+# Initialize logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def strip_str(s: str) -> str:
+    """
+    Helper function for formatting strings returned by GPT-4.
+ """ + l, r = 0, len(s)-1 + beg_found = False + for i in range(len(s)): + if s[i].isalpha(): + if not beg_found: + l = i + beg_found = True + else: + r = i + r += 2 + return s[l:min(r, len(s))] +def clean_documents(raw_text): + all_lines = [] + for line in raw_text.split("\n"): + line = line.strip() + if len(line.split()) == 0: + continue + else: + all_lines.append(line) + result = " ".join(all_lines) + return result +def clean_text(content: BeautifulSoup) -> str: + # Find all 'nav' and 'header' elements in the BeautifulSoup object + nav_elements = content.find_all("nav") + header_elements = content.find_all("header") + mydivs = content.find_all("div", {"role": "list"}) + # Remove each 'nav' and 'header' element from the BeautifulSoup object + for element in nav_elements + header_elements+mydivs: + element.decompose() + raw_text = content.get_text("\n") + return clean_documents(raw_text) +# Read +def read_file_content(xml_path: str, data_folder: str) -> str: + if xml_path and data_folder: + logging.info(f"Error: both xml_path and data_folder are provided, will only read from xml for now") + if not xml_path and not data_folder: + logging.info(f"Error: both xml_path and data_folder are not provided") + return "" + if xml_path: + if not os.path.exists(xml_path): + logging.info(f"Error: {xml_path} does not exist") + return "" + # Use langchain to load the documents from webpage links in the xml file + sitemap_loader = SitemapLoader(web_path=xml_path,is_local=True,parsing_function=clean_text) + sitemap_loader.requests_kwargs = {"verify": False} + docs = sitemap_loader.load() + return docs + elif len(data_folder) != 0: + if not os.path.exists(data_folder): + logging.info(f"Error: {data_folder} does not exist") + return "" + # Use langchain to load the documents from data folder + loader = DirectoryLoader(data_folder) + docs = loader.load() + return docs + + + +def get_chunks( + docs: list, + chunk_size: int = 1000, + api_config: dict = None, +) -> list[str]: + """ + Takes in a list of documents, breaks them down into chunks of size + `chunk_size`, and returns the chunks. 
+ """ + chunks = [] + if len(docs) == 0: + raise TypeError("Can not get chunks from empty text") + else: + text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"],chunk_overlap=int(api_config["chunk_size"] / 10),separators= ["----------","\n\n", "\n", " "],strip_whitespace=True) + docs_processed = text_splitter.split_documents(docs) + logging.info(f"Total number of docs_processed: {len(docs_processed)}") + # Remove duplicates + unique_texts = {} + docs_processed_unique = [] + for doc in docs_processed: + if doc.page_content not in unique_texts and len(doc.page_content) > 100 : + unique_texts[doc.page_content] = True + docs_processed_unique.append(doc) + chunks = [chunk.page_content for chunk in docs_processed_unique] + logging.info(f"Total number of docs_processed_unique: {len(docs_processed_unique)}") + return chunks +# read all the files in the data folder, then split them into chunks +# generate questions for each chunk and return zip of chunk and related questions list +def generate_questions(api_config): + # get documents from the data folder or xml file + api_url = api_config["endpoint_url"] + key = api_config["api_key"] + documents = read_file_content(api_config["xml_path"],api_config["data_dir"]) + if len(documents) == 0: + logging.info(f"Error reading files, document_text is {len(documents)}") + document_batches = get_chunks(documents,api_config["chunk_size"],api_config) + # use OpenAI API protocol to hanlde the chat request, including local VLLM openai compatible server + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=api_config["model"], + temperature=0.0, + max_tokens=500 + ) + all_tasks = [api_config['question_prompt_template'].format(num_questions=str(api_config['questions_per_chunk']),context=document) for document in document_batches] + generated_answers = llm.batch(all_tasks) + generated_answers = [ item.content for item in generated_answers] + if len(generated_answers) == 0: + logging.error("No model answers generated. 
+        return []
+    final_result = []
+    for result in generated_answers:
+        queries = result.split('\n')
+        queries = [strip_str(q) for q in queries]
+        queries = [q for q in queries if any(c.isalpha() for c in q)]
+        if len(queries) > int(api_config['questions_per_chunk']):
+            # As the model may produce unrelated questions at the beginning of the result,
+            # truncate and keep only the last questions_per_chunk lines if there are more queries than questions_per_chunk
+            queries = queries[-int(api_config['questions_per_chunk']):]
+        final_result.append(queries)
+    return list(zip(document_batches,final_result))
+
+# Generate a COT answer for each question given the chunk context
+def generate_COT(chunk_questions_zip,api_config) -> list:
+    all_tasks = []
+    chunk_questions = []
+    question_asked = set()
+    for document_content,questions in chunk_questions_zip:
+        for question in questions:
+            question = question.strip()
+            # avoid asking the same question twice
+            if question not in question_asked:
+                question_asked.add(question)
+                prompt = api_config['COT_prompt_template'].format(question=question,context=str(document_content))
+                all_tasks.append(prompt)
+                chunk_questions.append((document_content,question))
+    # use the OpenAI API protocol to handle the chat request, including a local VLLM OpenAI-compatible server
+    llm = ChatOpenAI(
+        openai_api_key=api_config["api_key"],
+        openai_api_base=api_config["endpoint_url"],
+        model_name=api_config["model"],
+        temperature=0.0,
+        max_tokens=500
+    )
+    generated_answers = llm.batch(all_tasks)
+    generated_answers = [item.content for item in generated_answers]
+    COT_results = []
+    # return a list of (chunk, question, generated_answer)
+    for (chunk, question),generated_answer in zip(chunk_questions,generated_answers):
+        COT_results.append((chunk,question,generated_answer))
+    return COT_results
+
+def add_chunk_to_dataset(
+    chunk_questions_zip: list,
+    api_config: dict,
+) -> Dataset:
+    """
+    Given chunks and their related question lists, create {Q, A, D} triplets and return them as a Dataset.
+ """ + num_distract = api_config["num_distract_docs"] + p = api_config["refusal_probability"] + chunks = [chunk for chunk, _ in chunk_questions_zip] + COT_results = generate_COT(chunk_questions_zip,api_config) + logging.info(f"COT generation completed, total num of COT results: {len(COT_results)}") + completed,refusal= 0,0 + data_list = [] + for chunk, q , cot in COT_results: + # The COT answer will be used as the label in the fine-tuning stage + + datapt = { + "id": None, + "type": "general", + "question": q, + "context": None, + "oracle_context": None, + "cot_answer": cot + } + i = chunks.index(chunk) + datapt["id"] = f"seed_task_{len(data_list)}" + # add num_distract distractor docs + docs = [chunk] + indices = list(range(0, len(chunks))) + indices.remove(i) + for j in random.sample(indices, num_distract): + docs.append(chunks[j]) + doc_copy = docs.copy() + random.shuffle(docs) + d = { + "title": [], + "sentences": [] + } + + d["title"].append(["placeholder_title"]*(num_distract+1)) + d["sentences"].append(docs) + datapt["context"] = d + datapt["oracle_context"] = chunk + + # construct model instruction + context = "" + for doc in docs: + context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n" + context += q + # This instruction will be used in the fine-tuning stage + datapt["instruction"] = context + datapt_copy = copy.deepcopy(datapt) + # add to dataset + data_list.append(datapt) + # decides whether to add refusal example where the related documents are not provided + refusal = random.uniform(0, 1) <= p + if refusal: + doc_copy[0] = chunks[random.sample(indices, 1)[0]] + random.shuffle(doc_copy) + refusl_context = "" + for doc in doc_copy: + refusl_context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n" + refusl_context += q + # This instruction will be used in the fine-tuning stage + datapt_copy["id"] = f"refusal_task_{len(data_list)}" + datapt_copy["instruction"] = refusl_context + datapt_copy["cot_answer"] = "Sorry, I don't know the answer to this question because related documents are not found. Please try again." + data_list.append(datapt_copy) + refusal += 1 + completed += 1 + if completed % 100 == 0: + logging.info(f"refusal example added: {refusal}, total examples added: {completed}, total examples to be added: {len(COT_results)- completed}") + ds = Dataset.from_list(data_list) + return ds diff --git a/recipes/use_cases/multilingual/README.md b/recipes/use_cases/multilingual/README.md index 007eba089595575e7a745e44845e58355419117c..899c73fdb00596e5e65b795f2283d51d141a6c19 100644 --- a/recipes/use_cases/multilingual/README.md +++ b/recipes/use_cases/multilingual/README.md @@ -1,7 +1,8 @@ # Extending Llama to a new language Authored by : Sarvam team In this recipe, we will see how to add a new language to the Llama family of models. The steps are quite general and can be easily adapted to other models as well. Using this recipe, you should be able to replicate the findings of [OpenHathi](https://huggingface.co/sarvamai/OpenHathi-7B-Hi-v0.1-Base). -Please read more about OpenHathi [here](https://www.sarvam.ai/blog/announcing-openhathi-series) +Please read more about OpenHathi [here](https://web.archive.org/web/20240418103408/https://www.sarvam.ai/blog/announcing-openhathi-series) + ## Data The original OpenHathi model uses a combination of [Sangraha](https://huggingface.co/datasets/ai4bharat/sangraha) and Wikipedia as its primary data sources. If the reader is interested in using these sources, they would also have to preprocess the data: clean, filter, and deduplicate. 
See [Setu](https://github.com/AI4Bharat/setu) for an easy way to do this at scale. diff --git a/requirements.txt b/requirements.txt index d445c2a94b37d5a0e044b51d4b0626f3113cc2fc..b7ff89fcd9344dab072fe9cbb554d96a3ef28833 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ black[jupyter] datasets fire peft -transformers>=4.43.0 +transformers>=4.43.1 sentencepiece py7zr scipy @@ -19,4 +19,13 @@ chardet openai typing-extensions==4.8.0 tabulate +evaluate +rouge_score +pyyaml==6.0.1 +faiss-gpu; python_version < '3.11' +unstructured[pdf] +langchain_openai +langchain +langchain_community +sentence_transformers codeshield diff --git a/src/llama_recipes/configs/datasets.py b/src/llama_recipes/configs/datasets.py index 879140e77f4c407352d2f49d7f59f96b2e923f14..549a53935fb66101fcb0fa368791b7f9f1d05937 100644 --- a/src/llama_recipes/configs/datasets.py +++ b/src/llama_recipes/configs/datasets.py @@ -3,28 +3,28 @@ from dataclasses import dataclass - + @dataclass class samsum_dataset: dataset: str = "samsum_dataset" train_split: str = "train" test_split: str = "validation" - - + trust_remote_code: bool = False + + @dataclass class grammar_dataset: dataset: str = "grammar_dataset" - train_split: str = "src/llama_recipes/datasets/grammar_dataset/gtrain_10k.csv" + train_split: str = "src/llama_recipes/datasets/grammar_dataset/gtrain_10k.csv" test_split: str = "src/llama_recipes/datasets/grammar_dataset/grammar_validation.csv" - + @dataclass class alpaca_dataset: dataset: str = "alpaca_dataset" train_split: str = "train" test_split: str = "val" data_path: str = "src/llama_recipes/datasets/alpaca_data.json" - @dataclass class custom_dataset: @@ -32,6 +32,7 @@ class custom_dataset: file: str = "recipes/quickstart/finetuning/datasets/custom_dataset.py" train_split: str = "train" test_split: str = "validation" + data_path: str = "" @dataclass class llamaguard_toxicchat_dataset: diff --git a/src/llama_recipes/configs/fsdp.py b/src/llama_recipes/configs/fsdp.py index aec168c28862f54deffbabbc51d0105218c66959..0e6f6a67426d3f340d29f63bb4aa05c4558423d2 100644 --- a/src/llama_recipes/configs/fsdp.py +++ b/src/llama_recipes/configs/fsdp.py @@ -14,7 +14,7 @@ class fsdp_config: hsdp : bool =False # Require HYBRID_SHARD to be set. This flag can extend the HYBRID_SHARD by allowing sharding a model on customized number of GPUs (Sharding_group) and Replicas over Sharding_group. sharding_group_size : int=0 # requires hsdp to be set. This specifies the sharding group size, number of GPUs that you model can fit into to form a replica of a model. replica_group_size: int=0 #requires hsdp to be set. This specifies the replica group size, which is world_size/sharding_group_size. - checkpoint_type: StateDictType = StateDictType.SHARDED_STATE_DICT # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. + checkpoint_type: StateDictType = StateDictType.SHARDED_STATE_DICT # alternatively FULL_STATE_DICT can be used. SHARDED_STATE_DICT saves one file with sharded weights per rank while FULL_STATE_DICT will collect all weights on rank 0 and save them in a single file. 
fsdp_activation_checkpointing: bool=True fsdp_cpu_offload: bool=False pure_bf16: bool = False diff --git a/src/llama_recipes/configs/training.py b/src/llama_recipes/configs/training.py index 14d77f37ff100138e17e0d76e202f5b59dd774f1..acdbc890b0940947098a289e6fe54b273fd0fdfa 100644 --- a/src/llama_recipes/configs/training.py +++ b/src/llama_recipes/configs/training.py @@ -8,8 +8,8 @@ from dataclasses import dataclass class train_config: model_name: str="PATH/to/Model" tokenizer_name: str=None - enable_fsdp: bool=False - low_cpu_fsdp: bool=False + enable_fsdp: bool=False # shards model parameters, optimizer states and gradients across DDP ranks + low_cpu_fsdp: bool=False # saves cpu memory by loading pretrained model on rank0 only run_validation: bool=True batch_size_training: int=4 batching_strategy: str="packing" #alternative: padding @@ -23,14 +23,14 @@ class train_config: num_workers_dataloader: int=1 lr: float=1e-4 weight_decay: float=0.0 - gamma: float= 0.85 + gamma: float= 0.85 # multiplicatively decay the learning rate by gamma after each epoch seed: int=42 use_fp16: bool=False mixed_precision: bool=True val_batch_size: int=1 dataset = "samsum_dataset" peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP) - use_peft: bool=False + use_peft: bool=False # use parameter efficient fine tuning from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint output_dir: str = "PATH/to/save/PEFT/model" freeze_layers: bool = False diff --git a/src/llama_recipes/datasets/__init__.py b/src/llama_recipes/datasets/__init__.py index 3ed91caaea2e3d121662770f145e97e0e2c324b0..e7382aecbeb393027b8dd12fc0a649ac34b2829e 100644 --- a/src/llama_recipes/datasets/__init__.py +++ b/src/llama_recipes/datasets/__init__.py @@ -1,7 +1,18 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
+from functools import partial + from llama_recipes.datasets.grammar_dataset.grammar_dataset import get_dataset as get_grammar_dataset from llama_recipes.datasets.alpaca_dataset import InstructionDataset as get_alpaca_dataset +from llama_recipes.datasets.custom_dataset import get_custom_dataset from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset -from llama_recipes.datasets.toxicchat_dataset import get_llamaguard_toxicchat_dataset as get_llamaguard_toxicchat_dataset \ No newline at end of file +from llama_recipes.datasets.toxicchat_dataset import get_llamaguard_toxicchat_dataset as get_llamaguard_toxicchat_dataset + +DATASET_PREPROC = { + "alpaca_dataset": partial(get_alpaca_dataset), + "grammar_dataset": get_grammar_dataset, + "samsum_dataset": get_samsum_dataset, + "custom_dataset": get_custom_dataset, + "llamaguard_toxicchat_dataset": get_llamaguard_toxicchat_dataset, +} \ No newline at end of file diff --git a/src/llama_recipes/datasets/custom_dataset.py b/src/llama_recipes/datasets/custom_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4bcf0ed6cfa793a0ba1a62470be59388a4892d6a --- /dev/null +++ b/src/llama_recipes/datasets/custom_dataset.py @@ -0,0 +1,37 @@ +import importlib +from pathlib import Path + +def load_module_from_py_file(py_file: str) -> object: + """ + This method loads a module from a py file which is not in the Python path + """ + module_name = Path(py_file).name + loader = importlib.machinery.SourceFileLoader(module_name, py_file) + spec = importlib.util.spec_from_loader(module_name, loader) + module = importlib.util.module_from_spec(spec) + + loader.exec_module(module) + + return module + + +def get_custom_dataset(dataset_config, tokenizer, split: str): + if ":" in dataset_config.file: + module_path, func_name = dataset_config.file.split(":") + else: + module_path, func_name = dataset_config.file, "get_custom_dataset" + + if not module_path.endswith(".py"): + raise ValueError(f"Dataset file {module_path} is not a .py file.") + + module_path = Path(module_path) + if not module_path.is_file(): + raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") + + module = load_module_from_py_file(module_path.as_posix()) + try: + return getattr(module, func_name)(dataset_config, tokenizer, split) + except AttributeError as e: + print(f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).") + raise e + diff --git a/src/llama_recipes/datasets/samsum_dataset.py b/src/llama_recipes/datasets/samsum_dataset.py index fffde97b352d0eb3851f6fc393fab665089bc18f..c0f11f97655f68bc60debd397ff3eba9ad2dcfd5 100644 --- a/src/llama_recipes/datasets/samsum_dataset.py +++ b/src/llama_recipes/datasets/samsum_dataset.py @@ -8,7 +8,9 @@ import datasets def get_preprocessed_samsum(dataset_config, tokenizer, split): - dataset = datasets.load_dataset("samsum", split=split) + if not hasattr(dataset_config, "trust_remote_code") or not dataset_config.trust_remote_code: + raise ValueError("The repository for samsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/samsum. 
To activate `trust_remote_code` option use this config: --samsum_dataset.trust_remote_code=True") + dataset = datasets.load_dataset("samsum", split=split, trust_remote_code=dataset_config.trust_remote_code) prompt = ( f"Summarize this dialog:\n{{dialog}}\n---\nSummary:\n" diff --git a/src/llama_recipes/datasets/toxicchat_dataset.py b/src/llama_recipes/datasets/toxicchat_dataset.py index 3595cec4cbffe1db0c262a17b2019de8b2a6f696..eee54faf1f45ee9fd732984a447f31325cb8b831 100644 --- a/src/llama_recipes/datasets/toxicchat_dataset.py +++ b/src/llama_recipes/datasets/toxicchat_dataset.py @@ -1,7 +1,7 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. +# This software may be used and distributed according to the terms of the Llama 3.1 Community License Agreement. -# For dataset details visit: https://huggingface.co/datasets/samsum +# For dataset details visit: https://huggingface.co/datasets/lmsys/toxic-chat import copy import datasets diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py index 3bcaeea74c928a8467f0df803b5dfe52bbf09bf5..9de75a4134775b30a1307fc57ddb30248d2acbf1 100644 --- a/src/llama_recipes/finetuning.py +++ b/src/llama_recipes/finetuning.py @@ -43,6 +43,7 @@ from llama_recipes.utils.config_utils import ( generate_peft_config, generate_dataset_config, get_dataloader_kwargs, + check_fsdp_config, ) from llama_recipes.utils.dataset_utils import get_preprocessed_dataset @@ -181,6 +182,8 @@ def main(**kwargs): #setting up FSDP if enable_fsdp is enabled if train_config.enable_fsdp: + check_fsdp_config(fsdp_config) + if not train_config.use_peft and train_config.freeze_layers: freeze_transformer_layers(model, train_config.num_freeze_layers) diff --git a/src/llama_recipes/model_checkpointing/__init__.py b/src/llama_recipes/model_checkpointing/__init__.py index 9474f78ccaf3d0d84ebf921dacd6f2ca42aaa4e8..8116ba9047b4bcfe37a8fb55a4feb87d888f833e 100644 --- a/src/llama_recipes/model_checkpointing/__init__.py +++ b/src/llama_recipes/model_checkpointing/__init__.py @@ -3,6 +3,8 @@ from llama_recipes.model_checkpointing.checkpoint_handler import ( load_model_checkpoint, + save_fsdp_model_checkpoint_full, + save_peft_checkpoint, save_model_checkpoint, load_optimizer_checkpoint, save_optimizer_checkpoint, diff --git a/src/llama_recipes/model_checkpointing/checkpoint_handler.py b/src/llama_recipes/model_checkpointing/checkpoint_handler.py index b097df97daf75b3bc9ed57d89d02619e0dfecd4f..933c289084206b5c56623a9fc2622812c338037d 100644 --- a/src/llama_recipes/model_checkpointing/checkpoint_handler.py +++ b/src/llama_recipes/model_checkpointing/checkpoint_handler.py @@ -26,6 +26,7 @@ from torch.distributed.checkpoint.default_planner import ( ) +from torch.distributed.checkpoint.state_dict import get_model_state_dict, StateDictOptions from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType import torch.distributed._shard.checkpoint as dist_cp import torch.distributed as dist @@ -122,7 +123,7 @@ def save_model_and_optimizer_sharded(model, rank, cfg,optim=None): print( f"Checkpoint Time = {t1-t0:.4f}\n" ) -def save_model_checkpoint( +def save_fsdp_model_checkpoint_full( model, optimizer, rank, @@ -151,7 +152,7 @@ def save_model_checkpoint( ) save_dir = Path.cwd() / folder_name save_dir.mkdir(parents=True, exist_ok=True) - save_name = cfg.model_name + "-" + str(epoch) + ".pt" + save_name = cfg.model_name.replace("/","--") + "-" + str(epoch) + ".pt" 
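+        # Build the full save path from the sanitized model name ("/" replaced with "--") and the epoch number.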
save_full_path = str(save_dir) + "/" + save_name # save model @@ -264,4 +265,26 @@ def load_sharded_model_single_gpu(model,model_path): model.load_state_dict(state_dict["model"]) print(f"Sharded state checkpoint loaded from {model_path}") - return model \ No newline at end of file + return model + +def save_peft_checkpoint(model, model_path): + """save_pretrained peft model""" + + options = StateDictOptions(full_state_dict=True, cpu_offload=True) + + if isinstance(model, FSDP): + state_dict = get_model_state_dict(model, options=options) + model.save_pretrained(model_path, state_dict=state_dict) + else: + model.save_pretrained(model_path) + + +def save_model_checkpoint(model, output_dir): + """save model when not peft and on single device""" + + output_file = Path(output_dir) / "model.pt" + + state_dict = model.state_dict() + + torch.save(state_dict, output_file) + diff --git a/src/llama_recipes/utils/config_utils.py b/src/llama_recipes/utils/config_utils.py index 8d21adf7fd4d917294f401b42ba4884dbafa3eba..cb8936de7ffdfa654c0adb18dda8e1030f4c431c 100644 --- a/src/llama_recipes/utils/config_utils.py +++ b/src/llama_recipes/utils/config_utils.py @@ -5,6 +5,7 @@ import inspect from dataclasses import asdict import torch.distributed as dist +from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType from torch.utils.data import DistributedSampler from peft import ( LoraConfig, @@ -108,3 +109,18 @@ def get_dataloader_kwargs(train_config, dataset, tokenizer, mode,collate_fn=None else: raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}") return kwargs + + +def check_fsdp_config(fsdp_config): + VALID_TYPES = (StateDictType.SHARDED_STATE_DICT, StateDictType.FULL_STATE_DICT) + if isinstance(fsdp_config.checkpoint_type, str): + str_to_obj = { + "StateDictType.SHARDED_STATE_DICT": StateDictType.SHARDED_STATE_DICT, + "StateDictType.FULL_STATE_DICT": StateDictType.FULL_STATE_DICT, + } + if fsdp_config.checkpoint_type in str_to_obj: + fsdp_config.checkpoint_type = str_to_obj[fsdp_config.checkpoint_type] + + if not fsdp_config.checkpoint_type in VALID_TYPES: + raise ValueError(f"Invalid checkpoint_type {fsdp_config.checkpoint_type}") + \ No newline at end of file diff --git a/src/llama_recipes/utils/dataset_utils.py b/src/llama_recipes/utils/dataset_utils.py index 39c9b7d7b62b72a70ce9110b0d0166845e3d9c49..704db8ac1298e68885d0369acc5f80728079eb43 100644 --- a/src/llama_recipes/utils/dataset_utils.py +++ b/src/llama_recipes/utils/dataset_utils.py @@ -1,63 +1,11 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
-import importlib -from functools import partial -from pathlib import Path - import torch -from llama_recipes.datasets import ( - get_grammar_dataset, - get_alpaca_dataset, - get_samsum_dataset, - get_llamaguard_toxicchat_dataset, -) - - -def load_module_from_py_file(py_file: str) -> object: - """ - This method loads a module from a py file which is not in the Python path - """ - module_name = Path(py_file).name - loader = importlib.machinery.SourceFileLoader(module_name, py_file) - spec = importlib.util.spec_from_loader(module_name, loader) - module = importlib.util.module_from_spec(spec) - - loader.exec_module(module) - - return module - - -def get_custom_dataset(dataset_config, tokenizer, split: str): - if ":" in dataset_config.file: - module_path, func_name = dataset_config.file.split(":") - else: - module_path, func_name = dataset_config.file, "get_custom_dataset" - - if not module_path.endswith(".py"): - raise ValueError(f"Dataset file {module_path} is not a .py file.") - - module_path = Path(module_path) - if not module_path.is_file(): - raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") - - module = load_module_from_py_file(module_path.as_posix()) - try: - return getattr(module, func_name)(dataset_config, tokenizer, split) - except AttributeError as e: - print(f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).") - raise e - - -DATASET_PREPROC = { - "alpaca_dataset": partial(get_alpaca_dataset), - "grammar_dataset": get_grammar_dataset, - "samsum_dataset": get_samsum_dataset, - "custom_dataset": get_custom_dataset, - "llamaguard_toxicchat_dataset": get_llamaguard_toxicchat_dataset, - -} +from llama_recipes.data.concatenator import ConcatDataset +from llama_recipes.datasets import DATASET_PREPROC, get_custom_dataset +from llama_recipes.utils.config_utils import get_dataloader_kwargs def get_preprocessed_dataset( @@ -78,3 +26,21 @@ def get_preprocessed_dataset( tokenizer, get_split(), ) + + +def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"): + dataset = get_preprocessed_dataset(tokenizer, dataset_config, split) + dl_kwargs = get_dataloader_kwargs(train_config, dataset, tokenizer, split) + + if split == "train" and train_config.batching_strategy == "packing": + dataset = ConcatDataset(dataset, chunk_size=train_config.context_length) + + # Create data loader + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=train_config.num_workers_dataloader, + pin_memory=True, + **dl_kwargs, + ) + return dataloader + \ No newline at end of file diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py index b81661adfff18fa7428e292002dcd6d92d8e2e38..bbbfe96a88fb084ec7458f2eb4a98fbeb9eb7c21 100644 --- a/src/llama_recipes/utils/train_utils.py +++ b/src/llama_recipes/utils/train_utils.py @@ -20,7 +20,7 @@ from transformers import LlamaTokenizer import json -from llama_recipes.model_checkpointing import save_model_checkpoint, save_model_and_optimizer_sharded, save_optimizer_checkpoint +from llama_recipes.model_checkpointing import save_fsdp_model_checkpoint_full, save_model_and_optimizer_sharded, save_optimizer_checkpoint, save_peft_checkpoint, save_model_checkpoint from llama_recipes.policies import fpSixteen,bfSixteen, get_llama_wrapper from llama_recipes.utils.memory_utils import MemoryTrace from accelerate.utils import is_xpu_available, is_ccl_available @@ -244,7 +244,7 @@ def train(model, 
train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche print(f"we are about to save the PEFT modules") else: print(f"we are about to save the PEFT modules") - model.save_pretrained(train_config.output_dir) + save_peft_checkpoint(model, train_config.output_dir) if train_config.enable_fsdp: if rank==0: print(f"PEFT modules are saved in {train_config.output_dir} directory") @@ -252,27 +252,35 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche print(f"PEFT modules are saved in {train_config.output_dir} directory") else: - if not train_config.use_peft and fsdp_config and fsdp_config.checkpoint_type == StateDictType.FULL_STATE_DICT: - - save_model_checkpoint( + if not train_config.enable_fsdp: + save_model_checkpoint(model, train_config.output_dir) + + elif fsdp_config.checkpoint_type == StateDictType.FULL_STATE_DICT: + print(" Saving the FSDP model checkpoint using FULL_STATE_DICT") + print("=====================================================") + save_fsdp_model_checkpoint_full( model, optimizer, rank, train_config, epoch=epoch ) - elif not train_config.use_peft and fsdp_config and fsdp_config.checkpoint_type == StateDictType.SHARDED_STATE_DICT: - print(" Saving the FSDP model checkpoints using SHARDED_STATE_DICT") - print("=====================================================") + + if train_config.save_optimizer: + print(" Saving the FSDP optimizer using FULL_STATE_DICT") + print("=====================================================") + save_optimizer_checkpoint( + model, optimizer, rank, train_config, epoch=epoch + ) + + elif fsdp_config.checkpoint_type == StateDictType.SHARDED_STATE_DICT: - save_model_and_optimizer_sharded(model, rank, train_config) if train_config.save_optimizer: + print(" Saving the FSDP model checkpoints using SHARDED_STATE_DICT") + print("=====================================================") save_model_and_optimizer_sharded(model, rank, train_config, optim=optimizer) + else: print(" Saving the FSDP model checkpoints and optimizer using SHARDED_STATE_DICT") print("=====================================================") + save_model_and_optimizer_sharded(model, rank, train_config) - if not train_config.use_peft and train_config.save_optimizer: - save_optimizer_checkpoint( - model, optimizer, rank, train_config, epoch=epoch - ) - print(" Saving the FSDP model checkpoints and optimizer using FULL_STATE_DICT") - print("=====================================================") + if train_config.enable_fsdp: dist.barrier() checkpoint_end_time = time.perf_counter() - checkpoint_start_time diff --git a/tools/benchmarks/llm_eval_harness/README.md b/tools/benchmarks/llm_eval_harness/README.md index 38606770f406ae8705d2a32a1ae292203c23b200..409dbae5ea80e44b3e894cf94eac32e3962eb25c 100644 --- a/tools/benchmarks/llm_eval_harness/README.md +++ b/tools/benchmarks/llm_eval_harness/README.md @@ -62,6 +62,10 @@ There has been an study from [IBM on efficient benchmarking of LLMs](https://arx python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3.1-8B,dtype="float",peft=../peft_output --tasks hellaswag --num_fewshot 10 --device cuda:0 --batch_size 8 --limit 100 ``` +### Reproducing Meta 3.1 Evaluation Metrics Using LM-Evaluation-Harness + +[meta_eval_reproduce](./meta_eval_reproduce/) folder provides a detailed guide on how to reproduce the Meta Llama 3.1 evaluation metrics reported in our [Meta Llama website](https://llama.meta.com/) using the 
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) and our [3.1 evals Huggingface collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f). By following the steps outlined, users can replicate an evaluation process that is similar to Meta's for specific tasks and compare their results with our reported metrics. While slight variations in results are expected due to differences in implementation and model behavior, we aim to provide a transparent and reproducible method for evaluating Meta Llama 3 models using a third-party library. Please check the [README.md](./meta_eval_reproduce/README.md) for more details.
+
 ### Reproducing Hugging Face Open-LLM-Leaderboard
 Here, we provided a list of tasks from `Open-LLM-Leaderboard` which can be used by passing `--open-llm-leaderboard-tasks` instead of `tasks` to the `eval.py`.
diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7cb15e9cef2050986d58d5a0c5f69a56e01ea693
--- /dev/null
+++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md
@@ -0,0 +1,213 @@
+
+# Reproducing Meta 3.1 Evaluation Metrics Using LM-Evaluation-Harness
+
+As Meta Llama models gain popularity, evaluating these models has become increasingly important. We have released all the evaluation details for Meta-Llama 3.1 models as datasets in the [3.1 evals Hugging Face collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f). This recipe demonstrates how to closely reproduce the Llama 3.1 reported benchmark numbers using the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) library and our prompts from the 3.1 evals datasets on selected tasks.
+
+## Disclaimer
+
+
+1. **This recipe is not the official implementation** of Meta Llama evaluation. It is based on public third-party libraries; since this implementation does not mirror Meta's internal evaluation, minor differences in the reproduced numbers may occur.
+2. **Model Compatibility**: This tutorial is specifically for Llama 3 based models, as our prompts include Meta Llama 3 special tokens, e.g. `<|start_header_id|>user<|end_header_id|>`. It will not work with models that are not based on Llama 3.
+
+## Insights from Our Evaluation Process
+
+Here are our insights about the differences in eval configurations and prompting methods between this implementation and the Hugging Face [leaderboard implementation](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/leaderboard).
+
+- **Prompts**: We use Chain-of-Thought (COT) prompts, while the Hugging Face leaderboard does not. The prompts that define the output format are also different.
+- **Metric calculation**: For the MMLU-Pro, BBH and GPQA tasks, we ask the model to generate a response and score the answer parsed from that response, while the Hugging Face leaderboard evaluation compares the log likelihood of all label words, such as [ (A),(B),(C),(D) ].
+- **Parsers**: For generative tasks, where the final answer needs to be parsed before scoring, the parser functions can differ between our implementation and the Hugging Face leaderboard evaluation, as our prompts that define the model output format are designed differently.
+- **Inference**: We use an internal LLM inference solution that loads the original PyTorch checkpoints and does not use padding, while the Hugging Face leaderboard uses Hugging Face format models and sometimes applies padding depending on the task type and batch size.
+- **Tasks**: We run the BBH and MMLU-Pro benchmarks only for pretrained models, and Math-Hard, IFeval, GPQA and MMLU-Pro only for instruct models.
+
+Given those differences, our reproduced numbers cannot be compared to the numbers in the Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard), even if the task names are the same.
+
+## Environment setups
+
+Please install lm-evaluation-harness and our llama-recipes repo as follows:
+
+```
+pip install lm-eval[math,ifeval,sentencepiece,vllm]==0.4.3
+git clone git@github.com:meta-llama/llama-recipes.git
+cd llama-recipes
+pip install -U pip setuptools
+pip install -e .
+cd tools/benchmarks/llm_eval_harness/meta_eval_reproduce
+```
+
+To access our [3.1 evals Hugging Face collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f), you must:
+- Log in to the Hugging Face website, open the 3.1 evals dataset pages and agree to the terms.
+- Follow the [Hugging Face authentication instructions](https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication) to gain read access for your machine.
+
+It is recommended to read the dataset card to understand the meaning of each column and to use the dataset viewer feature on Hugging Face to browse the dataset. It is important to have some basic understanding of our dataset format and content before proceeding.
+
+### Task Selection
+
+Given the extensive number of tasks available (12 for pretrained models and 30 for instruct models), here we will focus on tasks that overlap with the popular Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) as shown in the following:
+
+- **Tasks for pretrained models**: BBH and MMLU-Pro
+- **Tasks for instruct models**: Math-Hard, IFeval, GPQA, and MMLU-Pro
+
+Here, we aim to reproduce the Meta reported benchmark numbers on the aforementioned tasks using the Hugging Face [leaderboard implementation](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/leaderboard). Please follow the instructions below to make the necessary modifications to use our eval prompts and reproduce our reported metrics.
+
+
+### Run eval tasks
+
+1. We created [eval_config.yaml](./eval_config.yaml) to store all the arguments and hyperparameters. This is the main config file you need to change if you want to evaluate other models, and a part of eval_config.yaml looks like this:
+
+```yaml
+model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
+
+evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
+# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
+
+tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+# Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
+
+tensor_parallel_size: 1 # The VLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for one model copy.
+
+data_parallel_size: 4 # The VLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
+
+...
+
+```
+
+   Change `model_name` to the model name you want to evaluate and change the `evals_dataset` according to the model type and parameters. Remember to adjust the `tensor_parallel_size` to 2 or more to load the 70B models and change the `data_parallel_size` accordingly so that `tensor_parallel_size * data_parallel_size` equals the number of GPUs you have. Please read the comments inside this yaml for detailed explanations of the other parameters.
+
+2. We already included all the related eval task yaml and python files in the [meta_template](./meta_template/) folder, which define all the task implementations. You do not need to change these manually; we will use [prepare_meta_eval.py](./prepare_meta_eval.py) to modify them automatically later.
+
+3. Then we can run [prepare_meta_eval.py](./prepare_meta_eval.py), which reads the configuration from [eval_config.yaml](./eval_config.yaml), copies everything in the template folder to a working folder `work_dir`, makes modifications to those templates accordingly, prepares the dataset if needed, and prints out the CLI command to run `lm_eval`.
+
+   To run [prepare_meta_eval.py](./prepare_meta_eval.py), we can do:
+
+```
+python prepare_meta_eval.py --config_path ./eval_config.yaml
+```
+
+   This script will load the default [eval_config.yaml](./eval_config.yaml) config and print out an `lm_eval` command to run the `meta_instruct` group tasks, which include `meta_ifeval`, `meta_math_hard`, `meta_gpqa` and `meta_mmlu_pro_instruct`, for the `meta-llama/Meta-Llama-3.1-8B-Instruct` model using the `meta-llama/Meta-Llama-3.1-8B-Instruct-evals` dataset.
+
+   An example output from [prepare_meta_eval.py](./prepare_meta_eval.py) looks like this:
+
+```
+lm_eval --model vllm --model_args pretrained=meta-llama/Meta-Llama-3.1-8B-Instruct,tensor_parallel_size=1,dtype=auto,gpu_memory_utilization=0.9,data_parallel_size=4,max_model_len=8192,add_bos_token=True,seed=42 --tasks meta_instruct --batch_size auto --output_path eval_results --include_path ./work_dir --seed 42 --log_samples
+```
+
+4. Then just copy the `lm_eval` command printed by [prepare_meta_eval.py](./prepare_meta_eval.py) back to your terminal and run it to get our reproduced result, which will be saved to the `eval_results` folder by default.
+
+**NOTE**: As for `--model vllm`, here we will use VLLM inference instead of Hugging Face inference because of the padding issue.
By default, for the generative tasks, the `lm-eval --model_args="{...}" --batch_size=auto` command uses the Hugging Face inference solution, which applies a static batch method with [left padding](https://github.com/EleutherAI/lm-evaluation-harness/blob/8ad598dfd305ece8c6c05062044442d207279a97/lm_eval/models/huggingface.py#L773) using the EOS_token for Llama models, whereas our internal evaluation loads the original PyTorch checkpoints and handles each generation request asynchronously without any padding. To simulate this, we will use the VLLM inference solution to do dynamic batching without any padding.
+
+**NOTE**: As for `add_bos_token=True`, since the prompts in the evals dataset already include all the special tokens required by the instruct model, such as `<|start_header_id|>user<|end_header_id|>`, we do not use the `--apply_chat_template` argument for instruct models. However, we need the `add_bos_token=True` flag to add the BOS_token back during VLLM inference, as the BOS_token is removed by default in [this PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/1465).
+
+**NOTE**: For `meta_math_hard` tasks, some of our internal math ground truth has been converted to scientific notation, e.g. `6\sqrt{7}` has been converted to `1.59e+1`, which is later handled by our internal math evaluation functions. As the lm-evaluation-harness [math evaluation utils.py](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/leaderboard/math/utils.py) cannot fully handle those conversions, we will use the original ground truth from the original dataset [lighteval/MATH-Hard](https://huggingface.co/datasets/lighteval/MATH-Hard) by joining the tables on the original input questions. The `get_math_data` function in [prepare_meta_eval.py](./prepare_meta_eval.py) will handle this step and produce a local parquet dataset file.
+
+Moreover, we have modified [math_hard/utils.py](./meta_template/math_hard/utils.py) to address two issues:
+
+1. The original Python script only uses [a regular expression "Final Answer: The final answer is(.*?). I hope it is correct."](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/leaderboard/math/utils.py#L192) to get the final answer, because this format is shown in its 4-shot example prompts. However, our MATH-Hard task uses 0-shot COT prompts that ask the model to put the final answer into the string format `Therefore, the final answer is: $\\boxed{answer}$. I hope it is correct.`, which cannot be captured by that regular expression, so we will use `\\box{}` to parse the final answer instead.
+
+2. The [is_equiv(x1: str, x2: str)](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/leaderboard/math/utils.py#L144) function failed to parse some of the ground truth, as we noticed error logs like `[utils.py:158] couldn't parse one of [0,1) or [0,1)`, so all those questions would be marked as wrong. We raised [an issue with lm_evaluation_harness](https://github.com/EleutherAI/lm-evaluation-harness/issues/2212) about this problem and, as a temporary solution, add a string equality check before calling the is_equiv() function.
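+
+To make these two changes concrete, here is a simplified sketch of the parsing and scoring logic described above (the function names and the exact regex are illustrative, not the code shipped in [math_hard/utils.py](./meta_template/math_hard/utils.py)):
+
+```python
+import re
+
+def extract_boxed_answer(response: str) -> str:
+    # Grab the content of the last \boxed{...} in the model response
+    # (simple, non-nested answers only; nested braces need a real parser).
+    matches = re.findall(r"\\boxed\{([^{}]*)\}", response)
+    return matches[-1].strip() if matches else ""
+
+def score_answer(prediction: str, ground_truth: str, equiv_fn=None) -> bool:
+    # Cheap string-equality check first; fall back to a symbolic
+    # equivalence check such as is_equiv() only when the strings differ.
+    if prediction.strip() == ground_truth.strip():
+        return True
+    return bool(equiv_fn) and bool(equiv_fn(prediction, ground_truth))
+
+response = "Therefore, the final answer is: $\\boxed{17}$. I hope it is correct."
+print(score_answer(extract_boxed_answer(response), "17"))  # True
+```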
+
+
+**NOTE**: For `meta_ifeval` tasks, we have to use the original configs, such as `instruction_id_list` and `kwargs`, from [wis-k/instruction-following-eval](https://huggingface.co/datasets/wis-k/instruction-following-eval) in order to use the [lm-evaluation-harness IFeval evaluation](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/leaderboard/ifeval). We perform a similar join-back using the `get_ifeval_data` function in [prepare_meta_eval.py](./prepare_meta_eval.py) to get a local parquet dataset file.
+
+
+### Yaml Config Deep Dive
+
+Here, we will use MMLU-Pro as an example to show the steps to create a yaml config, with detailed explanations, so that people can follow this example to create other task configurations if they want. For more information, please read the lm-evaluation-harness [new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md).
+
+**1. Define the config to load datasets**
+
+We can use our 3.1 evals dataset as the source dataset with the corresponding subset and set the test split to `latest`. For example, if we want to reproduce the MMLU-Pro metric for 3.1 8B-Instruct, the following configs are needed, as explained below:
+
+```yaml
+task: meta_mmlu_pro_instruct
+dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
+dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details
+test_split: latest
+```
+
+If you want to run evaluation on 70B-Instruct, it is recommended to change the `dataset_path` and `dataset_name` from 8B to 70B. Even though 70B-Instruct and 8B-Instruct share the same prompts, the `is_correct` column, which can be used to compare the current reproduced result with the reported result for each sample, is different.
+
+**Note**: Config files for Meta-Llama-3.1-8B-Instruct are already provided in each task subfolder under the [meta_template folder](./meta_template/). Remember to change the eval dataset name according to the model type, and DO NOT use a pretrained evals dataset on instruct models or vice versa.
+
+**2. Configure preprocessing, prompts and ground truth**
+
+Here is the example yaml snippet in the MMLU-Pro config that handles dataset preprocessing, prompts and ground truth.
+
+```yaml
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: gold
+```
+
+- `process_docs`: Defines the preprocessing function for our datasets. In this case, we use the `process_docs` Python function defined in [utils.py](./meta_template/mmlu_pro/utils.py). This function takes the original dataset and outputs a processed dataset whose documents contain `problem` (the input question) and `gold` (the ground truth); a simplified sketch is shown after this list. We also renamed the `is_correct` column to `previously_is_correct` to allow a detailed per-sample comparison between the previously reported score and the reproduced score. You must use an eval dataset and a model of the same type and parameter size to get a valid comparison.
+
+- `doc_to_text`: Defines the prompts. In the MMLU-Pro case, the `input_final_prompts` column always contains a single-element list holding the prompt, so we just use a Python function that returns `input_final_prompts[0]`.
+
+- `doc_to_target`: Defines the ground truth, which in the MMLU-Pro case is `gold`, derived from `input_correct_responses[0]`.
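+
+For reference, here is a simplified sketch of such a `process_docs`/`doc_to_text` pair; it follows the pattern used by the task `utils.py` files under [meta_template](./meta_template/), while the actual implementation also selects a few extra columns from the evals dataset:
+
+```python
+import datasets
+
+def doc_to_text(doc: dict) -> str:
+    # The evals dataset stores the fully formatted prompt as a one-element list.
+    return doc["input_final_prompts"][0]
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc: dict) -> dict:
+        # Keep the question and derive the ground truth from the first correct response.
+        return {
+            "problem": doc["input_question"],
+            "gold": doc["input_correct_responses"][0],
+        }
+    # Keep the previously reported per-sample correctness for later comparison.
+    dataset = dataset.rename_column("is_correct", "previously_is_correct")
+    return dataset.map(_process_doc)
+```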
+
+**3. Configure task type and parser**
+
+While Open LLM Leaderboard v2 uses the [multiple choice format](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#multiple-choice-format) for the MMLU-Pro, BBH and GPQA tasks by comparing the log likelihood of all label words, such as [ (A),(B),(C),(D) ], we use the generative task option: we ask the model to generate a response in sentences given our carefully designed prompts, then use parsers to grab the final answer, and score that final answer against the ground truth. Here is an example config in MMLU-Pro that enables the generative task and defines the regex parser:
+
+```yaml
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: 'best answer is ([A-Z])'
+      - function: "take_first"
+```
+Since the MMLU-Pro task uses 5-shot Chain-of-Thought (COT) prompts and the prompts include the explicit instruction "Your response should end with \"The best answer is [the_answer_letter].\" where the [the_answer_letter] is a letter from the provided choices.", we use the simple and intuitive regex `best answer is ([A-Z])` to parse the model response, take the last occurrence as the final answer, and score that final answer against the ground truth `gold` using the exact match method.
+
+**4. Define generation and metric config**
+
+Then we need to define the generation and metric config, which looks like this:
+```yaml
+generation_kwargs:
+  until: []
+  do_sample: false
+  temperature: 0
+  max_gen_toks: 1024
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+```
+Here we set `num_fewshot` to 0 as our prompts already include the 5-shot examples, and model generation only stops once the generated output exceeds 1024 tokens, as stated in the [mmlu-pro eval details](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/eval_details.md#mmlu-pro). We set `do_sample` to false and `temperature` to 0, as stated in the `eval_config` column of the dataset. We use the `exact_match` metric for this task and calculate the `mean` as the aggregated task number.
+
+**NOTE**: While we tried our best to create the template files, those configs and functions are based on public third-party libraries and are not exactly the same as our internal implementation, so the reproduced numbers may be slightly different.
+
+## Results
+
+Here is the comparison between our reported numbers and the reproduced numbers in this tutorial:
+
+| Model                        | MATH_HARD | GPQA_RAW | MMLU_PRO_RAW | IFeval |
+|------------------------------|-----------|----------|--------------|--------|
+| 3.1 8B-Instruct reported     | 0.254     | 0.328    | 0.47         | 0.804  |
+| 3.1 8B-Instruct reproduced   | 0.2424    | 0.3259   | 0.4675       | 0.7782 |
+| 3.1 70B-Instruct reported    | 0.438     | 0.467    | 0.651        | 0.875  |
+| 3.1 70B-Instruct reproduced  | 0.4388    | 0.4799   | 0.6475       | 0.848  |
+
+| Model                  | BBH_RAW | MMLU_PRO_RAW |
+|------------------------|---------|--------------|
+| 3.1 8B reported        | 0.642   | 0.356        |
+| 3.1 8B reproduced      | 0.6515  | 0.3572       |
+| 3.1 70B reported       | 0.816   | 0.52         |
+| 3.1 70B reproduced     | 0.8191  | 0.5225       |
+
+From the tables above, we can see that most of our reproduced results are very close to the numbers we reported on the [Meta Llama website](https://llama.meta.com/).
+
+**NOTE**: We used the average of `inst_level_strict_acc,none` and `prompt_level_strict_acc,none` to get the final number for `IFeval`, as stated [here](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about#task-evaluations-and-parameters).
+
+**NOTE**: On the [Meta Llama website](https://llama.meta.com/), we reported the `macro_avg` metric, which is the average of all subtask average scores, for the `MMLU-Pro` task, but here we are reproducing the `micro_avg` metric, which is the average score over all individual samples; those `micro_avg` numbers can be found in the [eval_details.md](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/eval_details.md#mmlu-pro).
+
+**NOTE**: The reproduced numbers may be slightly different, as we observed differences of around ±0.01 between reproduction runs because the latest VLLM inference is not fully deterministic even with temperature=0. This behavior may be related to [this issue](https://github.com/vllm-project/vllm/issues/5404),
+or it may be expected due to 16-bit inference, as stated in [this comment](https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535) and [this comment](https://github.com/vllm-project/vllm/issues/4112#issuecomment-2071115725).
+
+## Acknowledgement
+
+This tutorial is inspired by the [leaderboard tasks implementation in lm_evaluation_harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/leaderboard) created by the Hugging Face [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) team.
+We also extend our gratitude to the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) GitHub repo from [EleutherAI](https://www.eleuther.ai/).
diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..50bcd3653386434d86325e23af453dbd7bc4ba54
--- /dev/null
+++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/eval_config.yaml
@@ -0,0 +1,32 @@
+model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.
+
+evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid Meta Llama 3.1 evals dataset name in the Llama 3.1 Evals collection.
+# Must be one of the following ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals","meta-llama/Meta-Llama-3.1-70B-Instruct-evals","meta-llama/Meta-Llama-3.1-405B-Instruct-evals","meta-llama/Meta-Llama-3.1-8B-evals","meta-llama/Meta-Llama-3.1-70B-evals","meta-llama/Meta-Llama-3.1-405B-evals"]
+
+tasks: "meta_instruct" # Available tasks for instruct model: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
+# Available tasks for pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.
+
+tensor_parallel_size: 1 # The VLLM argument that specifies the tensor parallel size for the model, e.g. how many GPUs to use for one model copy.
+
+data_parallel_size: 4 # The VLLM argument that specifies the data parallel size for the model, e.g. how many copies of the model will be used.
+
+gpu_memory_utilization: 0.9 # The VLLM argument that specifies the GPU memory utilization; the rest will be reserved for the KV cache.
+
+max_model_len: 8192 # The VLLM argument that specifies the model max length; decrease this value only if you run into GPU memory issues. Please make sure the max_gen_toks in the yaml does not exceed this length.
+
+batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is strongly recommended to use 'auto' with vllm to speed up inference.
+
+output_path: "eval_results" # The output folder to store all the eval results and samples.
+
+#limit: 12 # Limit number of examples per task, set 'null' to run all.
+limit: null # Limit number of examples per task, set 'null' to run all.
+
+verbosity: "INFO" # Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.
+
+log_samples: true # If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis.
+
+work_dir: ./work_dir # The work folder where the task template yaml files will be copied and modified, and where the datasets for math_hard and ifeval will be downloaded.
+
+template_dir: ./meta_template # Path to the folder that contains all the meta task templates.
+
+show_config: false # If True, shows the full config of all tasks at the end of the evaluation.
diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e762101a1cf9114fe1c459094814f0f51f846a30
--- /dev/null
+++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/bbh_3shot_cot.yaml
@@ -0,0 +1,28 @@
+dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
+dataset_name: Meta-Llama-3.1-8B-evals__bbh__details
+task: meta_bbh
+output_type: generate_until
+process_docs: !function utils.process_docs
+test_split: latest
+doc_to_text: !function utils.doc_to_text
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: 'the answer is (.*?)\.'
+ - function: "take_first" +generation_kwargs: + until: "\n\nQ: " + do_sample: false + temperature: 0 + max_gen_toks: 512 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/utils.py b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1ed449c78053b7e042543df077d2aa82095531 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/bbh/utils.py @@ -0,0 +1,21 @@ +import random +import re + +import datasets + + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "answer": doc["input_correct_responses"][0], + } + return out_doc + dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","output_prediction_text"]) + dataset = dataset.rename_column("is_correct","previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7c441490327ec06a448a353d91db042ca24e4d1 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/gpqa_0shot_cot.yaml @@ -0,0 +1,29 @@ +dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals +dataset_name: Meta-Llama-3.1-8B-Instruct-evals__gpqa__details +task: meta_gpqa +output_type: generate_until +process_docs: !function utils.process_docs +test_split: latest +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 2048 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/utils.py b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6a0349fca5edd8faf95d2eb04ab97384fc9c1012 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/gpqa_cot/utils.py @@ -0,0 +1,20 @@ +import random +import re + +import datasets + + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"]) + dataset = dataset.rename_column("is_correct","previously_is_correct") 
+ dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7196d16d3f856bd54fe894d55c913ca7fa0a793 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/ifeval.yaml @@ -0,0 +1,32 @@ +task: meta_ifeval +dataset_path: parquet +dataset_kwargs: + data_files: ./work_dir/joined_ifeval.parquet +output_type: generate_until +test_split: train +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true +metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/utils.py b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7caecb1a2eecf213096a68be3097f142322f7b6e --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/ifeval/utils.py @@ -0,0 +1,139 @@ +import dataclasses +from typing import Dict, Optional, Union + +from lm_eval.tasks.ifeval import instructions_registry + + +@dataclasses.dataclass +class InputExample: + key: int + instruction_id_list: list[str] + prompt: str + kwargs: list[Dict[str, Optional[Union[str, int]]]] + + +@dataclasses.dataclass +class OutputExample: + instruction_id_list: list[str] + prompt: str + response: str + follow_all_instructions: bool + follow_instruction_list: list[bool] + + +def test_instruction_following_strict( + inp, + response, +): + """Tests response to see if instructions are followed.""" + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + if response.strip() and instruction.check_following(response): + is_following_list.append(True) + else: + is_following_list.append(False) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def test_instruction_following_loose( + inp, + response, +): + """Tests response for an upper bound for following instructions.""" + r = response.split("\n") + response_remove_first = "\n".join(r[1:]).strip() + response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + is_following = False + for r in all_responses: + if r.strip() and instruction.check_following(r): + is_following = True + break + + is_following_list.append(is_following) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def process_results(doc, results): + new_kwargs = [] + for item in doc["kwargs"]: + if item["nth_paragraph"]: + item["nth_paragraph"] = int(item["nth_paragraph"]) + new_kwargs.append(item) + inp = InputExample( + key=doc["key"], + instruction_id_list=doc["instruction_id_list"], + prompt=doc["prompt"], + kwargs=new_kwargs, + ) + response = results[0] + + out_strict = test_instruction_following_strict(inp, response) + out_loose = test_instruction_following_loose(inp, response) + + return { + "prompt_level_strict_acc": out_strict.follow_all_instructions, + "inst_level_strict_acc": out_strict.follow_instruction_list, + "prompt_level_loose_acc": out_loose.follow_all_instructions, + "inst_level_loose_acc": out_loose.follow_instruction_list, + } + + +def agg_inst_level_acc(items): + flat_items = [item for sublist in items for item in sublist] + inst_level_acc = sum(flat_items) / len(flat_items) + return inst_level_acc diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..9ebe931944ebf806f29551bc330ee3f22d2a0203 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/math_hard_0shot_cot.yaml @@ -0,0 +1,21 @@ +dataset_path: parquet +dataset_kwargs: + data_files: ./work_dir/joined_math.parquet +task: meta_math_hard +process_docs: !function utils.process_docs +output_type: generate_until +test_split: train +doc_to_text: !function utils.doc_to_text +process_results: !function utils.process_results +doc_to_target: answer +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 5120 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/utils.py b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3b595ccfae6ac8dcc79ee3d139d2d9b821626c01 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/math_hard/utils.py @@ -0,0 +1,268 @@ +# Most of the code taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/cddce0a148ec1710e2d60546c6f92727dd8a78fd/lm_eval/tasks/leaderboard/math/utils.py +import re +import signal +from typing import Dict, List, Optional + +import datasets + +from lm_eval.utils import eval_logger + + +try: + import sympy + from sympy.parsing.latex import parse_latex +except ModuleNotFoundError: + raise ModuleNotFoundError( + "`sympy` is required for generating translation task prompt templates. \ +please install sympy via pip install lm-eval[math] or pip install -e .[math]", + ) + +# taken from +# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "answer": normalize_final_answer( + remove_boxed(last_boxed_only_string(doc["solution"])) + ), + "meta_target": doc["input_correct_responses"] + } + return out_doc + return dataset.map(_process_doc) + + +def process_results(doc: dict, results: List[str]) -> Dict[str, int]: + candidates = results[0] + last_boxed_string = last_boxed_only_string(candidates) + if not last_boxed_string: + # No boxed string found, so we can't evaluate + return {"exact_match": 0} + unnormalized_answer = remove_boxed(last_boxed_string) + answer = normalize_final_answer(unnormalized_answer) + + if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]): + retval = 1 + else: + retval = 0 + + results = { + "exact_match": retval, + } + return results + + +def last_boxed_only_string(string: str) -> Optional[str]: + idx = string.rfind("\\boxed") + if "\\boxed " in string: + return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def remove_boxed(s: str) -> str: + if "\\boxed " in s: + left = 
"\\boxed " + assert s[: len(left)] == left + return s[len(left) :] + + left = "\\boxed{" + + assert s[: len(left)] == left + assert s[-1] == "}" + + return s[len(left) : -1] + + +class timeout: + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) + + +def is_equiv(x1: str, x2: str) -> bool: + """ + x1 and x2 are normalized latex string + """ + try: + with timeout(seconds=5): + try: + parsed_x1 = parse_latex(x1) + parsed_x2 = parse_latex(x2) + except ( + sympy.parsing.latex.errors.LaTeXParsingError, + sympy.SympifyError, + TypeError, + ): + eval_logger.debug(f"couldn't parse one of {x1} or {x2}") + return False + + try: + diff = parsed_x1 - parsed_x2 + except TypeError: + eval_logger.debug(f"couldn't subtract {x1} and {x2}") + return False + + try: + if sympy.simplify(diff) == 0: + return True + else: + return False + except ValueError: + eval_logger.debug( + f"Had some trouble simplifying when comparing {x1} and {x2}" + ) + except TimeoutError: + eval_logger.debug(f"Timed out comparing {x1} and {x2}") + return False + except ImportError as e: + eval_logger.error(e) + raise + except Exception as e: + eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}") + return False + + +def get_unnormalized_answer(text: str) -> str: + INVALID_ANSWER = "[invalidanswer]" + end_seq = "I hope it is correct." + text += end_seq + match = re.search( + r"Final Answer: The final answer is(.*?). I hope it is correct.", + text, + ) + if match: + return match.group(1).strip() + else: + return INVALID_ANSWER + + +SUBSTITUTIONS = [ + ("an ", ""), + ("a ", ""), + (".$", "$"), + ("\\$", ""), + (r"\ ", ""), + (" ", ""), + ("mbox", "text"), + (",\\text{and}", ","), + ("\\text{and}", ","), + ("\\text{m}", "\\text{}"), +] +REMOVED_EXPRESSIONS = [ + "square", + "ways", + "integers", + "dollars", + "mph", + "inches", + "ft", + "hours", + "km", + "units", + "\\ldots", + "sue", + "points", + "feet", + "minutes", + "digits", + "cents", + "degrees", + "cm", + "gm", + "pounds", + "meters", + "meals", + "edges", + "students", + "childrentickets", + "multiples", + "\\text{s}", + "\\text{.}", + "\\text{\ns}", + "\\text{}^2", + "\\text{}^3", + "\\text{\n}", + "\\text{}", + r"\mathrm{th}", + r"^\circ", + r"^{\circ}", + r"\;", + r",\!", + "{,}", + '"', + "\\dots", +] + + +def normalize_final_answer(final_answer: str) -> str: + """ + Normalize a final answer to a quantitative reasoning question. + + Copied character for character from appendix D of Lewkowycz et al. (2022) + """ + final_answer = final_answer.split("=")[-1] + + for before, after in SUBSTITUTIONS: + final_answer = final_answer.replace(before, after) + for expr in REMOVED_EXPRESSIONS: + final_answer = final_answer.replace(expr, "") + + # Extract answer that is in LaTeX math, is bold, + # is surrounded by a box, etc. 
+ final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + + return final_answer diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_instruct.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f3a3d54546931ade239c97f23f3ea965e8f01ce --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_instruct.yaml @@ -0,0 +1,6 @@ +group: meta_instruct +task: + - meta_ifeval + - meta_math_hard + - meta_gpqa + - meta_mmlu_pro_instruct diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_pretrain.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b21dd1abed1370cebfaa92469cc41ffbff26966 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/meta_pretrain.yaml @@ -0,0 +1,4 @@ +group: meta_pretrain +task: + - meta_bbh + - meta_mmlu_pro_pretrain diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..398ebabf40d017faeb74debc8828f8c3a1a811b4 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml @@ -0,0 +1,29 @@ +task: meta_mmlu_pro_instruct +dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals +dataset_name: Meta-Llama-3.1-8B-Instruct-evals__mmlu_pro__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 1024 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dad98120ca0ea945b6e3fed6db44a8a11ad650bd --- /dev/null +++ 
b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml @@ -0,0 +1,28 @@ +task: meta_mmlu_pro_pretrain +dataset_path: meta-llama/Meta-Llama-3.1-8B-evals +dataset_name: Meta-Llama-3.1-8B-evals__mmlu_pro__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: 'answer is \(([A-Z])\)' + - function: "take_first" +generation_kwargs: + until: "\n\nQ: " + do_sample: false + temperature: 0 + max_gen_toks: 512 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/utils.py b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..51d9f71f92a0a8e8708c211359bde5ed65080f03 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/meta_template/mmlu_pro/utils.py @@ -0,0 +1,21 @@ +import string + + +import datasets + + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"]) + dataset = dataset.rename_column("is_correct","previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) diff --git a/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..a45b87a658f5b5d95e49c87b806eeea7c0a393c4 --- /dev/null +++ b/tools/benchmarks/llm_eval_harness/meta_eval_reproduce/prepare_meta_eval.py @@ -0,0 +1,237 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement. 
+ +import argparse +import errno +import shutil +import glob +import os +from pathlib import Path +import nltk +import yaml +from datasets import Dataset, load_dataset + + +# get the ifeval from the evals dataset and join it with the original ifeval datasets +def get_ifeval_data(model_name, output_dir): + print(f"preparing the ifeval data using {model_name}'s evals dataset") + if model_name not in [ + "Meta-Llama-3.1-8B-Instruct", + "Meta-Llama-3.1-70B-Instruct", + "Meta-Llama-3.1-405B-Instruct", + ]: + raise ValueError( + "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for IFEval" + ) + original_dataset_name = "wis-k/instruction-following-eval" + meta_dataset_name = f"meta-llama/{model_name}-evals" + meta_data = load_dataset( + meta_dataset_name, + name=f"{model_name}-evals__ifeval__strict__details", + split="latest", + ) + ifeval_data = load_dataset(original_dataset_name, split="train") + meta_data = meta_data.map(get_question) + meta_df = meta_data.to_pandas() + ifeval_df = ifeval_data.to_pandas() + ifeval_df = ifeval_df.rename(columns={"prompt": "input_question"}) + # join the two datasets on the input_question column + joined = meta_df.join(ifeval_df.set_index("input_question"), on="input_question") + joined = joined.rename(columns={"input_final_prompts": "prompt"}) + joined = joined.rename(columns={"is_correct": "previous_is_correct"}) + joined = Dataset.from_pandas(joined) + joined = joined.select_columns( + [ + "input_question", + "prompt", + "previous_is_correct", + "instruction_id_list", + "kwargs", + "output_prediction_text", + "key", + ] + ) + joined.rename_column("output_prediction_text", "previous_output_prediction_text") + joined.to_parquet(output_dir + "/joined_ifeval.parquet") + + +# get the math_hard data from the evals dataset and join it with the original math_hard dataset +def get_math_data(model_name, output_dir): + print(f"preparing the math data using {model_name}'s evals dataset") + if model_name not in [ + "Meta-Llama-3.1-8B-Instruct", + "Meta-Llama-3.1-70B-Instruct", + "Meta-Llama-3.1-405B-Instruct", + ]: + raise ValueError( + "Only Meta-Llama-3.1-8B-Instruct, Meta-Llama-3.1-70B-Instruct, Meta-Llama-3.1-405B-Instruct models are supported for MATH_hard" + ) + original_dataset_name = "lighteval/MATH-Hard" + meta_dataset_name = f"meta-llama/{model_name}-evals" + meta_data = load_dataset( + meta_dataset_name, + name=f"{model_name}-evals__math_hard__details", + split="latest", + ) + math_data = load_dataset(original_dataset_name, split="test") + meta_df = meta_data.to_pandas() + math_df = math_data.to_pandas() + math_df = math_df.rename(columns={"problem": "input_question"}) + # join the two datasets on the input_question column + joined = meta_df.join(math_df.set_index("input_question"), on="input_question") + joined = Dataset.from_pandas(joined) + joined = joined.select_columns( + [ + "input_question", + "input_correct_responses", + "input_final_prompts", + "is_correct", + "solution", + "output_prediction_text", + ] + ) + joined = joined.rename_column("is_correct", "previous_is_correct") + joined = joined.rename_column( + "output_prediction_text", "previous_output_prediction_text" + ) + + joined.to_parquet(output_dir + "/joined_math.parquet") + + +# get the question from the ifeval dataset +def get_question(example): + try: + example["input_question"] = ( + eval( + example["input_question"] + .replace("null", "None") + .replace("true", "True") + .replace("false", "False") + )["dialog"][0]["body"] + 
.replace("Is it True that the first song", "Is it true that the first song") + .replace("Is the following True", "Is the following true") + ) + example["input_final_prompts"] = example["input_final_prompts"][0] + return example + except: + print(example["input_question"]) + return + + +# change the yaml file to use the correct model name +def change_yaml(args, base_name): + for yaml_file in glob.glob(args.template_dir + "**/*/*.yaml", recursive=True): + with open(yaml_file, "r") as sources: + lines = sources.readlines() + output_path = yaml_file.replace(args.template_dir, args.work_dir) + print(f"changing {yaml_file} to output_path: {output_path}") + path = Path(output_path) + yaml_dir = path.parent + with open(output_path, "w") as output: + for line in lines: + output.write( + line.replace("Meta-Llama-3.1-8B", base_name).replace( + "WORK_DIR", str(yaml_dir) + ) + ) + + +# copy the files and change the yaml file to use the correct model name +def copy_and_prepare(args): + # nltk punkt_tab package is needed + nltk.download('punkt_tab') + if not os.path.exists(args.work_dir): + # Copy the all files, including yaml files and python files, from template folder to the work folder + + copy_dir(args.template_dir, args.work_dir) + else: + print("work_dir already exists, no need to copy files") + # Use the template yaml to get the correct model name in work_dir yaml + base_name = ( + args.evals_dataset.split("/")[-1].replace("-evals", "").replace("-Instruct", "") + ) + change_yaml(args, base_name) + + +def parse_eval_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + "--config_path", + type=str, + default="./eval_config.yaml", + help="the config yaml file that contains all the eval parameters", + ) + return parser.parse_args() + + +def prepare_datasets(args): + # Prepare the dataset for the IFeval and MATH_Hard tasks as we need to join the original dataset with the evals dataset by the actual questions. 
+    # model_name is derived from the evals_dataset name
+    task_list = args.tasks.split(",")
+    model_name = args.evals_dataset.split("/")[-1].replace("-evals", "")
+    if "meta_instruct" in task_list:
+        get_ifeval_data(model_name, args.work_dir)
+        get_math_data(model_name, args.work_dir)
+    else:
+        if "meta_ifeval" in task_list:
+            get_ifeval_data(model_name, args.work_dir)
+        if "meta_math_hard" in task_list:
+            get_math_data(model_name, args.work_dir)
+
+
+# copy the files from src to dst
+def copy_dir(src, dst):
+    try:
+        shutil.copytree(src, dst)
+    except OSError as exc:  # python >2.5
+        if exc.errno in (errno.ENOTDIR, errno.EINVAL):
+            shutil.copy(src, dst)
+        else:
+            raise
+
+
+# load the config yaml file
+def load_config(config_path: str = "./config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        config = yaml.safe_load(file)
+    return config
+
+
+if __name__ == "__main__":
+    args = parse_eval_args()
+    config = load_config(args.config_path)
+    # Create VLLM model args
+    for k, v in config.items():
+        args.__setattr__(k, v)
+    if not os.path.exists(args.template_dir):
+        raise ValueError("The template_dir does not exist, please check the path")
+    if args.evals_dataset not in [
+        "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
+        "meta-llama/Meta-Llama-3.1-70B-Instruct-evals",
+        "meta-llama/Meta-Llama-3.1-405B-Instruct-evals",
+        "meta-llama/Meta-Llama-3.1-8B-evals",
+        "meta-llama/Meta-Llama-3.1-70B-evals",
+        "meta-llama/Meta-Llama-3.1-405B-evals",
+    ]:
+        raise ValueError(
+            "The evals dataset is not valid; please double check the name. It must be one of the names in the Llama 3.1 Evals collection"
+        )
+    args.model_args = f"pretrained={args.model_name},tensor_parallel_size={args.tensor_parallel_size},dtype=auto,gpu_memory_utilization={args.gpu_memory_utilization},data_parallel_size={args.data_parallel_size},max_model_len={args.max_model_len},add_bos_token=True,seed=42"
+    # Copy all the files from the template folder to the work folder
+    copy_and_prepare(args)
+    # Prepare the datasets for the IFeval and MATH_Hard tasks, as we need to join the original datasets with the evals datasets
+    prepare_datasets(args)
+    print(
+        f"preparation for {args.model_name} using {args.evals_dataset} is done, everything is saved in the work_dir: {args.work_dir}"
+    )
+    command_str = f"lm_eval --model vllm --model_args {args.model_args} --tasks {args.tasks} --batch_size auto --output_path {args.output_path} --include_path {os.path.abspath(args.work_dir)} --seed 42 "
+    if args.limit:
+        command_str += f" --limit {args.limit}"
+    if args.log_samples:
+        command_str += " --log_samples "
+    if args.show_config:
+        command_str += " --show_config "
+    print("please use the following command to run the meta reproduce evals:")
+    print(command_str)
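+
+# A hypothetical end-to-end usage sketch (paths and the printed command depend on your
+# eval_config.yaml; the values below assume the defaults shipped in this folder):
+#
+#   python prepare_meta_eval.py --config_path ./eval_config.yaml
+#
+# ...then copy and run the printed command, which will look roughly like:
+#
+#   lm_eval --model vllm --model_args pretrained=meta-llama/Meta-Llama-3.1-8B-Instruct,... \
+#       --tasks meta_instruct --batch_size auto --output_path eval_results \
+#       --include_path ./work_dir --seed 42 --log_samples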