diff --git a/recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb b/recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb
index 721cb0d07af8449adc4e5d5d886515fe949dab3f..9d77b51f7b14c8fad6d477be90b965f02f24caf4 100644
--- a/recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb
+++ b/recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb
@@ -145,7 +145,7 @@
     "class Args:\n",
     "    def __init__(self, \n",
     "                 max_examples=100, \n",
-    "                 sql_model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\", \n",
+    "                 sql_model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", \n",
     "                 gold_file_name=\"gold-test-set.jsonl\",\n",
     "                 training_file_name=\"generated_queries.jsonl\",\n",
     "                 num_to_generate=10):\n",
@@ -197,7 +197,7 @@
     }
    ],
    "source": [
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "question = \"\"\"Who is the highest paid NBA player?\"\"\"\n",
     "system = f\"\"\"You are an NBA analyst with 15 years of experience writing complex SQL queries. Consider the nba_roster table with the following schema:\n",
@@ -418,7 +418,7 @@
     "class ScoreStage(GenerationNode):\n",
     "    def __init__(self):\n",
     "        super().__init__(\n",
-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
     "            max_new_tokens=150,\n",
     "        )\n",
     "\n",
@@ -712,7 +712,7 @@
     "class ModelStage(GenerationNode):\n",
     "    def __init__(self):\n",
     "        super().__init__(\n",
-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
     "            max_new_tokens=300,\n",
     "        )\n",
     "\n",
@@ -808,7 +808,7 @@
     "class QuestionStage(GenerationNode):\n",
     "    def __init__(self):\n",
     "        super().__init__(\n",
-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
     "            max_new_tokens=150,\n",
     "        )\n",
     "\n",
@@ -1055,7 +1055,7 @@
    ],
    "source": [
     "args = Args()\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",
@@ -1601,7 +1601,7 @@
    ],
    "source": [
     "args = Args(training_file_name=\"archive/generated_queries_large_filtered_cleaned.jsonl\")\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",
@@ -1798,7 +1798,7 @@
    ],
    "source": [
     "args = Args(training_file_name=\"generated_queries_v2.jsonl\")\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",
@@ -1966,7 +1966,7 @@
    ],
    "source": [
     "args = Args(training_file_name=\"archive/generated_queries_v2_large_filtered_cleaned.jsonl\")\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",
diff --git a/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py b/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py
index ca8d8e44b470434b6ef3a631d34706b9a2b4139c..b60e5fa0062834e06cd118aa526f54c0e59bc491 100644
--- a/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py
+++ b/recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py
@@ -16,7 +16,7 @@ def parse_arguments():
     parser.add_argument(
         "--sql-model-name",
         type=str,
-        default="meta-llama/Meta-Llama-3-8B-Instruct",
+        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
         help="The model to use for text2sql",
         required=False,
     )
diff --git a/recipes/3p_integrations/llama_on_prem.md b/recipes/3p_integrations/llama_on_prem.md
index d43649a2299cf4fa77636ac418b20dbd4b0e94c2..f9fc877de58f8363e20ab676913bff4d52cf0fb6 100644
--- a/recipes/3p_integrations/llama_on_prem.md
+++ b/recipes/3p_integrations/llama_on_prem.md
@@ -8,7 +8,7 @@ We'll use the Amazon EC2 instance running Ubuntu with an A10G 24GB GPU as an exa
 
 The Colab notebook to connect via LangChain with Llama 3 hosted as the vLLM and TGI API services is [here](https://colab.research.google.com/drive/1rYWLdgTGIU1yCHmRpAOB2D-84fPzmOJg), also shown in the sections below.
 
-This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.
+This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.
 
 You'll also need your Hugging Face access token which you can get at your Settings page [here](https://huggingface.co/settings/tokens).
 
@@ -33,7 +33,7 @@ There are two ways to deploy Llama 3 via vLLM, as a general API server or an Ope
 Run the command below to deploy vLLM as a general Llama 3 service:
 
 ```
-python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
 ```
 
 Then on another terminal you can run:
@@ -68,13 +68,13 @@ Also, if you have multiple GPUs, you can add the `--tensor-parallel-size` argume
 git clone https://github.com/vllm-project/vllm
 cd vllm/vllm/entrypoints
 conda activate llama3
-python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct --tensor-parallel-size 4
+python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 4
 ```
 
 With multiple GPUs, you can also run replica of models as long as your model size can fit into targeted GPU memory. For example, if you have two A10G with 24 GB memory, you can run two Llama 3 8B models at the same time. This can be done by launching two api servers each targeting specific CUDA cores on different ports:
-`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000  --model meta-llama/Meta-Llama-3-8B-Instruct`
+`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000  --model meta-llama/Meta-Llama-3.1-8B-Instruct`
 and
-`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001  --model meta-llama/Meta-Llama-3-8B-Instruct`
+`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001  --model meta-llama/Meta-Llama-3.1-8B-Instruct`
 The benefit would be that you can balance incoming requests to both models, reaching higher batch size processing for a trade-off of generation latency.
 
 
@@ -83,14 +83,14 @@ The benefit would be that you can balance incoming requests to both models, reac
 You can also deploy the vLLM hosted Llama 3 as an OpenAI-Compatible service to easily replace code using OpenAI API. First, run the command below:
 
 ```
-python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
 ```
 
 Then on another terminal, run:
 
 ```
 curl http://localhost:5000/v1/completions -H "Content-Type: application/json" -d '{
-        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
         "prompt": "Who wrote the book Innovators dilemma?",
         "max_tokens": 300,
         "temperature": 0
@@ -118,7 +118,7 @@ from langchain.llms import VLLMOpenAI
 llm = VLLMOpenAI(
     openai_api_key="EMPTY",
     openai_api_base="http://<vllm_server_ip_address>:5000/v1",
-    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
 )
 
 print(llm("Who wrote the book godfather?"))
@@ -136,7 +136,7 @@ You can now use the Llama 3 instance `llm` created this way in any of the demo a
 The easiest way to deploy Llama 3 with TGI is using its official docker image. First, replace `<your_hugging_face_access_token>` and set the three required shell variables (you may replace the `model` value above with another Llama 3 model):
 
 ```
-model=meta-llama/Meta-Llama-3-8B-Instruct
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
 volume=$PWD/data
 token=<your_hugging_face_access_token>
 ```
diff --git a/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb b/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb
index 2fed523f16615c89080ebe840c49b2cfef94e83b..1935c4c327f8bc0a4ba3dc5e52ffe62f38f06b83 100644
--- a/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb
+++ b/recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb
@@ -92,7 +92,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
+    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3.1-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
     "\n",
     "1. Accept Terms of Service for Meta Llama 3 on Meta [website](https://llama.meta.com/llama-downloads).\n",
     "2. Use the same email address from Step (1) to login into Hugging Face.\n",
@@ -125,7 +125,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
+    "model = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
     "tokenizer = AutoTokenizer.from_pretrained(model)"
    ]
   },
diff --git a/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb b/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
index 04135f6bce71a06a69b69ef268729773a430c9dd..999a51837475dfe9b46bf34f6c6db0631988edd9 100644
--- a/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
+++ b/recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
@@ -90,7 +90,7 @@
     "from llama_recipes.configs import train_config as TRAIN_CONFIG\n",
     "\n",
     "train_config = TRAIN_CONFIG()\n",
-    "train_config.model_name = \"meta-llama/Meta-Llama-3-8B\"\n",
+    "train_config.model_name = \"meta-llama/Meta-Llama-3.1-8B\"\n",
     "train_config.num_epochs = 1\n",
     "train_config.run_validation = False\n",
     "train_config.gradient_accumulation_steps = 4\n",
diff --git a/recipes/responsible_ai/prompt_guard/Prompt Guard Tutorial.ipynb b/recipes/responsible_ai/prompt_guard/Prompt Guard Tutorial.ipynb
index 9067c3615cad5b7737195dfa9ac5a987bb140a0f..fa0013dd17c7735c29da56bac9222536da7a7593 100644
--- a/recipes/responsible_ai/prompt_guard/Prompt Guard Tutorial.ipynb	
+++ b/recipes/responsible_ai/prompt_guard/Prompt Guard Tutorial.ipynb	
@@ -55,7 +55,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "prompt_injection_model_name = 'meta-llama/PromptGuard'\n",
+    "prompt_injection_model_name = 'meta-llama/Prompt-Guard-86M'\n",
     "tokenizer = AutoTokenizer.from_pretrained(prompt_injection_model_name)\n",
     "model = AutoModelForSequenceClassification.from_pretrained(prompt_injection_model_name)"
    ]
diff --git a/recipes/responsible_ai/prompt_guard/inference.py b/recipes/responsible_ai/prompt_guard/inference.py
index 4e5e35d62ddb83c6339532762db5c0f27b5cf19a..1cb0f7c06cc72895d0c86dcc6f9bb8d2b426ebff 100644
--- a/recipes/responsible_ai/prompt_guard/inference.py
+++ b/recipes/responsible_ai/prompt_guard/inference.py
@@ -11,12 +11,12 @@ Utilities for loading the PromptGuard model and evaluating text for jailbreaks a
 """
 
 
-def load_model_and_tokenizer(model_name='meta-llama/PromptGuard'):
+def load_model_and_tokenizer(model_name='meta-llama/Prompt-Guard-86M'):
     """
     Load the PromptGuard model from Hugging Face or a local model.
     
     Args:
-        model_name (str): The name of the model to load. Default is 'meta-llama/PromptGuard'.
+        model_name (str): The name of the model to load. Default is 'meta-llama/Prompt-Guard-86M'.
         
     Returns:
         transformers.PreTrainedModel: The loaded model.
diff --git a/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
index ecfce02d87b41b37779e6f54f74eb7b6eb4114cd..3a86fcc840be2ccd82de2232888a4add37db4c42 100644
--- a/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
+++ b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
@@ -418,7 +418,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model=meta-llama/Meta-Llama-3-8B-Instruct\n",
+    "model=meta-llama/Meta-Llama-3.1-8B-Instruct\n",
     "volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run\n",
     "token=#your-huggingface-token\n",
     "docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model"
diff --git a/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
index 0858a7f5638474d8c1598401daca7d4e577a007a..c9eca4e5cdc7d4c9baaac5a7e0f6c9c4f299edbd 100644
--- a/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
+++ b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
@@ -934,11 +934,11 @@
       "source": [
         "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
         "import torch\n",
-        "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+        "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
         "# CPU Enabled uncomment below 👇🏽\n",
-        "# model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+        "# model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
         "# GPU Enabled use below 👇🏽\n",
-        "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\", torch_dtype=torch.bfloat16, device_map=\"auto\")"
+        "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\", torch_dtype=torch.bfloat16, device_map=\"auto\")"
       ]
     },
     {
diff --git a/src/llama_recipes/inference/safety_utils.py b/src/llama_recipes/inference/safety_utils.py
index 74dd394dcf62a31205c3f14681a101da441a52e3..f81a05a3acbb8314dc8172f7bc93434602fcc868 100644
--- a/src/llama_recipes/inference/safety_utils.py
+++ b/src/llama_recipes/inference/safety_utils.py
@@ -160,7 +160,7 @@ class LlamaGuardSafetyChecker(object):
         from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
         from llama_recipes.inference.prompt_format_utils import build_default_prompt, create_conversation, LlamaGuardVersion
 
-        model_id = "meta-llama/LlamaGuard-7b"
+        model_id = "meta-llama/Llama-Guard-3-8B"
 
         quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
diff --git a/src/llama_recipes/tools/README.md b/src/llama_recipes/tools/README.md
index ebc05a0e8c181016058e89e0577e1688b7516805..95525f32c6e64f7a51567fecb1cd2a376a514d1e 100644
--- a/src/llama_recipes/tools/README.md
+++ b/src/llama_recipes/tools/README.md
@@ -7,7 +7,7 @@ This is the reverse conversion for `convert_llama_weights_to_hf.py` script from
 - Copy file params.json from the official llama download into that directory.
 - Run the conversion script. `model-path` can be a Hugging Face hub model or a local hf model directory.
 ```
-python -m llama_recipes.tools.convert_hf_weights_to_llama --model-path meta-llama/Meta-Llama-3-70B-Instruct --output-dir test70B --model-size 70B
+python -m llama_recipes.tools.convert_hf_weights_to_llama --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir test70B --model-size 70B
 ```
 
 ## Step 1: Run inference
diff --git a/src/tests/conftest.py b/src/tests/conftest.py
index b82d9a9e93c40cf966696d32c516460266d309cc..3cc791360dc1ac907ed8dc8bcf739e30aa9f88ff 100644
--- a/src/tests/conftest.py
+++ b/src/tests/conftest.py
@@ -6,7 +6,7 @@ import pytest
 from transformers import AutoTokenizer
 
 ACCESS_ERROR_MSG = "Could not access tokenizer at 'meta-llama/Llama-2-7b-hf'. Did you log into huggingface hub and provided the correct token?"
-LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3-8B"]
+LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B"]
 
 @pytest.fixture(params=LLAMA_VERSIONS)
 def llama_version(request):
diff --git a/src/tests/datasets/test_custom_dataset.py b/src/tests/datasets/test_custom_dataset.py
index 5b8028af0bfb9344e1da13d9363db4857b80ce1e..7ce03a38f28bd116dac9821a7adfbdfc334d95bb 100644
--- a/src/tests/datasets/test_custom_dataset.py
+++ b/src/tests/datasets/test_custom_dataset.py
@@ -11,7 +11,7 @@ EXPECTED_RESULTS={
         "example_1": "[INST] Who made Berlin [/INST] dunno",
         "example_2": "[INST] Quiero preparar una pizza de pepperoni, puedes darme los pasos para hacerla? [/INST] Claro!",
     },
-    "meta-llama/Meta-Llama-3-8B":{
+    "meta-llama/Meta-Llama-3.1-8B":{
         "example_1": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWho made Berlin<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndunno<|eot_id|><|end_of_text|>",
         "example_2": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow to start learning guitar and become a master at it?",
     },
diff --git a/src/tests/datasets/test_grammar_datasets.py b/src/tests/datasets/test_grammar_datasets.py
index 718b72179b26dcf938096e4761d4eb44effb2303..e05e51ca9ee9237f9c0fcf06d9c0222f46ab4635 100644
--- a/src/tests/datasets/test_grammar_datasets.py
+++ b/src/tests/datasets/test_grammar_datasets.py
@@ -10,7 +10,7 @@ EXPECTED_RESULTS = {
         "label": 1152,
         "pos": 31,
     },
-    "meta-llama/Meta-Llama-3-8B":{
+    "meta-llama/Meta-Llama-3.1-8B":{
         "label": 40,
         "pos": 26,
     },
diff --git a/src/tests/datasets/test_samsum_datasets.py b/src/tests/datasets/test_samsum_datasets.py
index 72f35d37ea1436490b9e143c7ea87519ef9edebc..4b6668b25cd75ebae4e0cd4995d49d8a3f0dcf4e 100644
--- a/src/tests/datasets/test_samsum_datasets.py
+++ b/src/tests/datasets/test_samsum_datasets.py
@@ -10,7 +10,7 @@ EXPECTED_RESULTS = {
         "label": 8432,
         "pos": 242,
     },
-    "meta-llama/Meta-Llama-3-8B":{
+    "meta-llama/Meta-Llama-3.1-8B":{
         "label": 2250,
         "pos": 211,
     },
diff --git a/src/tests/test_batching.py b/src/tests/test_batching.py
index e22d8f0e2e3752d7e7855fd838289409f3f6e9ee..c450c18ac4667d3a9a986ff3a5d265754965176a 100644
--- a/src/tests/test_batching.py
+++ b/src/tests/test_batching.py
@@ -9,7 +9,7 @@ EXPECTED_SAMPLE_NUMBER ={
         "train": 96,
         "eval": 42,
     },
-    "meta-llama/Meta-Llama-3-8B": {
+    "meta-llama/Meta-Llama-3.1-8B": {
         "train": 79,
         "eval": 34,
     }
diff --git a/tools/benchmarks/inference/on_prem/README.md b/tools/benchmarks/inference/on_prem/README.md
index f29c8f8179eea7d1b23899ec137159b261bc1db6..afffd6ee5155f9641788128a3bacee23946ea6bb 100644
--- a/tools/benchmarks/inference/on_prem/README.md
+++ b/tools/benchmarks/inference/on_prem/README.md
@@ -17,8 +17,8 @@ For example, we have an instance from Azure that has 8xA100 80G GPUs, and we wan
 
 Here are examples for deploying 2x70B chat models over 8 GPUs with vLLM.
 ```
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
-CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3.1-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
+CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3.1-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
 ```
 Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal.
 
diff --git a/tools/benchmarks/llm_eval_harness/README.md b/tools/benchmarks/llm_eval_harness/README.md
index 274393ef5a9af552f7b204f228bd58e4e1368d60..38606770f406ae8705d2a32a1ae292203c23b200 100644
--- a/tools/benchmarks/llm_eval_harness/README.md
+++ b/tools/benchmarks/llm_eval_harness/README.md
@@ -39,7 +39,7 @@ pip install -e .
 To run evaluation for Hugging Face `Llama3 8B` model  on a single GPU please run the following,
 
 ```bash
-python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3-8B --tasks hellaswag --device cuda:0   --batch_size 8
+python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3.1-8B --tasks hellaswag --device cuda:0   --batch_size 8
 
 ```
 Tasks can be extended by using `,` between them for example `--tasks hellaswag,arc`.
@@ -51,7 +51,7 @@ To set the number of shots you can use `--num_fewshot` to set the number for few
 In case you have fine-tuned your model using PEFT you can set the PATH to the PEFT checkpoints using PEFT as part of model_args as shown below:
 
 ```bash
-python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3-8B, dtype="float",peft=../peft_output --tasks hellaswag --num_fewshot 10  --device cuda:0 --batch_size 8
+python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3.1-8B, dtype="float",peft=../peft_output --tasks hellaswag --num_fewshot 10  --device cuda:0 --batch_size 8
 ```
 
 ### Limit the number of examples in benchmarks
@@ -59,7 +59,7 @@ python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3-8B, dt
 There has been an study from [IBM on efficient benchmarking of LLMs](https://arxiv.org/pdf/2308.11696.pdf), with main take a way that to identify if a model is performing poorly, benchmarking on wider range of tasks is more important than the number example in each task. This means you could run the evaluation harness with fewer number of example to have initial decision if the performance got worse from the base line. To limit the number of example here, it can be set using `--limit` flag with actual desired number. But for the full assessment you would need to run the full evaluation. Please read more in the paper linked above.
 
 ```bash
-python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3-8B,dtype="float",peft=../peft_output --tasks hellaswag --num_fewshot 10  --device cuda:0 --batch_size 8 --limit 100
+python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3.1-8B,dtype="float",peft=../peft_output --tasks hellaswag --num_fewshot 10  --device cuda:0 --batch_size 8 --limit 100
 ```
 
 ### Reproducing Hugging Face Open-LLM-Leaderboard
@@ -76,7 +76,7 @@ bash open_llm_eval_prep.sh
 Now we can run the eval benchmark:
 
 ```bash
-python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3-8B,dtype="float",peft=../peft_output --num_fewshot 10  --device cuda:0 --batch_size 8 --limit 100 --open_llm_leaderboard_tasks
+python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3.1-8B,dtype="float",peft=../peft_output --num_fewshot 10  --device cuda:0 --batch_size 8 --limit 100 --open_llm_leaderboard_tasks
 ```
 
 In the HF leaderboard, the [LLMs are evaluated on 7 benchmarks](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) from Language Model Evaluation Harness as described below:
@@ -107,7 +107,7 @@ To perform *data-parallel evaluation* (where each GPU loads a **separate full co
 ```bash
 accelerate config
 
-accelerate launch eval.py --model hf --model_args "pretrained=meta-llama/Meta-Llama-3-8B" --limit 100 --open-llm-leaderboard-tasks --output_path ./results.json --log_samples
+accelerate launch eval.py --model hf --model_args "pretrained=meta-llama/Meta-Llama-3.1-8B" --limit 100 --open-llm-leaderboard-tasks --output_path ./results.json --log_samples
 ```
 
 In case your model can fit on a single GPU, this allows you to evaluate on K GPUs K times faster than on one.
@@ -119,7 +119,7 @@ In case your model is *too large to fit on a single GPU.*
 In this setting, run the library *outside of the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows:
 
 ```bash
-python eval.py --model hf --model_args "pretrained=meta-llama/Meta-Llama-3-8B,parallelize=True" --limit 100 --open_llm_leaderboard_tasks --output_path ./results.json --log_samples
+python eval.py --model hf --model_args "pretrained=meta-llama/Meta-Llama-3.1-8B,parallelize=True" --limit 100 --open_llm_leaderboard_tasks --output_path ./results.json --log_samples
 ```
 
 
@@ -138,7 +138,7 @@ These two options (`accelerate launch` and `parallelize=True`) are mutually excl
 Also `lm-evaluation-harness` supports vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example:
 
 ```bash
-python eval.py --model vllm --model_args "pretrained=meta-llama/Meta-Llama-3-8B,tensor_parallel_size=1,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=2" --limit 100 --open_llm_leaderboard_tasks --output_path ./results.json --log_samples --batch_size auto
+python eval.py --model vllm --model_args "pretrained=meta-llama/Meta-Llama-3.1-8B,tensor_parallel_size=1,dtype=auto,gpu_memory_utilization=0.8,data_parallel_size=2" --limit 100 --open_llm_leaderboard_tasks --output_path ./results.json --log_samples --batch_size auto
 ```
 For a full list of supported vLLM configurations, please to [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/076372ee9ee81e25c4e2061256400570354a8d1a/lm_eval/models/vllm_causallms.py#L44-L62).