diff --git a/README.md b/README.md
index 6982d6e568fef33faea070c41de1a7d9b25ca85a..a2224991007216bc901ff30af7cc0609c561d532 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
-# Selecting Influential Data for 
-
+# LESS: Selecting Influential Data for Targeted Instruction Tuning
 🌟 [ArXiv Preprint](TODO) 
 
+This repo hosts the code for the paper "LESS: Selecting Influential Data for Targeted Instruction Tuning". In this work, we propose LESS, a data selection method that identifies the training examples most influential for inducing a target capability.
+
 ## 🔗 Quick Links
-- [Selecting Influential Data for](#selecting-influential-data-for)
+- [LESS: Selecting Influential Data for Targeted Instruction Tuning](#less-selecting-influential-data-for-targeted-instruction-tuning)
   - [🔗 Quick Links](#-quick-links)
   - [Install Requirements](#install-requirements)
   - [Data Preparation](#data-preparation)
@@ -36,9 +37,7 @@ pip install -e .
 
 
 ## Data Preparation
-We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare hour instruction tuning datasets. You can also find a processed copy of files here [TODO]. 
-
-We also get the evaluation data ready in the same way. 
+We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare our instruction tuning datasets. In our project, we use a combination of four training datasets: Flan v2, CoT, Dolly, and Open Assistant. For evaluation, we use three additional datasets: MMLU, TyDiQA, and BBH. A processed version of these files will be made available [here] [TODO].
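+
+For reference, open-instruct provides preparation scripts that download and convert both the training and evaluation data. A minimal sketch (script names follow the open-instruct README at the time of writing and may change across versions):
+
+```bash
+git clone https://github.com/allenai/open-instruct.git
+cd open-instruct
+./scripts/prepare_train_data.sh   # downloads and reformats Flan v2, CoT, Dolly, Open Assistant, ...
+./scripts/prepare_eval_data.sh    # downloads the evaluation data, including MMLU, TyDiQA, and BBH
+```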
 
 ## Data Selection Pipeline
 
diff --git a/evaluation/README.md b/evaluation/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ca7236b83f542630d8a7dfc210dd1f2ab8b9fb9
--- /dev/null
+++ b/evaluation/README.md
@@ -0,0 +1,14 @@
+## Evaluation
+
+We mainly employ three evaluation datasets to assess the performance of our data selection pipeline: **MMLU**, **TyDiQA**, and **BBH**, using the evaluation pipeline from [open-instruct](https://github.com/allenai/open-instruct/tree/main/eval). To evaluate a trained model, please follow the steps below:
+
+### Step 1: Install Open-Instruct
+```bash
+git clone https://github.com/allenai/open-instruct.git
+cd open-instruct
+pip install -e .
+```
+
+### Step 2: Evaluation
+Please check out the `eval_mmlu.sh`, `eval_tydiqa.sh`, and `eval_bbh.sh` scripts in the `evaluation` directory. These scripts contain the commands needed to evaluate a model on the respective datasets; a usage sketch follows.
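+
+For example, to run the MMLU evaluation and read off the score (a minimal sketch; the model path is a placeholder, and `DATA_DIR` must point to your prepared evaluation data):
+
+```bash
+source eval_mmlu.sh                   # also sources eval.sh for the save-dir helpers
+eval_mmlu /path/to/trained_model      # writes results under .../eval/mmlu/metrics.json
+extract_mmlu /path/to/trained_model   # prints the average accuracy, scaled by 100
+```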
+
diff --git a/evaluation/eval.sh b/evaluation/eval.sh
new file mode 100644
index 0000000000000000000000000000000000000000..42d158baeb5d2ca51942ad2355cc7e1a89d87f1f
--- /dev/null
+++ b/evaluation/eval.sh
@@ -0,0 +1,22 @@
+# Resolve where evaluation results are written: inside the model directory if
+# it exists locally, otherwise under the $n/space10/out scratch path.
+set_save_dir() {
+    mdir=$1
+    if [[ -d $mdir ]]; then
+        save_dir=${mdir}/eval/$2
+    else
+        save_dir=$n/space10/out/$(basename $mdir)/eval/$2
+    fi
+}
+
+# Same as set_save_dir, but for validation-set results.
+set_valid_dir() {
+    mdir=$1
+    if [[ -d $mdir ]]; then
+        save_dir=${mdir}/valid/$2
+    else
+        save_dir=$n/space10/out/$(basename $mdir)/valid/$2
+    fi
+}
+
+
+# export -f is required to export shell functions; plain export only covers variables
+export -f set_save_dir
+export -f set_valid_dir
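+
+# Usage sketch (hypothetical model path):
+#   set_save_dir /path/to/model bbh   # -> save_dir=/path/to/model/eval/bbh when the directory exists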
+
diff --git a/evaluation/eval_bbh.sh b/evaluation/eval_bbh.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d1b12177e82f5ac69f530985ae526664a523a056
--- /dev/null
+++ b/evaluation/eval_bbh.sh
@@ -0,0 +1,56 @@
+source eval.sh
+
+# main evaluation function
+eval_bbh() {
+    cd $n/space10/open-instruct
+    mdir=$1
+    set_save_dir $mdir bbh
+    mkdir -p $save_dir
+    cmd="python -m eval.bbh.run_eval \
+    --data_dir $DATA_DIR/bbh \
+    --save_dir $save_dir \
+    --model $mdir \
+    --tokenizer $mdir \
+    --eval_batch_size 10 \
+    --convert_to_bf16 \
+    --max_num_examples_per_task 40"
+    eval "$cmd"
+}
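+
+# usage sketch (hypothetical model path):
+#   source eval_bbh.sh; eval_bbh /path/to/model; extract_bbh /path/to/model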
+
+# evaluate on the validation set (not supported yet)
+valid_bbh() {
+    cd $n/space10/open-instruct
+    mdir=$1
+    set_valid_dir $mdir bbh
+    echo $save_dir
+    mkdir -p $save_dir
+    cmd="python -m eval.bbh.run_eval \
+    --data_dir $DATA_DIR/bbh-valid \
+    --save_dir $save_dir \
+    --model $mdir \
+    --tokenizer $mdir \
+    --eval_batch_size 10 \
+    --convert_to_bf16 \
+    --eval_valid \
+    --max_num_examples_per_task 3"
+    eval "$cmd"
+}
+
+# extract the results
+extract_bbh() {
+    mdir=$1
+    set_save_dir $mdir bbh
+    result=$(jq .average_exact_match $save_dir/metrics.json)
+    result=$(echo "$result * 100" | bc)
+    echo $result
+}
+
+# extract the results for the validation set
+extract_valid_bbh() {
+    mdir=$1
+    set_valid_dir $mdir bbh
+    result=$(jq .average_exact_match $save_dir/metrics.json)
+    result=$(echo "$result * 100" | bc)
+    echo $result
+}
diff --git a/evaluation/eval_mmlu.sh b/evaluation/eval_mmlu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a92bfb30db2b38f80c68addf18ca91f86dbd62a5
--- /dev/null
+++ b/evaluation/eval_mmlu.sh
@@ -0,0 +1,53 @@
+source eval.sh
+
+# main evaluation function
+eval_mmlu() {
+    mdir=$1
+    set_save_dir $mdir mmlu
+    mkdir -p $save_dir
+    cmd="python -m eval.mmlu.run_eval \
+    --ntrain 5 \
+    --data_dir $DATA_DIR/mmlu \
+    --save_dir $save_dir \
+    --model_name_or_path $mdir \
+    --tokenizer_name_or_path $mdir \
+    --eval_batch_size 4 \
+    --convert_to_bf16"
+    eval "$cmd"
+}
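+
+# usage sketch (hypothetical model path):
+#   source eval_mmlu.sh; eval_mmlu /path/to/model; extract_mmlu /path/to/model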
+
+# evaluate on the validation set (not supported yet)
+valid_mmlu() {
+    mdir=$1
+    set_valid_dir $mdir mmlu
+    mkdir -p $save_dir
+    cmd="python -m eval.mmlu.run_eval \
+    --ntrain 5 \
+    --eval_valid \
+    --data_dir $DATA_DIR/mmlu \
+    --save_dir $save_dir \
+    --model_name_or_path $mdir \
+    --tokenizer_name_or_path $mdir \
+    --eval_batch_size 4 \
+    --convert_to_bf16"
+    eval "$cmd"
+}
+
+# extract the results
+extract_mmlu() {
+    mdir=$1
+    set_save_dir $mdir mmlu
+    result=$(jq .average_acc $save_dir/metrics.json)
+    result=$(echo "$result * 100" | bc)
+    echo $result
+}
+
+# extract the results for the validation set
+extract_valid_mmlu() {
+    mdir=$1
+    set_valid_dir $mdir mmlu
+    result=$(jq .average_acc $save_dir/metrics.json)
+    result=$(echo "$result * 100" | bc)
+    echo $result
+}
diff --git a/evaluation/eval_tydiqa.sh b/evaluation/eval_tydiqa.sh
new file mode 100644
index 0000000000000000000000000000000000000000..351303b0cafea3cbc31acf42f0888479443b7e6f
--- /dev/null
+++ b/evaluation/eval_tydiqa.sh
@@ -0,0 +1,56 @@
+source eval.sh
+
+# main evaluation function
+eval_tydiqa() {
+    mdir=$1
+    set_save_dir $mdir tydiqa
+    mkdir -p $save_dir
+    cmd="python -m eval.tydiqa.run_eval \
+    --data_dir $DATA_DIR/tydiqa/ \
+    --n_shot 1 \
+    --max_num_examples_per_lang 200 \
+    --max_context_length 512 \
+    --save_dir $save_dir \
+    --model $mdir \
+    --tokenizer $mdir \
+    --eval_batch_size 20 \
+    --use_chat_format \
+    --convert_to_bf16 \
+    --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format"
+    eval "$cmd"
+}
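+
+# usage sketch (hypothetical model path):
+#   source eval_tydiqa.sh; eval_tydiqa /path/to/model; extract_tydiqa /path/to/model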
+
+# evaluate on the validation set (not supported yet)
+valid_tydiqa() {
+    mdir=$1
+    set_valid_dir $mdir tydiqa
+    mkdir -p $save_dir
+    cmd="python -m eval.tydiqa.run_eval \
+    --data_dir $DATA_DIR/tydiqa/one-shot-valid \
+    --n_shot 0 \
+    --eval_valid \
+    --max_num_examples_per_lang 200 \
+    --max_context_length 512 \
+    --save_dir $save_dir \
+    --model $mdir \
+    --tokenizer $mdir \
+    --eval_batch_size 20 \
+    --use_chat_format \
+    --convert_to_bf16 \
+    --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format"
+    eval "$cmd"
+}
+
+# extract the results
+extract_tydiqa() {
+    mdir=$1
+    set_save_dir $mdir tydiqa
+    result=$(jq .average.f1 $save_dir/metrics.json)
+    echo $result
+}
+
+# extract the results for the validation set
+extract_valid_tydiqa() {
+    mdir=$1
+    set_valid_dir $mdir tydiqa
+    result=$(jq .average.f1 $save_dir/metrics.json)
+    echo $result
+}
\ No newline at end of file