Skip to content
Snippets Groups Projects
Commit 33deb347 authored by xiamengzhou's avatar xiamengzhou
Browse files

update

parent 12456064
No related branches found
No related tags found
No related merge requests found
# Selecting Influential Data for # LESS: Selecting Influential Data for Targeted Instruction Tuning
🌟 [ArXiv Preprint](TODO) 🌟 [ArXiv Preprint](TODO)
This repo hosts the code for the paper "LESS: Selecting Influential Data for Targeted Instruction Tuning". In this work, we propose a data selection method to select influential data to induce a target capability.
## 🔗 Quick Links ## 🔗 Quick Links
- [Selecting Influential Data for](#selecting-influential-data-for) - [LESS: Selecting Influential Data for Targeted Instruction Tuning](#less-selecting-influential-data-for-targeted-instruction-tuning)
- [🔗 Quick Links](#-quick-links) - [🔗 Quick Links](#-quick-links)
- [Install Requirements](#install-requirements) - [Install Requirements](#install-requirements)
- [Data Preparation](#data-preparation) - [Data Preparation](#data-preparation)
...@@ -36,9 +37,7 @@ pip install -e . ...@@ -36,9 +37,7 @@ pip install -e .
## Data Preparation ## Data Preparation
We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare our instruction tuning datasets. You can also find a processed copy of files here [TODO]. We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare our instruction tuning datasets. In our project, we utilize a combination of four training datasets: Flan v2, COT, Dolly, and Open Assistant. For the purposes of evaluation, we employ three additional datasets: MMLU, Tydiqa, and BBH. A processed version of these files will be made available [here] [TODO].
We also get the evaluation data ready in the same way.
## Data Selection Pipeline ## Data Selection Pipeline
......
## Evaluation
We mainly employ three evaluation datasets to assess the performance of our data selection pipeline: **MMLU**, **Tydiqa**, and **BBH**. We use the evaluation pipeline [open-instruct](https://github.com/allenai/open-instruct/tree/main/eval). To evaluate a trained model, please follow the steps below:
### Step 1: Install Open-Instruct
```bash
git clone https://github.com/allenai/open-instruct.git
cd open-instruct
pip install -e .
```
### Step 2: Evaluation
Please check out the `eval_mmlu.sh`, `eval_tydiqa.sh`, and `eval_bbh.sh` scripts in the `evaluation` directory. These scripts contain the necessary commands to evaluate the model on the respective datasets.
# Resolve the evaluation output directory for a model and store it in the
# global `save_dir` (callers read it after this returns).
# Globals:   n (read; fallback output root), save_dir (written)
# Arguments: $1 - model directory (or bare model name)
#            $2 - task name (e.g. bbh, mmlu, tydiqa)
# If $1 exists as a directory, results live under it; otherwise they go
# under $n/space10/out/<basename of $1>.
set_save_dir() {
  # `local` keeps mdir from leaking; callers only rely on save_dir.
  local mdir=$1
  if [[ -d $mdir ]]; then
    save_dir=${mdir}/eval/$2
  else
    # Quote basename's argument so model paths containing spaces work.
    save_dir=$n/space10/out/$(basename "$mdir")/eval/$2
  fi
}
# Resolve the validation output directory for a model and store it in the
# global `save_dir` (note: same global as set_save_dir, only the subdir
# differs: valid/ instead of eval/).
# Globals:   n (read; fallback output root), save_dir (written)
# Arguments: $1 - model directory (or bare model name)
#            $2 - task name (e.g. bbh, mmlu, tydiqa)
set_valid_dir() {
  # `local` keeps mdir from leaking; callers only rely on save_dir.
  local mdir=$1
  if [[ -d $mdir ]]; then
    save_dir=${mdir}/valid/$2
  else
    # Quote basename's argument so model paths containing spaces work.
    save_dir=$n/space10/out/$(basename "$mdir")/valid/$2
  fi
}
# Make the directory helpers visible to subshells. Plain `export name`
# exports a *variable* of that name, not the function — `-f` is required
# to export a shell function.
export -f set_save_dir
export -f set_valid_dir
source eval.sh
# main evaluation function
# Main BBH evaluation: runs open-instruct's eval.bbh.run_eval on a model.
# Globals:   n, DATA_DIR (read), save_dir (set via set_save_dir), mdir/type/cmd (written)
# Arguments: $1 - model directory; $2 - type tag (assigned but unused here;
#            kept for interface parity with the other eval_* helpers)
eval_bbh() {
  # Abort if the open-instruct checkout is missing; otherwise `eval` below
  # would silently run from the wrong directory.
  cd "$n/space10/open-instruct" || return 1
  mdir=$1
  type=$2
  set_save_dir "$mdir" bbh
  mkdir -p "$save_dir"
  cmd="python -m eval.bbh.run_eval \
    --data_dir $DATA_DIR/bbh \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 10 \
    --convert_to_bf16 \
    --max_num_examples_per_task 40"
  eval "$cmd"
}
# evaluate the validation set, which is not supported yet
# BBH evaluation on the validation split (flagged "not supported yet" in
# the original comment; --eval_valid is passed through to run_eval).
# Globals:   n, DATA_DIR (read), save_dir (set via set_valid_dir), mdir/type/cmd (written)
# Arguments: $1 - model directory; $2 - type tag (assigned but unused)
valid_bbh() {
  cd "$n/space10/open-instruct" || return 1
  mdir=$1
  type=$2
  set_valid_dir "$mdir" bbh
  echo "$save_dir"
  mkdir -p "$save_dir"
  cmd="python -m eval.bbh.run_eval \
    --data_dir $DATA_DIR/bbh-valid \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 10 \
    --convert_to_bf16 \
    --eval_valid \
    --max_num_examples_per_task 3"
  # Fix: the command was built but never executed; run it like valid_mmlu does.
  eval "$cmd"
}
# extract the results
# Print the BBH average exact-match score as a percentage, read from the
# metrics.json produced by the evaluation run.
# NOTE(review): this reads from the "bbh-nonchat" subdir while eval_bbh
# writes to "bbh" — confirm the mismatch is intentional.
extract_bbh() {
  mdir=$1
  set_save_dir "$mdir" bbh-nonchat
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_exact_match "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# extract the results for the validation set
# Print the BBH validation-set average exact-match score as a percentage.
# NOTE(review): reads "bbh-nonchat" while valid_bbh writes "bbh" — confirm
# the mismatch is intentional.
extract_valid_bbh() {
  mdir=$1
  set_valid_dir "$mdir" bbh-nonchat
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_exact_match "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# Pull in shared helpers from eval.sh (presumably set_save_dir/set_valid_dir
# live there in the original layout — TODO confirm).
source eval.sh
# main evaluation function
# Main MMLU evaluation: runs open-instruct's eval.mmlu.run_eval (5-shot).
# Globals:   DATA_DIR (read), save_dir (set via set_save_dir), mdir/cmd (written)
# Arguments: $1 - model directory
eval_mmlu() {
  mdir=$1
  set_save_dir "$mdir" mmlu
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.mmlu.run_eval \
    --ntrain 5 \
    --data_dir $DATA_DIR/mmlu \
    --save_dir $save_dir \
    --model_name_or_path $mdir \
    --tokenizer_name_or_path $mdir \
    --eval_batch_size 4 \
    --convert_to_bf16"
  eval "$cmd"
}
# evaluate the validation set, which is not supported yet
# MMLU evaluation on the validation split (flagged "not supported yet" in
# the original comment; --eval_valid is passed through to run_eval).
# Globals:   DATA_DIR (read), save_dir (set via set_valid_dir), mdir/type/cmd (written)
# Arguments: $1 - model directory; $2 - type tag (assigned but unused)
valid_mmlu() {
  mdir=$1
  type=$2
  set_valid_dir "$mdir" mmlu
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.mmlu.run_eval \
    --ntrain 5 \
    --eval_valid \
    --data_dir $DATA_DIR/mmlu \
    --save_dir $save_dir \
    --model_name_or_path $mdir \
    --tokenizer_name_or_path $mdir \
    --eval_batch_size 4 \
    --convert_to_bf16"
  eval "$cmd"
}
# extract the results
# Print the MMLU average accuracy as a percentage, read from the
# metrics.json produced by the evaluation run.
extract_mmlu() {
  mdir=$1
  set_save_dir "$mdir" mmlu
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_acc "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# extract the results for the validation set
# Print the MMLU validation-set average accuracy as a percentage.
extract_valid_mmlu() {
  mdir=$1
  set_valid_dir "$mdir" mmlu
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_acc "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# Pull in shared helpers from eval.sh (presumably set_save_dir/set_valid_dir
# live there in the original layout — TODO confirm).
source eval.sh
# main evaluation function
# Main TydiQA evaluation: runs open-instruct's eval.tydiqa.run_eval
# (1-shot, tulu chat format).
# Globals:   DATA_DIR (read), save_dir (set via set_save_dir), mdir/cmd (written)
# Arguments: $1 - model directory
eval_tydiqa() {
  mdir=$1
  set_save_dir "$mdir" tydiqa
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.tydiqa.run_eval \
    --data_dir $DATA_DIR/tydiqa/ \
    --n_shot 1 \
    --max_num_examples_per_lang 200 \
    --max_context_length 512 \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 20 \
    --use_chat_format \
    --convert_to_bf16 \
    --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format"
  # Fix: the command was built but never executed; run it like
  # eval_bbh/eval_mmlu do.
  eval "$cmd"
}
# evaluate the validation set, which is not supported yet
# TydiQA evaluation on the validation split (flagged "not supported yet"
# in the original comment; --eval_valid is passed through to run_eval).
# Globals:   DATA_DIR (read), save_dir (set via set_valid_dir), mdir/cmd (written)
# Arguments: $1 - model directory
valid_tydiqa() {
  mdir=$1
  set_valid_dir "$mdir" tydiqa
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.tydiqa.run_eval \
    --data_dir $DATA_DIR/tydiqa/one-shot-valid \
    --n_shot 0 \
    --eval_valid \
    --max_num_examples_per_lang 200 \
    --max_context_length 512 \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 20 \
    --use_chat_format \
    --convert_to_bf16 \
    --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format"
  # Fix: the command was built but never executed; run it like valid_mmlu does.
  eval "$cmd"
}
# extract the results
# Print the TydiQA average F1 score from metrics.json (note: F1 is
# reported as-is; no *100 scaling, unlike the BBH/MMLU extractors).
extract_tydiqa() {
  mdir=$1
  set_save_dir "$mdir" tydiqa
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average.f1 "$save_dir/metrics.json")
  echo "$result"
}
# extract the results for the validation set
# Print the TydiQA validation-set average F1 score from metrics.json.
extract_valid_tydiqa() {
  mdir=$1
  set_valid_dir "$mdir" tydiqa
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average.f1 "$save_dir/metrics.json")
  echo "$result"
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment