diff --git a/README.md b/README.md index 6982d6e568fef33faea070c41de1a7d9b25ca85a..a2224991007216bc901ff30af7cc0609c561d532 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ -# Selecting Influential Data for - +# LESS: Selecting Influential Data for Targeted Instruction Tuning 🌟 [ArXiv Preprint](TODO) +This repo hosts the code for the paper "LESS: Selecting Influential Data for Targeted Instruction Tuning". In this work, we propose a data selection method to select influential data to induce a target capability. + ## 🔗 Quick Links -- [Selecting Influential Data for](#selecting-influential-data-for) +- [LESS: Selecting Influential Data for Targeted Instruction Tuning](#less-selecting-influential-data-for-targeted-instruction-tuning) - [🔗 Quick Links](#-quick-links) - [Install Requirements](#install-requirements) - [Data Preparation](#data-preparation) @@ -36,9 +37,7 @@ pip install -e . ## Data Preparation -We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare hour instruction tuning datasets. You can also find a processed copy of files here [TODO]. - -We also get the evaluation data ready in the same way. +We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare our instruction tuning datasets. In our project, we utilize a combination of four training datasets: Flan v2, COT, Dolly, and Open Assistant. For the purposes of evaluation, we employ three additional datasets: MMLU, Tydiqa, and BBH. A processed version of these files will be made available here [TODO]. 
## Data Selection Pipeline diff --git a/evaluation/README.md b/evaluation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9ca7236b83f542630d8a7dfc210dd1f2ab8b9fb9 --- /dev/null +++ b/evaluation/README.md @@ -0,0 +1,14 @@ +## Evaluation + +We mainly employ three evaluation datasets to assess the performance of our data selection pipeline: **MMLU**, **Tydiqa**, and **BBH**. We use the evaluation pipeline [open-instruct](https://github.com/allenai/open-instruct/tree/main/eval). To evaluate a trained model, please follow the steps below: + +### Step 1: Install Open-Instruct +```bash +git clone https://github.com/allenai/open-instruct.git +cd open-instruct +pip install -e . +``` + +### Step 2: Evaluation +Please check out the `eval_mmlu.sh`, `eval_tydiqa.sh`, and `eval_bbh.sh` scripts in the `evaluation` directory. These scripts contain the necessary commands to evaluate the model on the respective datasets. + diff --git a/evaluation/eval.sh b/evaluation/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..42d158baeb5d2ca51942ad2355cc7e1a89d87f1f --- /dev/null +++ b/evaluation/eval.sh @@ -0,0 +1,22 @@ +set_save_dir() { + mdir=$1 + if [[ -d $mdir ]]; then + save_dir=${mdir}/eval/$2 + else + save_dir=$n/space10/out/$(basename $mdir)/eval/$2 + fi +} + +set_valid_dir() { + mdir=$1 + if [[ -d $mdir ]]; then + save_dir=${mdir}/valid/$2 + else + save_dir=$n/space10/out/$(basename $mdir)/valid/$2 + fi +} + + +export set_save_dir +export set_valid_dir + diff --git a/evaluation/eval_bbh.sh b/evaluation/eval_bbh.sh new file mode 100644 index 0000000000000000000000000000000000000000..d1b12177e82f5ac69f530985ae526664a523a056 --- /dev/null +++ b/evaluation/eval_bbh.sh @@ -0,0 +1,56 @@ +source eval.sh + +# main evaluation function +eval_bbh() { + cd $n/space10/open-instruct + mdir=$1 + type=$2 + set_save_dir $mdir bbh + mkdir -p $save_dir + cmd="python -m eval.bbh.run_eval \ + --data_dir $DATA_DIR/bbh \ + --save_dir $save_dir \ + 
--model $mdir \ + --tokenizer $mdir \ + --eval_batch_size 10 \ + --convert_to_bf16 \ + --max_num_examples_per_task 40" + eval "$cmd" +} + +# evaluate the validation set, which is not supported yet +valid_bbh() { + cd $n/space10/open-instruct + mdir=$1 + type=$2 + set_valid_dir $mdir bbh + echo $save_dir + mkdir -p $save_dir + cmd="python -m eval.bbh.run_eval \ + --data_dir $DATA_DIR/bbh-valid \ + --save_dir $save_dir \ + --model $mdir \ + --tokenizer $mdir \ + --eval_batch_size 10 \ + --convert_to_bf16 \ + --eval_valid \ + --max_num_examples_per_task 3" +} + +# extract the results +extract_bbh() { + mdir=$1 + set_save_dir $mdir bbh-nonchat + result=$(jq .average_exact_match $save_dir/metrics.json) + result=$(echo "$result * 100" | bc) + echo $result +} + +# extract the results for the validation set +extract_valid_bbh() { + mdir=$1 + set_valid_dir $mdir bbh-nonchat + result=$(jq .average_exact_match $save_dir/metrics.json) + result=$(echo "$result * 100" | bc) + echo $result +} diff --git a/evaluation/eval_mmlu.sh b/evaluation/eval_mmlu.sh new file mode 100644 index 0000000000000000000000000000000000000000..a92bfb30db2b38f80c68addf18ca91f86dbd62a5 --- /dev/null +++ b/evaluation/eval_mmlu.sh @@ -0,0 +1,53 @@ +source eval.sh + +# main evaluation function +eval_mmlu() { + mdir=$1 + set_save_dir $mdir mmlu + mkdir -p $save_dir + cmd="python -m eval.mmlu.run_eval \ + --ntrain 5 \ + --data_dir $DATA_DIR/mmlu \ + --save_dir $save_dir \ + --model_name_or_path $mdir \ + --tokenizer_name_or_path $mdir \ + --eval_batch_size 4 \ + --convert_to_bf16" + eval "$cmd" +} + +# evaluate the validation set, which is not supported yet +valid_mmlu() { + mdir=$1 + type=$2 + set_valid_dir $mdir mmlu + mkdir -p $save_dir + cmd="python -m eval.mmlu.run_eval \ + --ntrain 5 \ + --eval_valid \ + --data_dir $DATA_DIR/mmlu \ + --save_dir $save_dir \ + --model_name_or_path $mdir \ + --tokenizer_name_or_path $mdir \ + --eval_batch_size 4 \ + --convert_to_bf16" + eval "$cmd" +} + +# extract the 
results +extract_mmlu() { + mdir=$1 + set_save_dir $mdir mmlu + result=$(jq .average_acc $save_dir/metrics.json) + result=$(echo "$result * 100" | bc) + echo $result +} + +# extract the results for the validation set +extract_valid_mmlu() { + mdir=$1 + set_valid_dir $mdir mmlu + result=$(jq .average_acc $save_dir/metrics.json) + result=$(echo "$result * 100" | bc) + echo $result +} diff --git a/evaluation/eval_tydiqa.sh b/evaluation/eval_tydiqa.sh new file mode 100644 index 0000000000000000000000000000000000000000..351303b0cafea3cbc31acf42f0888479443b7e6f --- /dev/null +++ b/evaluation/eval_tydiqa.sh @@ -0,0 +1,56 @@ +source eval.sh + +# main evaluation function +eval_tydiqa() { + mdir=$1 + set_save_dir $mdir tydiqa + mkdir -p $save_dir + cmd="python -m eval.tydiqa.run_eval \ + --data_dir $DATA_DIR/tydiqa/ \ + --n_shot 1 \ + --max_num_examples_per_lang 200 \ + --max_context_length 512 \ + --save_dir $save_dir \ + --model $mdir \ + --tokenizer $mdir \ + --eval_batch_size 20 \ + --use_chat_format \ + --convert_to_bf16 \ + --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format" +} + +# evaluate the validation set, which is not supported yet +valid_tydiqa() { + mdir=$1 + set_valid_dir $mdir tydiqa + mkdir -p $save_dir + cmd="python -m eval.tydiqa.run_eval \ + --data_dir $DATA_DIR/tydiqa/one-shot-valid \ + --n_shot 0 \ + --eval_valid \ + --max_num_examples_per_lang 200 \ + --max_context_length 512 \ + --save_dir $save_dir \ + --model $mdir \ + --tokenizer $mdir \ + --eval_batch_size 20 \ + --use_chat_format \ + --convert_to_bf16 \ + --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format" +} + +# extract the results +extract_tydiqa() { + mdir=$1 + set_save_dir $mdir tydiqa + result=$(jq .average.f1 $save_dir/metrics.json) + echo $result +} + +# extract the results for the validation set +extract_valid_tydiqa() { + mdir=$1 + set_valid_dir $mdir tydiqa + result=$(jq .average.f1 $save_dir/metrics.json) + echo $result +} \ No 
newline at end of file