Skip to content
Snippets Groups Projects
Commit 33deb347 authored by xiamengzhou's avatar xiamengzhou
Browse files

update

parent 12456064
No related branches found
No related tags found
No related merge requests found
# Selecting Influential Data for # LESS: Selecting Influential Data for Targeted Instruction Tuning
🌟 [ArXiv Preprint](TODO) 🌟 [ArXiv Preprint](TODO)
This repo hosts the code for the paper "LESS: Selecting Influential Data for Targeted Instruction Tuning". In this work, we propose a data selection method to select influential data to induce a target capability.
## 🔗 Quick Links ## 🔗 Quick Links
- [Selecting Influential Data for](#selecting-influential-data-for) - [LESS: Selecting Influential Data for Targeted Instruction Tuning](#less-selecting-influential-data-for-targeted-instruction-tuning)
- [🔗 Quick Links](#-quick-links) - [🔗 Quick Links](#-quick-links)
- [Install Requirements](#install-requirements) - [Install Requirements](#install-requirements)
- [Data Preparation](#data-preparation) - [Data Preparation](#data-preparation)
...@@ -36,9 +37,7 @@ pip install -e . ...@@ -36,9 +37,7 @@ pip install -e .
## Data Preparation ## Data Preparation
We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare our instruction tuning datasets. You can also find a processed copy of files here [TODO]. We follow the [open-instruct](https://github.com/allenai/open-instruct?tab=readme-ov-file#dataset-preparation) repo to prepare our instruction tuning datasets. In our project, we utilize a combination of four training datasets: Flan v2, COT, Dolly, and Open Assistant. For the purposes of evaluation, we employ three additional datasets: MMLU, Tydiqa, and BBH. A processed version of these files will be made available [here] [TODO].
We also get the evaluation data ready in the same way.
## Data Selection Pipeline ## Data Selection Pipeline
......
## Evaluation
We mainly employ three evaluation datasets to assess the performance of our data selection pipeline: **MMLU**, **Tydiqa**, and **BBH**. We use the evaluation pipeline [open-instruct](https://github.com/allenai/open-instruct/tree/main/eval). To evaluate a trained model, please follow the steps below:
### Step 1: Install Open-Instruct
```bash
git clone https://github.com/allenai/open-instruct.git
cd open-instruct
pip install -e .
```
### Step 2: Evaluation
Please check out the `eval_mmlu.sh`, `eval_tydiqa.sh`, and `eval_bbh.sh` scripts in the `evaluation` directory. These scripts contain the necessary commands to evaluate the model on the respective datasets.
# Resolve the evaluation output directory for a model and store it in the
# global `save_dir` (callers read it after this returns).
# Globals:   n (read; fallback output root), save_dir (written)
# Arguments: $1 - model directory (or bare model name)
#            $2 - task name (e.g. bbh, mmlu, tydiqa)
# If $1 exists as a directory, results live under it; otherwise they go
# under $n/space10/out/<basename of $1>.
set_save_dir() {
  # `local` keeps mdir from leaking; callers only rely on save_dir.
  local mdir=$1
  if [[ -d $mdir ]]; then
    save_dir=${mdir}/eval/$2
  else
    # Quote basename's argument so model paths containing spaces work.
    save_dir=$n/space10/out/$(basename "$mdir")/eval/$2
  fi
}
# Resolve the validation output directory for a model and store it in the
# global `save_dir` (note: same global as set_save_dir, only the subdir
# differs: valid/ instead of eval/).
# Globals:   n (read; fallback output root), save_dir (written)
# Arguments: $1 - model directory (or bare model name)
#            $2 - task name (e.g. bbh, mmlu, tydiqa)
set_valid_dir() {
  # `local` keeps mdir from leaking; callers only rely on save_dir.
  local mdir=$1
  if [[ -d $mdir ]]; then
    save_dir=${mdir}/valid/$2
  else
    # Quote basename's argument so model paths containing spaces work.
    save_dir=$n/space10/out/$(basename "$mdir")/valid/$2
  fi
}
# Make the directory helpers visible to subshells. Plain `export name`
# exports a *variable* of that name, not the function — `-f` is required
# to export a shell function.
export -f set_save_dir
export -f set_valid_dir
source eval.sh
# main evaluation function
# Main BBH evaluation: runs open-instruct's eval.bbh.run_eval on a model.
# Globals:   n, DATA_DIR (read), save_dir (set via set_save_dir), mdir/type/cmd (written)
# Arguments: $1 - model directory; $2 - type tag (assigned but unused here;
#            kept for interface parity with the other eval_* helpers)
eval_bbh() {
  # Abort if the open-instruct checkout is missing; otherwise `eval` below
  # would silently run from the wrong directory.
  cd "$n/space10/open-instruct" || return 1
  mdir=$1
  type=$2
  set_save_dir "$mdir" bbh
  mkdir -p "$save_dir"
  cmd="python -m eval.bbh.run_eval \
    --data_dir $DATA_DIR/bbh \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 10 \
    --convert_to_bf16 \
    --max_num_examples_per_task 40"
  eval "$cmd"
}
# evaluate the validation set, which is not supported yet
# BBH evaluation on the validation split (flagged "not supported yet" in
# the original comment; --eval_valid is passed through to run_eval).
# Globals:   n, DATA_DIR (read), save_dir (set via set_valid_dir), mdir/type/cmd (written)
# Arguments: $1 - model directory; $2 - type tag (assigned but unused)
valid_bbh() {
  cd "$n/space10/open-instruct" || return 1
  mdir=$1
  type=$2
  set_valid_dir "$mdir" bbh
  echo "$save_dir"
  mkdir -p "$save_dir"
  cmd="python -m eval.bbh.run_eval \
    --data_dir $DATA_DIR/bbh-valid \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 10 \
    --convert_to_bf16 \
    --eval_valid \
    --max_num_examples_per_task 3"
  # Fix: the command was built but never executed; run it like valid_mmlu does.
  eval "$cmd"
}
# extract the results
# Print the BBH average exact-match score as a percentage, read from the
# metrics.json produced by the evaluation run.
# NOTE(review): this reads from the "bbh-nonchat" subdir while eval_bbh
# writes to "bbh" — confirm the mismatch is intentional.
extract_bbh() {
  mdir=$1
  set_save_dir "$mdir" bbh-nonchat
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_exact_match "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# extract the results for the validation set
# Print the BBH validation-set average exact-match score as a percentage.
# NOTE(review): reads "bbh-nonchat" while valid_bbh writes "bbh" — confirm
# the mismatch is intentional.
extract_valid_bbh() {
  mdir=$1
  set_valid_dir "$mdir" bbh-nonchat
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_exact_match "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# Pull in shared helpers from eval.sh (presumably set_save_dir/set_valid_dir
# live there in the original layout — TODO confirm).
source eval.sh
# main evaluation function
# Main MMLU evaluation: runs open-instruct's eval.mmlu.run_eval (5-shot).
# Globals:   DATA_DIR (read), save_dir (set via set_save_dir), mdir/cmd (written)
# Arguments: $1 - model directory
eval_mmlu() {
  mdir=$1
  set_save_dir "$mdir" mmlu
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.mmlu.run_eval \
    --ntrain 5 \
    --data_dir $DATA_DIR/mmlu \
    --save_dir $save_dir \
    --model_name_or_path $mdir \
    --tokenizer_name_or_path $mdir \
    --eval_batch_size 4 \
    --convert_to_bf16"
  eval "$cmd"
}
# evaluate the validation set, which is not supported yet
# MMLU evaluation on the validation split (flagged "not supported yet" in
# the original comment; --eval_valid is passed through to run_eval).
# Globals:   DATA_DIR (read), save_dir (set via set_valid_dir), mdir/type/cmd (written)
# Arguments: $1 - model directory; $2 - type tag (assigned but unused)
valid_mmlu() {
  mdir=$1
  type=$2
  set_valid_dir "$mdir" mmlu
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.mmlu.run_eval \
    --ntrain 5 \
    --eval_valid \
    --data_dir $DATA_DIR/mmlu \
    --save_dir $save_dir \
    --model_name_or_path $mdir \
    --tokenizer_name_or_path $mdir \
    --eval_batch_size 4 \
    --convert_to_bf16"
  eval "$cmd"
}
# extract the results
# Print the MMLU average accuracy as a percentage, read from the
# metrics.json produced by the evaluation run.
extract_mmlu() {
  mdir=$1
  set_save_dir "$mdir" mmlu
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_acc "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# extract the results for the validation set
# Print the MMLU validation-set average accuracy as a percentage.
extract_valid_mmlu() {
  mdir=$1
  set_valid_dir "$mdir" mmlu
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average_acc "$save_dir/metrics.json")
  result=$(echo "$result * 100" | bc)
  echo "$result"
}
# Pull in shared helpers from eval.sh (presumably set_save_dir/set_valid_dir
# live there in the original layout — TODO confirm).
source eval.sh
# main evaluation function
# Main TydiQA evaluation: runs open-instruct's eval.tydiqa.run_eval
# (1-shot, tulu chat format).
# Globals:   DATA_DIR (read), save_dir (set via set_save_dir), mdir/cmd (written)
# Arguments: $1 - model directory
eval_tydiqa() {
  mdir=$1
  set_save_dir "$mdir" tydiqa
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.tydiqa.run_eval \
    --data_dir $DATA_DIR/tydiqa/ \
    --n_shot 1 \
    --max_num_examples_per_lang 200 \
    --max_context_length 512 \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 20 \
    --use_chat_format \
    --convert_to_bf16 \
    --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format"
  # Fix: the command was built but never executed; run it like
  # eval_bbh/eval_mmlu do.
  eval "$cmd"
}
# evaluate the validation set, which is not supported yet
# TydiQA evaluation on the validation split (flagged "not supported yet"
# in the original comment; --eval_valid is passed through to run_eval).
# Globals:   DATA_DIR (read), save_dir (set via set_valid_dir), mdir/cmd (written)
# Arguments: $1 - model directory
valid_tydiqa() {
  mdir=$1
  set_valid_dir "$mdir" tydiqa
  # Quote save_dir so paths with spaces survive mkdir.
  mkdir -p "$save_dir"
  cmd="python -m eval.tydiqa.run_eval \
    --data_dir $DATA_DIR/tydiqa/one-shot-valid \
    --n_shot 0 \
    --eval_valid \
    --max_num_examples_per_lang 200 \
    --max_context_length 512 \
    --save_dir $save_dir \
    --model $mdir \
    --tokenizer $mdir \
    --eval_batch_size 20 \
    --use_chat_format \
    --convert_to_bf16 \
    --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format"
  # Fix: the command was built but never executed; run it like valid_mmlu does.
  eval "$cmd"
}
# extract the results
# Print the TydiQA average F1 score from metrics.json (note: F1 is
# reported as-is; no *100 scaling, unlike the BBH/MMLU extractors).
extract_tydiqa() {
  mdir=$1
  set_save_dir "$mdir" tydiqa
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average.f1 "$save_dir/metrics.json")
  echo "$result"
}
# extract the results for the validation set
# Print the TydiQA validation-set average F1 score from metrics.json.
extract_valid_tydiqa() {
  mdir=$1
  set_valid_dir "$mdir" tydiqa
  # Quote the path so save_dir values containing spaces don't word-split.
  result=$(jq .average.f1 "$save_dir/metrics.json")
  echo "$result"
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment