From d9558c11caca545ff31d1c3d1aaee0f1f40fec2d Mon Sep 17 00:00:00 2001
From: Kai Wu <kaiwu@meta.com>
Date: Mon, 29 Apr 2024 11:40:18 -0700
Subject: [PATCH] changed context name and add more docs

---
 docs/multi_gpu.md                          | 13 +++++++++++--
 docs/single_gpu.md                         | 13 +++++++++++--
 recipes/finetuning/README.md               |  2 +-
 recipes/finetuning/multigpu_finetuning.md  |  9 ++++++---
 recipes/finetuning/singlegpu_finetuning.md | 10 ++++++++--
 requirements.txt                           |  1 +
 src/llama_recipes/utils/train_utils.py     | 17 ++++++++---------
 7 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md
index 954fae5c..fd1bf4cd 100644
--- a/docs/multi_gpu.md
+++ b/docs/multi_gpu.md
@@ -8,7 +8,7 @@ To run fine-tuning on multi-GPUs, we will make use of two packages:
 
 Given the combination of PEFT and FSDP, we would be able to fine-tune a Llama 2 model on multiple GPUs in one node or multi-node.
 
-## Requirements 
+## Requirements
 To run the examples, make sure to install the llama-recipes package and clone the github repository in order to use the provided [`finetuning.py`](../recipes/finetuning/finetuning.py) script with torchrun (See [README.md](../README.md) for details).
 
 **Please note that the llama_recipes package will install PyTorch 2.0.1; in case you want to run FSDP + PEFT, please make sure to install the PyTorch nightlies.**
@@ -140,7 +140,10 @@ save_model: bool = False
 dist_checkpoint_root_folder: str="model_checkpoints"
 dist_checkpoint_folder: str="fine-tuned"
 save_optimizer: bool=False
-
+flop_counter: bool=False # Enables the FLOPS counter to measure model throughput; cannot be used together with the PyTorch profiler.
+flop_counter_start: int=3 # The step at which to start counting FLOPS; the default of 3 allows a 3-step warm-up before counting begins.
+use_profiler: bool=False # Enables the PyTorch profiler; cannot be used together with the FLOPS counter.
+profiler_dir: str="PATH/to/save/profiler/results" # will be used if use_profiler is enabled
 ```
 
 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
@@ -167,3 +170,9 @@ save_optimizer: bool=False
 * `fsdp_activation_checkpointing` enables activation checkpointing for FSDP; this saves a significant amount of memory with the trade-off of recomputing intermediate activations during the backward pass. The saved memory can be re-invested in higher batch sizes to increase the throughput. We recommend you use this option.
 
 * `pure_bf16` it moves the model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
+
+## FLOPS Counting and PyTorch Profiling
+
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single/multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which to start counting FLOPS. It is recommended to allow a warm-up stage before starting the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current configuration is wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler can be helpful for debugging. Note, however, that `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
diff --git a/docs/single_gpu.md b/docs/single_gpu.md
index 5b183094..fa35ca08 100644
--- a/docs/single_gpu.md
+++ b/docs/single_gpu.md
@@ -8,7 +8,7 @@ To run fine-tuning on a single GPU, we will make use of two packages
 
 Given the combination of PEFT and INT8 quantization, we would be able to fine-tune a Llama 2 7B model on one consumer-grade GPU such as an A10.
 
-## Requirements 
+## Requirements
 To run the examples, make sure to install the llama-recipes package (See [README.md](../README.md) for details).
 
 **Please note that the llama-recipes package will install PyTorch 2.0.1; in case you want to run FSDP + PEFT, please make sure to install the PyTorch nightlies.**
@@ -97,9 +97,18 @@ save_model: bool = False
 dist_checkpoint_root_folder: str="model_checkpoints"
 dist_checkpoint_folder: str="fine-tuned"
 save_optimizer: bool=False
-
+flop_counter: bool=False # Enables the FLOPS counter to measure model throughput; cannot be used together with the PyTorch profiler.
+flop_counter_start: int=3 # The step at which to start counting FLOPS; the default of 3 allows a 3-step warm-up before counting begins.
+use_profiler: bool=False # Enables the PyTorch profiler; cannot be used together with the FLOPS counter.
+profiler_dir: str="PATH/to/save/profiler/results" # will be used if use_profiler is enabled
 ```
 
 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
 
 * [peft config file](../src/llama_recipes/configs/peft.py) provides the supported PEFT methods and respective settings that can be modified.
+
+## FLOPS Counting and PyTorch Profiling
+
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single/multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which to start counting FLOPS. It is recommended to allow a warm-up stage before starting the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current configuration is wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler can be helpful for debugging. Note, however, that `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
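For reference, the new flags compose with the launch commands already shown in these two docs roughly as follows. This is a sketch only: the model path, output directory, and step values are placeholders.

```bash
# Single-GPU run with the FLOPS counter; counting starts after a 3-step warm-up.
python -m finetuning.py --use_peft --peft_method lora --quantization \
    --model_name /path_of_model_folder/7B --output_dir Path/to/save/PEFT/model \
    --flop_counter --flop_counter_start 3

# Multi-GPU run with the PyTorch profiler; with wait=1, warmup=2, active=3,
# --max_train_step must be greater than 6 to capture a full trace.
torchrun --nnodes 1 --nproc_per_node 4 finetuning.py --enable_fsdp \
    --model_name /path_of_model_folder/7B --use_peft --peft_method lora \
    --use_profiler --profiler_dir Path/to/save/profiler/results --max_train_step 10
```

Only one of `--flop_counter` and `--use_profiler` may be set per run; the `profile()` context manager in `train_utils.py` checks for this combination explicitly.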
diff --git a/recipes/finetuning/README.md b/recipes/finetuning/README.md
index dd99a91c..50ce31f6 100644
--- a/recipes/finetuning/README.md
+++ b/recipes/finetuning/README.md
@@ -98,4 +98,4 @@ You'll be able to access a dedicated project or run link on [wandb.ai](https://w
 
 To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single/multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which to start counting FLOPS. It is recommended to allow a warm-up stage before starting the FLOPS counter.
 
-Similarly, you can set `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model using [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). This would be helpful for debugging purposes. However, the `--flop_counter` and `--use_profiler` can not be used in the same time to ensure the measurement accuracy.
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current configuration is wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler can be helpful for debugging. Note, however, that `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
diff --git a/recipes/finetuning/multigpu_finetuning.md b/recipes/finetuning/multigpu_finetuning.md
index f938ac71..2f5c55cd 100644
--- a/recipes/finetuning/multigpu_finetuning.md
+++ b/recipes/finetuning/multigpu_finetuning.md
@@ -9,7 +9,7 @@ We will also need 2 packages:
 1. [PEFT](https://github.com/huggingface/peft) to use parameter-efficient finetuning.
 2. [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html) which helps us parallelize the training over multiple GPUs. [More details](./LLM_finetuning_overview.md#2-full-partial-parameter-finetuning).
 
-> [!NOTE] 
+> [!NOTE]
 > The llama-recipes package will install PyTorch 2.0.1. In case you want to use FSDP with PEFT for multi-GPU finetuning, please install the PyTorch nightlies ([details](../../README.md#pytorch-nightlies))
 >
 > INT8 quantization is not currently supported in FSDP
@@ -30,7 +30,7 @@ Get access to a machine with multiple GPUs (in this case we tested with 4 A100 a
 <details>
 <summary>Multi-node Multi-GPU</summary>
 Here we use a slurm script to schedule a job with slurm over multiple nodes.
-    
+
     # Change the num nodes and GPUs per node in the script before running.
     sbatch ./multi_node.slurm
 
@@ -95,7 +95,7 @@ torchrun --nnodes 1 --nproc_per_node 4 finetuning.py --enable_fsdp --model_name
 
 ## [TIP] Slow interconnect between nodes?
-In case you are dealing with slower interconnect network between nodes, to reduce the communication overhead you can make use of `--hsdp` flag. 
+In case you are dealing with a slower interconnect network between nodes, you can make use of the `--hsdp` flag to reduce the communication overhead.
 HSDP (Hybrid Sharded Data Parallel) helps to define a hybrid sharding strategy where you can have FSDP within `sharding_group_size`, which can be the minimum number of GPUs on which you can fit your model, and DDP between the replicas of the model specified by `replica_group_size`.
@@ -107,5 +107,8 @@ torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_f
 ```
 
+## FLOPS Counting and PyTorch Profiling
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single/multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which to start counting FLOPS. It is recommended to allow a warm-up stage before starting the FLOPS counter.
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current configuration is wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler can be helpful for debugging. Note, however, that `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
diff --git a/recipes/finetuning/singlegpu_finetuning.md b/recipes/finetuning/singlegpu_finetuning.md
index 751f00de..5f7ffce0 100644
--- a/recipes/finetuning/singlegpu_finetuning.md
+++ b/recipes/finetuning/singlegpu_finetuning.md
@@ -24,10 +24,10 @@ The args used in the command above are:
 * `--peft_method` to specify the PEFT method; here we use `lora`, other options are `llama_adapter`, `prefix`.
 * `--quantization` boolean flag to enable int8 quantization
 
-> [!NOTE] 
+> [!NOTE]
 > In case you are using a multi-GPU machine please make sure to only make one of them visible using `export CUDA_VISIBLE_DEVICES=GPU:id`.
 
-    
+
 ### How to run with different datasets?
 
 Currently 3 open source datasets are supported that can be found in [Datasets config file](../../src/llama_recipes/configs/datasets.py). You can also use your custom dataset (more info [here](./datasets/README.md)).
@@ -60,3 +60,9 @@ python -m finetuning.py --use_peft --peft_method lora --quantization --dataset 
 python -m finetuning.py --use_peft --peft_method lora --quantization --dataset samsum_dataset --model_name /path_of_model_folder/7B --output_dir Path/to/save/PEFT/model
 ```
+
+## FLOPS Counting and PyTorch Profiling
+
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single/multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which to start counting FLOPS. It is recommended to allow a warm-up stage before starting the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current configuration is wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler can be helpful for debugging. Note, however, that `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
diff --git a/requirements.txt b/requirements.txt
index 721cc252..df2c66fd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ gradio
 chardet
 openai
 typing-extensions==4.8.0
+tabulate
diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py
index 21365369..06b38918 100644
--- a/src/llama_recipes/utils/train_utils.py
+++ b/src/llama_recipes/utils/train_utils.py
@@ -31,7 +31,7 @@ def set_tokenizer_params(tokenizer: LlamaTokenizer):
     tokenizer.padding_side = "left"
 
 @contextlib.contextmanager
-def throughput_measure_context(cfg, local_rank=None):
+def profile(cfg, local_rank=None):
     use_profiler: bool = cfg.use_profiler
     use_flop_counter: bool = cfg.flop_counter
     if use_flop_counter and use_profiler:
@@ -41,7 +41,7 @@ def throughput_measure_context(cfg, local_rank=None):
         wait_step, warmup_step, active_step = 1, 2, 3
         min_step = wait_step + warmup_step + active_step + 1
         if cfg.max_train_step > 0 and cfg.max_train_step < min_step:
-            raise ValueError(f"pytorch profiler requires at least {min_step} train steps, please increase the max_train_step, current max_train_step {cfg.max_train_step}")
+            raise ValueError(f"pytorch profiler requires at least {min_step} train steps to finish the warm-up and recording stages ({wait_step} wait, {warmup_step} warmup, {active_step} active steps); please increase max_train_step, current max_train_step: {cfg.max_train_step}")
         print(f"pytorch profiling is activated and results will be saved in {cfg.profiler_dir}")
         with torch.profiler.profile(
             activities=[
@@ -97,7 +97,6 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     autocast = torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext
 
-
     train_prep = []
     train_loss = []
     val_prep = []
@@ -127,7 +126,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
             total_loss = 0.0
            total_length = len(train_dataloader)//gradient_accumulation_steps
             pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch+1}", total=total_length, dynamic_ncols=True)
-            with throughput_measure_context(train_config,local_rank) as measure_context:
+            with profile(train_config, local_rank) as profile_context:
                 for step, batch in enumerate(train_dataloader):
                     total_train_steps += 1
                     # stop when the maximum number of training steps is reached
@@ -138,7 +137,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                         break
                     if train_config.flop_counter and total_train_steps == train_config.flop_counter_start:
                         print("start flop counting at the step: ", total_train_steps)
-                        measure_context.start_counting()
+                        profile_context.start_counting()
                     for key in batch.keys():
                         if train_config.enable_fsdp:
                             if is_xpu_available():
@@ -185,10 +184,10 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                             optimizer.zero_grad()
                             pbar.update(1)
                     if train_config.use_profiler:
-                        measure_context.step()
-                    if train_config.flop_counter and measure_context.is_ready():
-                        TFlops = measure_context.get_total_flops() / 1e12
-                        measure_context.stop_counting()
+                        profile_context.step()
+                    if train_config.flop_counter and profile_context.is_ready():
+                        TFlops = profile_context.get_total_flops() / 1e12
+                        profile_context.stop_counting()
                     if wandb_run:
                         if not train_config.enable_fsdp or rank==0:
                             wandb_run.log({
-- 
GitLab
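Once a profiling run finishes, the traces written under `--profiler_dir` can be inspected. A minimal sketch, assuming the profiler emits TensorBoard-compatible trace files as in the PyTorch profiler tutorial linked in the docs (the output path is the placeholder used throughout this patch):

```bash
# torch_tb_profiler is the TensorBoard plugin from the linked tutorial that
# renders PyTorch profiler traces; point TensorBoard at the profiler_dir path.
pip install torch_tb_profiler
tensorboard --logdir Path/to/save/profiler/results
```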