From d9558c11caca545ff31d1c3d1aaee0f1f40fec2d Mon Sep 17 00:00:00 2001
From: Kai Wu <kaiwu@meta.com>
Date: Mon, 29 Apr 2024 11:40:18 -0700
Subject: [PATCH] changed context manager name and added more docs

---
 docs/multi_gpu.md                          | 13 +++++++++++--
 docs/single_gpu.md                         | 13 +++++++++++--
 recipes/finetuning/README.md               |  2 +-
 recipes/finetuning/multigpu_finetuning.md  |  9 ++++++---
 recipes/finetuning/singlegpu_finetuning.md | 10 ++++++++--
 requirements.txt                           |  1 +
 src/llama_recipes/utils/train_utils.py     | 17 ++++++++---------
 7 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md
index 954fae5c..fd1bf4cd 100644
--- a/docs/multi_gpu.md
+++ b/docs/multi_gpu.md
@@ -8,7 +8,7 @@ To run fine-tuning on multi-GPUs, we will  make use of two packages:
 
 Given the combination of PEFT and FSDP, we would be able to fine tune a Llama 2 model on multiple GPUs in one node or multi-node.
 
-## Requirements 
+## Requirements
 To run the examples, make sure to install the llama-recipes package and clone the github repository in order to use the provided [`finetuning.py`](../recipes/finetuning/finetuning.py) script with torchrun (See [README.md](../README.md) for details).
 
 **Please note that the llama_recipes package will install PyTorch 2.0.1 version, in case you want to run FSDP + PEFT, please make sure to install PyTorch nightlies.**
@@ -140,7 +140,10 @@ save_model: bool = False
 dist_checkpoint_root_folder: str="model_checkpoints"
 dist_checkpoint_folder: str="fine-tuned"
 save_optimizer: bool=False
-
+flop_counter: bool=False # Enable a FLOPS counter to measure model throughput; cannot be used together with the PyTorch profiler.
+flop_counter_start: int=3 # The step at which FLOPS counting starts; the default of 3 allows a 3-step warm-up before the counter starts.
+use_profiler: bool=False # Enable the PyTorch profiler; cannot be used together with the FLOPS counter.
+profiler_dir: str="PATH/to/save/profiler/results" # Output directory for profiler traces; only used when the profiler is enabled.
 ```
 
 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
@@ -167,3 +170,9 @@ save_optimizer: bool=False
 * `fsdp_activation_checkpointing` enables activation checkpointing for FSDP; this saves a significant amount of memory at the cost of recomputing intermediate activations during the backward pass. The saved memory can be re-invested in higher batch sizes to increase throughput. We recommend you use this option.
 
 * `pure_bf16` moves the model to `BFloat16` and, if `optimizer` is set to `anyprecision`, keeps the optimizer states in `BFloat16` as well. You can use this option if necessary.
+
+## FLOPS Counting and PyTorch Profiling
+
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single- or multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which FLOPS counting starts; it is recommended to allow a warm-up stage before using the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current config uses wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler is helpful for debugging purposes. However, `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
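+
+For example (a minimal sketch; the model path, PEFT settings, and output directory below are placeholders, and only `--flop_counter`/`--flop_counter_start` are specific to this feature), a FLOPS-counting run could be launched as:
+
+```bash
+torchrun --nnodes 1 --nproc_per_node 4 recipes/finetuning/finetuning.py --enable_fsdp --use_peft --peft_method lora --model_name /path_of_model_folder/7B --output_dir Path/to/save/PEFT/model --flop_counter --flop_counter_start 3
+```
+
+To capture profiler traces instead, replace the counter flags with `--use_profiler --profiler_dir Path/to/save/profiler/results` and, if you set `--max_train_step`, keep it greater than 6.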
diff --git a/docs/single_gpu.md b/docs/single_gpu.md
index 5b183094..fa35ca08 100644
--- a/docs/single_gpu.md
+++ b/docs/single_gpu.md
@@ -8,7 +8,7 @@ To run fine-tuning on a single GPU, we will  make use of two packages
 
 Given the combination of PEFT and Int8 quantization, we would be able to fine-tune a Llama 2 7B model on one consumer-grade GPU such as an A10.
 
-## Requirements 
+## Requirements
 To run the examples, make sure to install the llama-recipes package (See [README.md](../README.md) for details).
 
 **Please note that the llama-recipes package will install PyTorch 2.0.1 version, in case you want to run FSDP + PEFT, please make sure to install PyTorch nightlies.**
@@ -97,9 +97,18 @@ save_model: bool = False
 dist_checkpoint_root_folder: str="model_checkpoints"
 dist_checkpoint_folder: str="fine-tuned"
 save_optimizer: bool=False
-
+flop_counter: bool=False # Enable a FLOPS counter to measure model throughput; cannot be used together with the PyTorch profiler.
+flop_counter_start: int=3 # The step at which FLOPS counting starts; the default of 3 allows a 3-step warm-up before the counter starts.
+use_profiler: bool=False # Enable the PyTorch profiler; cannot be used together with the FLOPS counter.
+profiler_dir: str="PATH/to/save/profiler/results" # Output directory for profiler traces; only used when the profiler is enabled.
 ```
 
 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
 
 * [peft config file](../src/llama_recipes/configs/peft.py) provides the supported PEFT methods and respective settings that can be modified.
+
+## FLOPS Counting and PyTorch Profiling
+
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single- or multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which FLOPS counting starts; it is recommended to allow a warm-up stage before using the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current config uses wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler is helpful for debugging purposes. However, `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
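+
+For example (a minimal sketch; the launch command should match the single-GPU example earlier in this document, and the model path and output directory are placeholders), a FLOPS-counting run could look like:
+
+```bash
+python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /path_of_model_folder/7B --output_dir Path/to/save/PEFT/model --flop_counter --flop_counter_start 3
+```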
diff --git a/recipes/finetuning/README.md b/recipes/finetuning/README.md
index dd99a91c..50ce31f6 100644
--- a/recipes/finetuning/README.md
+++ b/recipes/finetuning/README.md
@@ -98,4 +98,4 @@ You'll be able to access a dedicated project or run link on [wandb.ai](https://w
 
 To help with benchmarking effort, we are adding the support for counting the FLOPS during the fine-tuning process. You can achieve this by setting `--flop_counter` when launching your single/multi GPU fine-tuning. Use `--flop_counter_start` to choose which step to count the FLOPS. It is recommended to allow a warm-up stage before using the FLOPS counter.
 
-Similarly, you can set `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture the profile traces of your model using [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). This would be helpful for debugging purposes. However, the `--flop_counter` and `--use_profiler` can not be used in the same time to ensure the measurement accuracy.
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current config uses wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler is helpful for debugging purposes. However, `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
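+
+For instance (a minimal sketch; paths are placeholders and the base command should match whichever fine-tuning example in this README you are following), a profiling run could be launched as:
+
+```bash
+python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /path_of_model_folder/7B --output_dir Path/to/save/PEFT/model --use_profiler --profiler_dir Path/to/save/profiler/results --max_train_step 10
+```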
diff --git a/recipes/finetuning/multigpu_finetuning.md b/recipes/finetuning/multigpu_finetuning.md
index f938ac71..2f5c55cd 100644
--- a/recipes/finetuning/multigpu_finetuning.md
+++ b/recipes/finetuning/multigpu_finetuning.md
@@ -9,7 +9,7 @@ We will also need 2 packages:
 1. [PEFT](https://github.com/huggingface/peft) to use parameter-efficient finetuning.
 2. [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html) which helps us parallelize the training over multiple GPUs. [More details](./LLM_finetuning_overview.md#2-full-partial-parameter-finetuning).
 
-> [!NOTE]  
+> [!NOTE]
 > The llama-recipes package will install PyTorch 2.0.1 version. In case you want to use FSDP with PEFT for multi GPU finetuning, please install the PyTorch nightlies ([details](../../README.md#pytorch-nightlies))
 >
 > INT8 quantization is not currently supported in FSDP
@@ -30,7 +30,7 @@ Get access to a machine with multiple GPUs (in this case we tested with 4 A100 a
 <details>
 <summary>Multi-node Multi-GPU</summary>
 Here we use a slurm script to schedule a job with slurm over multiple nodes.
-    
+
     # Change the num nodes and GPU per nodes in the script before running.
     sbatch ./multi_node.slurm
 
@@ -95,7 +95,7 @@ torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name
 
 
 ## [TIP] Slow interconnect between nodes?
-In case you are dealing with slower interconnect network between nodes, to reduce the communication overhead you can make use of `--hsdp` flag. 
+In case you are dealing with slower interconnect network between nodes, to reduce the communication overhead you can make use of `--hsdp` flag.
 
 HSDP (Hybrid sharding Data Parallel) helps to define a hybrid sharding strategy where you can have FSDP within `sharding_group_size` which can be the minimum number of GPUs you can fit your model and DDP between the replicas of the model specified by `replica_group_size`.
 
@@ -107,5 +107,8 @@ torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_f
 
 ```
 
+## FLOPS Counting and PyTorch Profiling
+
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single- or multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which FLOPS counting starts; it is recommended to allow a warm-up stage before using the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current config uses wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler is helpful for debugging purposes. However, `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
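+
+For example (a minimal sketch; the FSDP/PEFT settings and model path are placeholders based on the examples above), a profiling run on a single node could be launched as:
+
+```bash
+torchrun --nnodes 1 --nproc_per_node 4 finetuning.py --enable_fsdp --use_peft --peft_method lora --model_name /path_of_model_folder/7B --output_dir Path/to/save/PEFT/model --use_profiler --profiler_dir Path/to/save/profiler/results
+```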
diff --git a/recipes/finetuning/singlegpu_finetuning.md b/recipes/finetuning/singlegpu_finetuning.md
index 751f00de..5f7ffce0 100644
--- a/recipes/finetuning/singlegpu_finetuning.md
+++ b/recipes/finetuning/singlegpu_finetuning.md
@@ -24,10 +24,10 @@ The args used in the command above are:
 * `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`, `prefix`.
 * `--quantization` boolean flag to enable int8 quantization
 
-> [!NOTE]  
+> [!NOTE]
 > In case you are using a multi-GPU machine please make sure to only make one of them visible using `export CUDA_VISIBLE_DEVICES=GPU:id`.
 
- 
+
 ### How to run with different datasets?
 
 Currently 3 open source datasets are supported that can be found in [Datasets config file](../../src/llama_recipes/configs/datasets.py). You can also use your custom dataset (more info [here](./datasets/README.md)).
@@ -60,3 +60,9 @@ python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset
 python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /patht_of_model_folder/7B --output_dir Path/to/save/PEFT/model
 
 ```
+
+## FLOPS Counting and PyTorch Profiling
+
+To help with benchmarking efforts, we are adding support for counting FLOPS during the fine-tuning process. You can enable this by setting `--flop_counter` when launching your single- or multi-GPU fine-tuning. Use `--flop_counter_start` to choose the step at which FLOPS counting starts; it is recommended to allow a warm-up stage before using the FLOPS counter.
+
+Similarly, you can set the `--use_profiler` flag and pass a profiling output path using `--profiler_dir` to capture profile traces of your model with the [PyTorch profiler](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). To get accurate profiling results, the PyTorch profiler requires a warm-up stage; the current config uses wait=1, warmup=2, active=3, so the profiler starts recording after step 3 and records the next 3 steps. Therefore, to use the PyTorch profiler, `--max_train_step` must be greater than 6. The PyTorch profiler is helpful for debugging purposes. However, `--flop_counter` and `--use_profiler` cannot be used at the same time, to ensure measurement accuracy.
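+
+For example (a minimal sketch, run from this folder; the dataset, model path, and output directory are placeholders), a FLOPS-counting run could look like:
+
+```bash
+python finetuning.py --use_peft --peft_method lora --quantization --dataset samsum_dataset --model_name /path_of_model_folder/7B --output_dir Path/to/save/PEFT/model --flop_counter --flop_counter_start 3
+```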
diff --git a/requirements.txt b/requirements.txt
index 721cc252..df2c66fd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ gradio
 chardet
 openai
 typing-extensions==4.8.0
+tabulate
diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py
index 21365369..06b38918 100644
--- a/src/llama_recipes/utils/train_utils.py
+++ b/src/llama_recipes/utils/train_utils.py
@@ -31,7 +31,7 @@ def set_tokenizer_params(tokenizer: LlamaTokenizer):
     tokenizer.padding_side = "left"
 
 @contextlib.contextmanager
-def throughput_measure_context(cfg, local_rank=None):
+def profile(cfg, local_rank=None):
     use_profiler: bool = cfg.use_profiler
     use_flop_counter: bool = cfg.flop_counter
     if use_flop_counter and use_profiler:
@@ -41,7 +41,7 @@ def throughput_measure_context(cfg, local_rank=None):
         wait_step, warmup_step, active_step = 1, 2, 3
         min_step = wait_step + warmup_step + active_step + 1
         if cfg.max_train_step > 0 and cfg.max_train_step < min_step:
-            raise ValueError(f"pytorch profiler requires at least {min_step} train steps, please increase the max_train_step, current max_train_step {cfg.max_train_step}")
+            raise ValueError(f"pytorch profiler requires at least {min_step} train steps to finish the wait ({wait_step}), warmup ({warmup_step}) and active ({active_step}) stages; please increase max_train_step (current value: {cfg.max_train_step})")
         print(f"pytorch profiling is activated and results will be saved in {cfg.profiler_dir}")
         with torch.profiler.profile(
             activities=[
@@ -97,7 +97,6 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
 
     autocast = torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext
-
     train_prep = []
     train_loss = []
     val_prep = []
@@ -127,7 +126,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
             total_loss = 0.0
             total_length = len(train_dataloader)//gradient_accumulation_steps
             pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch+1}", total=total_length, dynamic_ncols=True)
-            with throughput_measure_context(train_config,local_rank) as measure_context:
+            with profile(train_config,local_rank) as profile_context:
                 for step, batch in enumerate(train_dataloader):
                     total_train_steps += 1
                     # stop when the maximum number of training steps is reached
@@ -138,7 +137,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                         break
                     if train_config.flop_counter and total_train_steps == train_config.flop_counter_start:
+                        print("starting FLOPS counting at step: ", total_train_steps)
-                        measure_context.start_counting()
+                        profile_context.start_counting()
                     for key in batch.keys():
                         if train_config.enable_fsdp:
                             if is_xpu_available():
@@ -185,10 +184,10 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                             optimizer.zero_grad()
                             pbar.update(1)
                     if train_config.use_profiler:
-                        measure_context.step()
-                    if train_config.flop_counter and measure_context.is_ready():
-                        TFlops = measure_context.get_total_flops() / 1e12
-                        measure_context.stop_counting()
+                        profile_context.step()
+                    if train_config.flop_counter and profile_context.is_ready():
+                        TFlops = profile_context.get_total_flops() / 1e12
+                        profile_context.stop_counting()
                     if wandb_run:
                         if not train_config.enable_fsdp or rank==0:
                             wandb_run.log({
-- 
GitLab