diff --git a/README.md b/README.md
index ff6adee358caf328feda4152ac041b6a5d264dca..90800bcd521cd071c3b2c56337a7031486a5e3fa 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ Here we make use of Parameter Efficient Methods (PEFT) as described in the next
 
 ### Multiple GPUs One Node:
 
-**NOTE** please make sure to use PyTorch Nightlies for using PEFT+FSDP .
+**NOTE** please make sure to use PyTorch Nightlies for using PEFT+FSDP. Also, note that int8 quantization from bitsandbytes is currently not supported in FSDP.
 
 ```bash
 
diff --git a/docs/mutli_gpu.md b/docs/mutli_gpu.md
index b0ca2e9f01653bbb342b3c5738c4565e58402714..5695ccf51f8a2ff8f2e7ed4a71fece9f6361c65f 100644
--- a/docs/mutli_gpu.md
+++ b/docs/mutli_gpu.md
@@ -26,6 +26,8 @@ This runs with the `samsum_dataset` for summarization application by default.
 
 **Multiple GPUs one node**:
 
+**NOTE** please make sure to use PyTorch Nightlies for using PEFT+FSDP. Also, note that int8 quantization from bitsandbytes is currently not supported in FSDP.
+
 ```bash
 
 torchrun --nnodes 1 --nproc_per_node 4  ../llama_finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
diff --git a/requirements.txt b/requirements.txt
index cba786e075b2e1f5d0643451c31d648ee8528f8c..9258c3e4e75e7714034a4fd177cad7e4615dffef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ black[jupyter]
 datasets
 fire
 git+https://github.com/huggingface/peft.git
-transformers
+transformers>=4.31.0
 sentencepiece
 py7zr
 scipy