From a49a2c2804b36cbe336c89426af83dbac1dbdb4c Mon Sep 17 00:00:00 2001
From: Hamid Shojanazeri <hamid.nazeri2010@gmail.com>
Date: Fri, 28 Jul 2023 03:51:34 +0000
Subject: [PATCH] Add PyTorch CUDA allocator expandable_segments flag

---
 utils/train_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/train_utils.py b/utils/train_utils.py
index d03f62aa..27cd93f0 100644
--- a/utils/train_utils.py
+++ b/utils/train_utils.py
@@ -253,6 +253,7 @@ def setup_environ_flags(rank):
     os.environ["TORCH_SHOW_CPP_STACKTRACES"] = str(1)
     os.environ["NCCL_ASYNC_ERROR_HANDLING"] = str(1)
     os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
+    os.environ['PYTORCH_CUDA_ALLOC_CONF']='expandable_segments:True'
     if rank == 0:
         print(f"--> Running with torch dist debug set to detail")
 
-- 
GitLab