From a49a2c2804b36cbe336c89426af83dbac1dbdb4c Mon Sep 17 00:00:00 2001 From: Hamid Shojanazeri <hamid.nazeri2010@gmail.com> Date: Fri, 28 Jul 2023 03:51:34 +0000 Subject: [PATCH] adding PT cuda allocation expand flag --- utils/train_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/train_utils.py b/utils/train_utils.py index d03f62aa..27cd93f0 100644 --- a/utils/train_utils.py +++ b/utils/train_utils.py @@ -253,6 +253,7 @@ def setup_environ_flags(rank): os.environ["TORCH_SHOW_CPP_STACKTRACES"] = str(1) os.environ["NCCL_ASYNC_ERROR_HANDLING"] = str(1) os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL" + os.environ['PYTORCH_CUDA_ALLOC_CONF']='expandable_segments:True' if rank == 0: print(f"--> Running with torch dist debug set to detail") -- GitLab