From 52077b84761ce8a4c227e534459204db4b559a0c Mon Sep 17 00:00:00 2001 From: Ming Ding <dm_thu@qq.com> Date: Thu, 21 Oct 2021 07:09:43 +0000 Subject: [PATCH] fixed multiple save bug --- training/model_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/training/model_io.py b/training/model_io.py index df6b752..20b53b2 100644 --- a/training/model_io.py +++ b/training/model_io.py @@ -35,7 +35,9 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler, args): """Save a model checkpoint.""" if args.deepspeed: - save_ds_checkpoint(iteration, model, lr_scheduler, args) + if mpu.get_data_parallel_rank() == 0: + print('Saving Model...') + save_ds_checkpoint(iteration, model, lr_scheduler, args) else: raise ValueError("training without deepspeed is not supported.") # Wait so everyone is done (necessary) @@ -70,8 +72,6 @@ def save_ds_checkpoint_no_optim(model, save_dir, tag=None, client_state={}, save os.makedirs(save_dir, exist_ok=True) # Ensure tag is a string tag = str(tag) - # Ensure checkpoint tag is consistent across ranks - model._checkpoint_tag_validation(tag) # Real save via deepspeed model._create_checkpoint_file(save_dir, tag, False) model._save_checkpoint(save_dir, tag, client_state=client_state) -- GitLab