diff --git a/training/model_io.py b/training/model_io.py index df6b7524f6ac8334243af401f4147c9a34dfffae..20b53b2a1ac3246ff17f2b14dcbbce0e387b69eb 100644 --- a/training/model_io.py +++ b/training/model_io.py @@ -35,7 +35,9 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler, args): """Save a model checkpoint.""" if args.deepspeed: - save_ds_checkpoint(iteration, model, lr_scheduler, args) + if mpu.get_data_parallel_rank() == 0: + print('Saving Model...') + save_ds_checkpoint(iteration, model, lr_scheduler, args) else: raise ValueError("training without deepspeed is not supported.") # Wait so everyone is done (necessary) @@ -70,8 +72,6 @@ def save_ds_checkpoint_no_optim(model, save_dir, tag=None, client_state={}, save os.makedirs(save_dir, exist_ok=True) # Ensure tag is a string tag = str(tag) - # Ensure checkpoint tag is consistent across ranks - model._checkpoint_tag_validation(tag) # Real save via deepspeed model._create_checkpoint_file(save_dir, tag, False) model._save_checkpoint(save_dir, tag, client_state=client_state)