diff --git a/model/mixins.py b/model/mixins.py
index 78befb8b27637deb7bf7c98bdcfde408eb28b483..125f8b0e2c7ccda41c4125892637046a041f7186 100644
--- a/model/mixins.py
+++ b/model/mixins.py
@@ -26,7 +26,7 @@ class BaseMixin(torch.nn.Module):
 
 class PositionEmbeddingMixin(BaseMixin):
     def __init__(self, additional_sequence_length, hidden_size, 
-                init_method_std=0.02, reinit_slice=(-1024, None)
+                init_method_std=0.02, reinit_slice=slice(-1024, None)
         ):
         super(PositionEmbeddingMixin, self).__init__()
         self.reinit_slice = reinit_slice
diff --git a/training/deepspeed_training.py b/training/deepspeed_training.py
index 90190a6f95eb775808cc058b9f912d3a9b6555d2..094d49b25f28f857d25572157071802307b1b641 100644
--- a/training/deepspeed_training.py
+++ b/training/deepspeed_training.py
@@ -337,10 +337,10 @@ def train_step(data_iterator, model, optimizer, lr_scheduler,
         # and all reduce metrics by the way
         loss_checker = lm_loss.detach()
         for name in metrics:
-            metrics[name] = metrics[name].detach()
+            metrics[name] = metrics[name].detach().clone()
             torch.distributed.all_reduce(metrics[name].data)
             metrics[name].data /= args.world_size
-            loss_checker += metrics[name]
+            loss_checker = loss_checker + metrics[name]
         if loss_checker.isnan().any() or loss_checker.isinf().any():
             print('Skipping backward and optimizer step for nan or inf in forwarding metrics/loss!')
             return lm_loss.detach(), 1, metrics