diff --git a/utils/train_utils.py b/utils/train_utils.py
index 8a68f0c18b7e151d7c2b3b13de5627a87a8aa901..e41f503ea1ba7140f84b202a4ecd72c88ee0285b 100644
--- a/utils/train_utils.py
+++ b/utils/train_utils.py
@@ -199,7 +199,7 @@ def evaluation(model,train_config, eval_dataloader, local_rank, tokenizer):
                 if train_config.enable_fsdp:
                     batch[key] = batch[key].to(local_rank)
                 else:
-                    batch[key] = batch[key].to('cuda')
+                    batch[key] = batch[key].to('cuda:0')
             # Ensure no gradients are computed for this scope to save memory
             with torch.no_grad():
                 # Forward pass and compute loss