Commit e9ae3132 authored by Zhengxiao Du's avatar Zhengxiao Du

Fix attention mask

parent 7c35e218
@@ -52,9 +52,8 @@ def standard_attention(query_layer, key_layer, value_layer, attention_mask,
     if log_attention_weights is not None:
         attention_scores += log_attention_weights
-    # if attention_mask.shape[-2] > 1: # if auto-regressive, skip
-    #     attention_scores = torch.mul(attention_scores, attention_mask) - \
-    #         10000.0 * (1.0 - attention_mask)
+    attention_scores = torch.mul(attention_scores, attention_mask) - \
+        10000.0 * (1.0 - attention_mask)
     attention_probs = F.softmax(attention_scores, dim=-1)
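The change above uncomments the masking step so the 0/1 attention mask is applied unconditionally: masked positions receive a large negative score (-10000) and so get near-zero probability after softmax. A minimal runnable sketch of the patched function is below; it assumes scaled dot-product attention with a float 0/1 mask broadcastable to the score shape, and omits the surrounding model code (dropout, head reshaping) that the real repository likely contains.

```python
import torch
import torch.nn.functional as F

def standard_attention(query_layer, key_layer, value_layer, attention_mask,
                       log_attention_weights=None):
    # Scaled dot-product attention scores: (batch, heads, q_len, k_len).
    d_k = query_layer.shape[-1]
    attention_scores = torch.matmul(
        query_layer, key_layer.transpose(-1, -2)) / d_k ** 0.5
    if log_attention_weights is not None:
        attention_scores += log_attention_weights
    # The fix: always apply the 0/1 mask. Kept positions keep their score;
    # masked positions are pushed to -10000, so softmax assigns them ~0.
    attention_scores = torch.mul(attention_scores, attention_mask) - \
        10000.0 * (1.0 - attention_mask)
    attention_probs = F.softmax(attention_scores, dim=-1)
    return torch.matmul(attention_probs, value_layer)
```

For example, with a causal mask on two positions, position 0 can only attend to itself, so its output equals the first value row; position 1 attends to both positions equally when all scores are zero.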