From adf4133f084cb9e25ff8406911801c71068467a6 Mon Sep 17 00:00:00 2001
From: duzx16 <zx-du20@mails.tsinghua.edu.cn>
Date: Fri, 10 Dec 2021 11:24:21 +0800
Subject: [PATCH] Update script for backward test

---
 examples/t5/config/config_t5_large.json | 34 +++++++++++++++++++++++
 examples/t5/inference_t5.py             | 36 ++++++++++++++++---------
 examples/t5/scripts/generate_t5.sh      |  3 ++-
 examples/t5/test_t5.py                  | 11 +++++---
 4 files changed, 66 insertions(+), 18 deletions(-)
 create mode 100644 examples/t5/config/config_t5_large.json

diff --git a/examples/t5/config/config_t5_large.json b/examples/t5/config/config_t5_large.json
new file mode 100644
index 0000000..25d7bf7
--- /dev/null
+++ b/examples/t5/config/config_t5_large.json
@@ -0,0 +1,34 @@
+{
+  "train_micro_batch_size_per_gpu": 16,
+  "gradient_accumulation_steps": 1,
+  "steps_per_print": 100,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": 2,
+    "contiguous_gradients": false,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 50000000,
+    "allgather_bucket_size": 500000000
+  },
+  "bfloat16": {
+    "enabled": true
+  },
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0002,
+      "weight_decay": 0.1,
+      "betas": [
+        0.9,
+        0.98
+      ],
+      "eps": 1e-6
+    }
+  },
+  "activation_checkpointing": {
+    "partition_activations": false,
+    "contiguous_memory_optimization": false
+  },
+  "wall_clock_breakdown": false
+}
\ No newline at end of file
diff --git a/examples/t5/inference_t5.py b/examples/t5/inference_t5.py
index c2af4e7..38a008a 100644
--- a/examples/t5/inference_t5.py
+++ b/examples/t5/inference_t5.py
@@ -26,6 +26,7 @@ from SwissArmyTransformer.model.mixins import CachedAutoregressiveMixin
 from SwissArmyTransformer.generation.autoregressive_sampling import filling_sequence, evaluate_perplexity
 from SwissArmyTransformer.generation.sampling_strategies import BeamSearchStrategy, BaseStrategy
 from SwissArmyTransformer.generation.utils import timed_name, generate_continually
+from SwissArmyTransformer.training.deepspeed_training import setup_model_and_optimizer
 
 
 def get_masks_and_position_ids_glm(seq, mask_position, context_length):
@@ -49,28 +50,37 @@ def main(args):
     args.do_train = False
     initialize_distributed(args)
     tokenizer = get_tokenizer(args)
-    # build model
-    model = T5Model(args)
-    if args.fp16:
-        model = model.half()
-    model = model.to(args.device)
     # load_checkpoint(model, args)
     set_random_seed(args.seed)
-    missing_keys, unexpected_keys = model.load_state_dict(
+
+    # Model, optimizer, and learning rate.
+    model_cls = T5Model
+    model, optimizer = setup_model_and_optimizer(args, model_cls=model_cls)
+
+    missing_keys, unexpected_keys = model.module.load_state_dict(
         torch.load("/dataset/fd5061f6/yanan/huggingface_models/t5-large/model_states.pt")["module"])
-    from SwissArmyTransformer.model.encoder_decoder_model import EncoderFinalMixin
+    optimizer.refresh_fp32_params()
     model.eval()
     input_ids = tokenizer.EncodeAsIds("The <extra_id_0> walks in <extra_id_1> park").tokenization
     input_ids = input_ids + [tokenizer.get_command("eos").Id]
-    input_ids = torch.cuda.LongTensor([input_ids])
-    # input_ids = torch.cuda.LongTensor([[37, 32099, 10681, 16, 32098, 2447, 1]])
+    input_ids = torch.LongTensor([input_ids])
     decoder_input_ids = tokenizer.EncodeAsIds('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>').tokenization
     decoder_input_ids = decoder_input_ids + [tokenizer.get_command("eos").Id]
-    decoder_input_ids = torch.cuda.LongTensor([decoder_input_ids])
-    # decoder_input_ids = torch.cuda.LongTensor([[32099, 5295, 1782, 32098, 8, 32097, 1]])
+    decoder_input_ids = torch.LongTensor([decoder_input_ids])
+    data = {'text': input_ids, 'loss_mask': input_ids.new_ones(input_ids.shape), 'target': decoder_input_ids,
+            'attention_mask': input_ids.new_ones(input_ids.shape)}
+    tokens, decoder_tokens, labels, loss_mask, attention_mask = get_batch(data, args)
+    encoder_outputs, logits, *_ = model(enc_input_ids=tokens, dec_input_ids=decoder_tokens,
+                                        enc_attention_mask=attention_mask)
+    losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), labels)
+    loss_mask = loss_mask.view(-1)
+    loss = torch.sum(losses.view(-1) * loss_mask)
+    if loss_mask.sum().item() > 0:
+        loss = loss / loss_mask.sum()
+    loss.backward()
+
     breakpoint()
-    output = model(enc_input_ids=input_ids, dec_input_ids=decoder_input_ids)
-    print(output)
+
     end_tokens = [tokenizer.get_command('eop').Id, tokenizer.get_command('eos').Id]
     # define function for each query
     if args.sampling_strategy == 'BaseStrategy':
diff --git a/examples/t5/scripts/generate_t5.sh b/examples/t5/scripts/generate_t5.sh
index c01f772..c4bc602 100644
--- a/examples/t5/scripts/generate_t5.sh
+++ b/examples/t5/scripts/generate_t5.sh
@@ -15,10 +15,11 @@ TOPP=0
 script_path=$(realpath $0)
 script_dir=$(dirname $script_path)
 
-config_json="$script_dir/ds_config.json"
+config_json="$script_dir/../config/config_t5_large.json"
 
 python -m torch.distributed.launch --nproc_per_node=$MPSIZE --master_port $MASTER_PORT inference_t5.py \
        --deepspeed \
+       --deepspeed-config ${config_json} \
        --mode inference \
        --model-parallel-size $MPSIZE \
        $MODEL_ARGS \
diff --git a/examples/t5/test_t5.py b/examples/t5/test_t5.py
index 805f228..692ed91 100644
--- a/examples/t5/test_t5.py
+++ b/examples/t5/test_t5.py
@@ -1,7 +1,10 @@
 from transformers import T5Model, T5ForConditionalGeneration, T5Tokenizer
 tokenizer = T5Tokenizer.from_pretrained("t5-large")
-model = T5ForConditionalGeneration.from_pretrained("/dataset/fd5061f6/yanan/huggingface_models/t5-large")
-input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-decoder_input_ids = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-output = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+model = T5Model.from_pretrained("/dataset/fd5061f6/yanan/huggingface_models/t5-large")
+model = model.to('cuda')
+model.eval()
+input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids.to('cuda')
+decoder_input_ids = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids.to('cuda')
+output = model(input_ids=input_ids, labels=decoder_input_ids)
+output.loss.backward()
 breakpoint()
\ No newline at end of file
-- 
GitLab