Commit e3f4751f authored by Ming Ding

del some unused things

parent b32472a9
Showing 7 additions and 827 deletions
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sample Generate GPT2"""
import os
import stat
import random
import numpy as np
import torch
import torch.nn.functional as F
import argparse
import time
from datetime import datetime
from arguments import get_args
from utils import Timers
from pretrain_gpt2 import initialize_distributed
from pretrain_gpt2 import set_random_seed
from utils import load_checkpoint, get_checkpoint_iteration
from data_utils import get_tokenizer
import mpu
import deepspeed
from model import GPT2Model
from utils import print_rank_0
from pretrain_gpt2 import get_model
import math
from copy import deepcopy
from tqdm import tqdm
from generation import get_batch, filling_sequence, add_interlacing_beam_marks, magnify, inverse_prompt_score, filling_sequence_local, filling_sequence_cuda_2d
from torchvision.utils import save_image
import torch.distributed as dist
def setup_model(args):
    """Set up the model and load a checkpoint."""
    model = get_model(args)

    if args.load is not None:
        if args.deepspeed:
            iteration, release, success = get_checkpoint_iteration(args)
            path = os.path.join(args.load, str(iteration), "mp_rank_00_model_states.pt")
            print('current device:', torch.cuda.current_device())
            checkpoint = torch.load(path, map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint["module"])
            print(f"Load model file {path}")
        else:
            _ = load_checkpoint(
                model, None, None, args, load_optimizer_states=False)

    return model
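# Checkpoint layout used above: with --deepspeed, weights are read from
# <load>/<iteration>/mp_rank_00_model_states.pt, and the model state dict is
# stored under the "module" key of that file.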
def _parse_and_to_tensor(text, img_size=256, query_template='{}'):
    tokenizer = get_tokenizer()
    text = query_template.format(*text.split('\t'))
    seq = tokenizer.parse_query(text, img_size=img_size)
    seq = torch.cuda.LongTensor(seq)
    return seq
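# Input format note (inferred from the format(*text.split('\t')) call above):
# query_template fills its '{}' slots from the tab-separated fields of `text`,
# so one-slot templates (text2image) take a bare caption, while two-slot
# templates (e.g. super-resolution) presumably expect 'caption<TAB>image_path'.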
def get_context(args, query_template='{}'):
    tokenizer = get_tokenizer()
    terminate_runs = 0
    img_size = 256 if args.generation_task != 'low-level super-resolution' else 128
    ml = max(args.max_position_embeddings, args.max_position_embeddings_finetune)
    output_path = args.output_path

    if args.input_source == 'interactive':
        assert not args.with_id, '--with-id is only used with file inputs.'
        if args.generation_task == 'post-selection':
            raise ValueError('post-selection only takes file inputs!')
        while True:
            raw_text = input("\nPlease input a query ('stop' to exit) >>> ")
            if not raw_text:
                print('Query should not be empty!')
                continue
            if raw_text == "stop":
                return
            try:
                seq = _parse_and_to_tensor(raw_text, img_size=img_size, query_template=query_template)
            except (ValueError, FileNotFoundError) as e:
                print(e)
                continue
            if len(seq) > ml:
                print("\nSeq length", len(seq),
                      f"\nPlease give a context shorter than {ml}!")
                continue
            yield (raw_text, seq, output_path)
    else:
        with open(args.input_source, 'r') as fin:
            inputs = fin.readlines()
        for line_no, raw_text in enumerate(inputs):
            # Round-robin sharding of input lines across distributed ranks.
            if line_no % dist.get_world_size() != dist.get_rank():
                continue
            rk = dist.get_rank()
            print(f'Working on No. {line_no} on rank {rk}... ')
            raw_text = raw_text.strip()
            if len(raw_text) == 0:
                continue
            if args.with_id:  # the first tab-separated field is an output id
                parts = raw_text.split('\t')
                output_path = os.path.join(args.output_path, parts[0])
                raw_text = '\t'.join(parts[1:])
            if args.generation_task == 'post-selection':
                # parts[0] is the text; parts[1:] are candidate images.
                parts = raw_text.split('\t')
                seqs = []
                for part in parts[1:]:
                    try:
                        seq_single = _parse_and_to_tensor('\t'.join([part, parts[0]]), img_size=img_size, query_template=query_template)
                        seqs.append(seq_single)
                    except (ValueError, FileNotFoundError) as e:
                        print(e)
                        continue
                seq = torch.stack(seqs)
            else:
                try:
                    seq = _parse_and_to_tensor(raw_text, img_size=img_size, query_template=query_template)
                except (ValueError, FileNotFoundError) as e:
                    print(e)
                    continue
                if len(seq) > ml:
                    print("\nSeq length", len(seq),
                          f"\nPlease give a context shorter than {ml}!")
                    continue
            yield (raw_text, seq, output_path)
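# Both branches yield (raw_text, seq, output_path) triples; for post-selection,
# seq is a stacked batch with one row per candidate image.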
def generate_images_once(model, args, raw_text, seq=None, num=8, query_template='{}', output_path='./samples'):
    tokenizer = get_tokenizer()
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    if seq is None:  # need to parse the raw text first
        img_size = 256 if args.generation_task != 'low-level super-resolution' else 128
        seq = _parse_and_to_tensor(raw_text, img_size=img_size, query_template=query_template)
    model.eval()
    with torch.no_grad():
        print('show raw text:', raw_text)
        start_time = time.time()
        if args.generation_task in ['text2image', 'low-level super-resolution']:
            invalid_slices = [slice(tokenizer.img_tokenizer.num_tokens, None)]
        elif args.generation_task == 'image2text':
            invalid_slices = [slice(0, tokenizer.img_tokenizer.num_tokens)]
        else:
            raise NotImplementedError

        mbz = args.max_inference_batch_size
        add_interlacing_beam_marks(seq, nb=min(num, mbz))
        assert num < mbz or num % mbz == 0
        output_tokens_list = []
        # Generate in chunks of at most max_inference_batch_size sequences.
        for tim in range(max(num // mbz, 1)):
            # import line_profiler
            # from mpu.sparse_transformer import standard_attention
            # profile = line_profiler.LineProfiler(model.module.forward)
            # profile = line_profiler.LineProfiler(standard_attention)
            # profile.enable()
            fill_fn = filling_sequence_cuda_2d if args.generation_task == 'cuda-2d generation' else filling_sequence
            output_tokens_list.append(fill_fn(model, seq.clone(), args))
            # torch.cuda.empty_cache()
            # profile.disable()  # stop profiling
            # import sys
            # profile.print_stats(sys.stdout)
        output_tokens_list = torch.cat(output_tokens_list, dim=0)
        print("\nTaken time {:.2f}\n".format(time.time() - start_time), flush=True)
        print("\nContext:", raw_text, flush=True)
        imgs, txts = [], []
        for seq in output_tokens_list:
            decoded_txts, decoded_imgs = tokenizer.DecodeIds(seq.tolist())
            for i in range(len(decoded_imgs)):
                if decoded_imgs[i].shape[-1] < 512:
                    decoded_imgs[i] = torch.nn.functional.interpolate(decoded_imgs[i], size=(512, 512))
                # decoded_imgs[i].view(3, 32, 16, 32, 16)[:, :, :4, :, :4] = 0
                # decoded_imgs[i].view(3, 32, 16, 32, 16)[0, :, :4, :, :4] = 1
                # decoded_imgs[i].view(3, 32, 16, 32, 16)[1, :12, :4, :16, :4] = 1
            if args.debug:
                imgs.extend(decoded_imgs)
            else:
                imgs.append(decoded_imgs[-1])  # only the last image (target)
            txts.append(decoded_txts)
        if args.generation_task == 'image2text':
            print(txts)
            return
        if args.debug:
            output_file_prefix = raw_text.replace('/', '')[:20]
            output_file = os.path.join(output_path, f"{output_file_prefix}-{datetime.now().strftime('%m-%d-%H-%M-%S')}.jpg")
            imgs = torch.cat(imgs, dim=0)
            print(txts)
            print("\nSave to: ", output_file, flush=True)
            save_image(imgs, output_file, normalize=True)
        else:
            print("\nSave to: ", output_path, flush=True)
            for i in range(len(imgs)):
                save_image(imgs[i], os.path.join(output_path, f'{i}.jpg'), normalize=True)
                os.chmod(os.path.join(output_path, f'{i}.jpg'), stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
            save_image(torch.cat(imgs, dim=0), os.path.join(output_path, 'concat.jpg'), normalize=True)
            os.chmod(os.path.join(output_path, 'concat.jpg'), stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
def generate_images_continually(model, args):
    if args.generation_task == 'text2image':
        query_template = '[ROI1] {} [BASE] [BOI1] [MASK]*1024'
    elif args.generation_task == 'image2text':
        query_template = '[BASE] [BOI1] [Image]{} [EOI1] [ROI1] [MASK]*20'
    elif args.generation_task == 'low-level super-resolution':
        query_template = '[ROI1] {} [BASE] [BOI1] [Image]{} [EOI1] [ROI2] [POS0] [BASE] [BOI2] [MASK]*1024'
    elif args.generation_task == 'super-resolution':
        query_template = '[ROI1] {} [BASE] [BOI1] [Image]{}'
    elif args.generation_task == 'post-selection':
        query_template = '[BASE] [BOI1] [Image]{} [EOI1] [ROI1] {}'
    elif args.generation_task == 'cuda-2d generation':
        query_template = '[ROI1] {} [BASE] [BOI1] [MASK]*1024 [EOI1] [MASK]*4096'
    else:
        raise NotImplementedError

    for raw_text, seq, output_path in get_context(args, query_template):
        if args.generation_task == 'super-resolution':
            super_resolution(model, args, raw_text, seq, output_path=output_path)
        elif args.generation_task == 'post-selection':
            post_selection(model, args, raw_text, seq, output_path=output_path)
        else:
            generate_images_once(model, args, raw_text, seq, num=args.batch_size, output_path=output_path)
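# Illustrative expansion (assumed behavior of tokenizer.parse_query, which is not
# shown in this file): for text2image, the query 'a tiger' becomes
# '[ROI1] a tiger [BASE] [BOI1] [MASK]*1024', i.e. the text tokens followed by
# 1024 masked image-token positions for filling_sequence to fill in.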
def super_resolution(model, args, raw_text, seq, output_path="./samples"):
    tokenizer = get_tokenizer()
    model.eval()
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with torch.no_grad():
        start_time = time.time()
        output_tokens_list = magnify(model, tokenizer, seq[-32**2:], seq[:-32**2], args)
        print("\nTaken time {:.2f}\n".format(time.time() - start_time), flush=True)
        print("\nContext:", raw_text, flush=True)
        output_file_prefix = raw_text.replace('/', '')[:20]
        output_file = os.path.join(output_path, f"{output_file_prefix}-{datetime.now().strftime('%m-%d-%H-%M-%S')}.jpg")
        imgs = []
        if args.debug:
            imgs.append(torch.nn.functional.interpolate(tokenizer.img_tokenizer.DecodeIds(seq[-32**2:]), size=(512, 512)))
        for seq in output_tokens_list:
            decoded_txts, decoded_imgs = tokenizer.DecodeIds(seq.tolist())
            imgs.extend(decoded_imgs)
        imgs = torch.cat(imgs, dim=0)
        print("\nSave to: ", output_file, flush=True)
        save_image(imgs, output_file, normalize=True)
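# Here seq[-32**2:] is taken as the 32x32 grid of low-resolution image tokens,
# and the remaining prefix as the text tokens passed to magnify.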
def post_selection(model, args, raw_text, seq, output_path):
    tokenizer = get_tokenizer()
    model.eval()
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with torch.no_grad():
        start_time = time.time()
        num = seq.shape[0]
        mbz = args.max_inference_batch_size
        assert num < mbz or num % mbz == 0
        scores = [inverse_prompt_score(model, seq[tim * mbz:(tim + 1) * mbz], args)
                  for tim in range(max(num // mbz, 1))]
        scores = torch.cat(scores, dim=0)
        # scores = inverse_prompt_score(model, seq, args)  # once
        print("\nTaken time {:.2f}\n".format(time.time() - start_time), flush=True)
        print("\nContext:", raw_text, flush=True)
        rank = dist.get_rank()
        output_file = os.path.join(output_path, f"scores_rank_{rank}.txt")
        with open(output_file, 'a') as fout:
            fout.write(raw_text + '\n')
            fout.write('\t'.join([str(x) for x in scores.tolist()]) + '\n')
        print("\nSave to: ", output_file, flush=True)
def prepare_tokenizer(args):
    tokenizer = get_tokenizer(args)
    num_tokens = tokenizer.num_tokens
    before = num_tokens
    after = before
    multiple = args.make_vocab_size_divisible_by * \
        mpu.get_model_parallel_world_size()
    while (after % multiple) != 0:
        after += 1
    print_rank_0('> padded vocab (size: {}) with {} dummy '
                 'tokens (new size: {})'.format(before, after - before, after))
    args.vocab_size = after
    print("prepare tokenizer done", flush=True)
    return tokenizer
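# Worked example (hypothetical numbers): with make_vocab_size_divisible_by=128,
# a model-parallel world size of 1, and a raw vocab of 58219 tokens, `after` is
# padded up to 58240 (= 455 * 128), i.e. 21 dummy tokens are added.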
def main():
    """Main sampling program."""
    print('Generate Samples')
    # Disable CuDNN.
    torch.backends.cudnn.enabled = False
    # Arguments.
    args = get_args()
    # Pytorch distributed.
    initialize_distributed(args)
    # Set device; args.device is only used in inference.
    if args.device is not None:
        device = int(args.device)
        torch.cuda.set_device(device)
    # Get the tokenizer.
    tokenizer = prepare_tokenizer(args)
    # Model, without optimizer or learning-rate schedule (inference only).
    model = setup_model(args)
    # Random seeds for reproducibility.
    set_random_seed(args.seed)
    generate_images_continually(model, args)


if __name__ == "__main__":
    main()
@@ -11,7 +11,6 @@ import os
 import sys
 import math
 import random
-from tqdm import tqdm
 import numpy as np
 import torch
@@ -30,7 +29,7 @@ def magnify(model, tokenizer, tokens_list, text_token_list, args):
     magnified_code = code.new_zeros((s * 2, s * 2), dtype=torch.long) - 1
     windows = [(0, 0, 18), (0, 1, 30), (0, 2, 30), (1, 1, 30), (1, 0, 30), (1, 2, 30), (2, 0, 32), (2, 1, 32), (2, 2, 32)]
-    for i, j, line in tqdm(windows):
+    for i, j, line in windows:
         code_part = code[8 * i: 8 * (i+2), 8 * j: 8 * (j+2)].reshape(-1)
         magnified_code_part = magnified_code[16 * i: 16 * i + line, 16 * j: 16 * (j+2)].reshape(-1)
@@ -11,7 +11,6 @@ import os
 import sys
 import math
 import random
-from tqdm import tqdm
 import numpy as np
 import torch
torch
deepspeed
tqdm
lmdb
filelock
sentencepiece
mpi4py
tensorboardX==1.8
\ No newline at end of file
#!/bin/bash
CHECKPOINT_PATH=data/checkpoints/cogview-base
# CHECKPOINT_PATH=data/checkpoints/cogview-compare
NLAYERS=48
NHIDDEN=2560
NATT=40
MAXSEQLEN=5184
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
MPSIZE=1
#SAMPLING ARGS
TEMP=1.
# If TOPK/TOPP are 0, sampling defaults to greedy; top-k also overrides top-p.
TOPK=200
TOPP=0
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
MASTER_PORT=${MASTER_PORT} python generate_samples.py \
--deepspeed \
--model-parallel-size $MPSIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--load $CHECKPOINT_PATH \
--num-attention-heads $NATT \
--max-position-embeddings 1089 \
--fp16 \
--temperature $TEMP \
--top_k $TOPK \
--top_p $TOPP \
--sandwich-ln \
--img-tokenizer-path pretrained/vqvae/vqvae_hard_biggerset_011.pt \
--sparse-type cuda_2d \
--max-position-embeddings-finetune $MAXSEQLEN \
--generation-task "cuda-2d generation" \
--input-source ./input.txt \
--output-path samples_cuda_2d3 \
--batch-size 4 \
--max-inference-batch-size 4 \
--device 0 \
--finetune \
--no-load-optim \
--debug \
$@
#!/bin/bash
CHECKPOINT_PATH=pretrained/cogview/cogview-caption
NLAYERS=48
NHIDDEN=2560
NATT=40
MAXSEQLEN=1089
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
MPSIZE=1
#SAMPLING ARGS
TEMP=1.
# If TOPK/TOPP are 0, sampling defaults to greedy; top-k also overrides top-p.
TOPK=200
TOPP=0
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
MASTER_PORT=${MASTER_PORT} python generate_samples.py \
--deepspeed \
--model-parallel-size $MPSIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--load $CHECKPOINT_PATH \
--num-attention-heads $NATT \
--max-position-embeddings 1089 \
--fp16 \
--temperature $TEMP \
--top_k $TOPK \
--top_p $TOPP \
--img-tokenizer-path pretrained/vqvae/vqvae_hard_biggerset_011.pt \
--query-window 64 \
--key-window-times 4 \
--num-pivot 256 \
--is-sparse 0 \
--max-position-embeddings-finetune $MAXSEQLEN \
--generation-task image2text \
--input-source interactive \
--output-path samples_image2text \
--batch-size 8 \
--debug \
--device 1 \
$@
#!/bin/bash
CHECKPOINT_PATH=pretrained/cogview/cogview-sr
NLAYERS=48
NHIDDEN=2560
NATT=40
MAXSEQLEN=1345
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
MPSIZE=1
#SAMPLING ARGS
TEMP=1.02
# If TOPK/TOPP are 0, sampling defaults to greedy; top-k also overrides top-p.
TOPK=200
TOPP=0
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
MASTER_PORT=${MASTER_PORT} python generate_samples.py \
--deepspeed \
--model-parallel-size $MPSIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--load $CHECKPOINT_PATH \
--num-attention-heads $NATT \
--max-position-embeddings 1089 \
--fp16 \
--temperature $TEMP \
--top_k $TOPK \
--top_p $TOPP \
--img-tokenizer-path pretrained/vqvae/vqvae_hard_biggerset_011.pt \
--query-window 64 \
--key-window-times 4 \
--num-pivot 256 \
--is-sparse 0 \
--max-position-embeddings-finetune $MAXSEQLEN \
--generation-task "low-level super-resolution" \
--input-source interactive \
--output-path samples_low_level_sr \
--batch-size 4 \
--device 6 \
$@
#!/bin/bash
CHECKPOINT_PATH=pretrained/cogview/cogview-caption
NLAYERS=48
NHIDDEN=2560
NATT=40
MAXSEQLEN=1089
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
MPSIZE=1
#SAMPLING ARGS
TEMP=1.
# If TOPK/TOPP are 0, sampling defaults to greedy; top-k also overrides top-p.
TOPK=200
TOPP=0
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
MASTER_PORT=${MASTER_PORT} python generate_samples.py \
--deepspeed \
--model-parallel-size $MPSIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--load $CHECKPOINT_PATH \
--num-attention-heads $NATT \
--max-position-embeddings 1089 \
--fp16 \
--temperature $TEMP \
--top_k $TOPK \
--top_p $TOPP \
--img-tokenizer-path pretrained/vqvae/vqvae_hard_biggerset_011.pt \
--query-window 64 \
--key-window-times 4 \
--num-pivot 256 \
--is-sparse 0 \
--max-position-embeddings-finetune $MAXSEQLEN \
--generation-task post-selection \
--input-source input_select.txt \
--output-path samples_post_selection \
--debug \
--device 2 \
$@
# Fields in input-source are separated by \t (tab), not by 4 spaces.
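# A line of input_select.txt therefore looks like (hypothetical paths):
# a tiger sitting on grass<TAB>cand0.jpg<TAB>cand1.jpg<TAB>cand2.jpg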
#! /bin/bash
# Change for multinode config
NUM_WORKERS=19
NUM_GPUS_PER_WORKER=8
MP_SIZE=1
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
# OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_SOCKET_IFNAME=bond0 NCCL_IB_GID_INDEX=3 NCCL_NET_GDR_LEVEL=0"
OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2"
HOST_FILE_PATH="hostfile"
# OPTIONS_NCCL=""
# HOST_FILE_PATH="hostfile_single"
small_data="/dataset/fd5061f6/cogview/cogdata_new/cogdata_task_4leveltokens/zijian/zijian.bin.part_0.cogdata"
full_data="/dataset/fd5061f6/cogview/cogdata_new/cogdata_task_4leveltokens/merge.bin"
config_json="$script_dir/ds_config_zero.json"
gpt_options=" \
--experiment-name cogview-base-long \
--img-tokenizer-num-tokens 8192 \
--dataset-type CompactBinaryDataset \
--model-parallel-size ${MP_SIZE} \
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 40 \
--train-iters 300000 \
--resume-dataloader \
--train-data ${full_data} \
--split 949,50,1 \
--distributed-backend nccl \
--lr-decay-style cosine \
--warmup .1 \
--checkpoint-activations \
--deepspeed-activation-checkpointing \
--max-position-embeddings 1089 \
--max-memory-length 0 \
--sandwich-ln \
--txt-loss-scale 0.1 \
--sparse-type cuda_2d \
--fp16 \
--save-interval 2000 \
--no-load-optim \
--no-save-optim \
--eval-interval 1000 \
--save $main_dir/data/checkpoints \
--fast-load \
--load data/checkpoints/cogview-base \
--finetune
"
# --finetune
# --save $main_dir/data/checkpoints \
# --restart-iter 199000
gpt_options="${gpt_options}
--deepspeed \
--deepspeed_config ${config_json} \
"
run_cmd="${OPTIONS_NCCL} deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_gpt2.py $@ ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#! /bin/bash
# Change for multinode config
NUM_WORKERS=1
NUM_GPUS_PER_WORKER=8
MP_SIZE=1
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
# OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_SOCKET_IFNAME=bond0 NCCL_IB_GID_INDEX=3 NCCL_NET_GDR_LEVEL=0"
OPTIONS_NCCL="NCCL_DEBUG=info"
HOST_FILE_PATH="hostfile_single"
config_json="$script_dir/ds_config_zero.json"
gpt_options=" \
--experiment-name cogview-testlocal \
--img-tokenizer-num-tokens 8192 \
--dataset-type BinaryDataset \
--model-parallel-size ${MP_SIZE} \
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 40 \
--save $main_dir/data/checkpoints \
--train-iters 100000 \
--resume-dataloader \
--train-data /dataset/fd5061f6/cogview/cogdata_new/cogdata_task_3leveltokens/merge.bin \
--split 949,50,1 \
--distributed-backend nccl \
--lr-decay-style cosine \
--warmup .1 \
--checkpoint-activations \
--deepspeed-activation-checkpointing \
--max-position-embeddings 5184 \
--max-memory-length 0 \
--fp16 \
--txt-loss-scale 2 \
--sandwich-ln \
--sparse-type cuda_2d \
--save-interval 2500
"
gpt_options="${gpt_options}
--deepspeed \
--deepspeed_config ${config_json} \
"
run_cmd="${OPTIONS_NCCL} deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_gpt2.py $@ ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
CHECKPOINT_PATH=pretrained/cogview/cogview-sr
NLAYERS=48
NHIDDEN=2560
NATT=40
MAXSEQLEN=1345
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
MPSIZE=1
#SAMPLING ARGS
TEMP=1.02
# If TOPK/TOPP are 0, sampling defaults to greedy; top-k also overrides top-p.
TOPK=200
TOPP=0
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
MASTER_PORT=${MASTER_PORT} python generate_samples.py \
--deepspeed \
--model-parallel-size $MPSIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--load $CHECKPOINT_PATH \
--num-attention-heads $NATT \
--max-position-embeddings 1089 \
--fp16 \
--temperature $TEMP \
--top_k $TOPK \
--top_p $TOPP \
--img-tokenizer-path pretrained/vqvae/vqvae_hard_biggerset_011.pt \
--query-window 64 \
--key-window-times 4 \
--num-pivot 256 \
--is-sparse 0 \
--max-position-embeddings-finetune $MAXSEQLEN \
--generation-task "super-resolution" \
--input-source interactive \
--output-path samples_sr \
--debug \
--device 0 \
$@
#! /bin/bash
# Change for multinode config
NUM_WORKERS=1
NUM_GPUS_PER_WORKER=1
MP_SIZE=1
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
# OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_SOCKET_IFNAME=bond0 NCCL_IB_GID_INDEX=3 NCCL_NET_GDR_LEVEL=0"
OPTIONS_NCCL="NCCL_DEBUG=info"
HOST_FILE_PATH="hostfile_single"
small_data="/dataset/fd5061f6/cogview/cogdata_new/cogdata_task_3leveltokens/zijian/zijian.bin.part_0.cogdata"
full_data="/dataset/fd5061f6/cogview/cogdata_new/cogdata_task_3leveltokens/merge.bin"
config_json="$script_dir/ds_config.json"
gpt_options=" \
--experiment-name cogview-testlocal \
--img-tokenizer-num-tokens 8192 \
--dataset-type CompactBinaryDataset \
--model-parallel-size ${MP_SIZE} \
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 40 \
--save $main_dir/data/checkpoints \
--train-iters 100000 \
--resume-dataloader \
--test-data ${full_data} \
--split 949,50,1 \
--distributed-backend nccl \
--lr-decay-style cosine \
--warmup .1 \
--checkpoint-activations \
--deepspeed-activation-checkpointing \
--max-position-embeddings 1089 \
--max-memory-length 0 \
--txt-loss-scale 1 \
--sandwich-ln \
--sparse-type standard \
--save-interval 2500 \
--fp16 \
--eval-iters 1000 \
--load pretrained/cogview/cogview-base
"
#
# --load data/checkpoints/cogview-fixgrad-small08-25-09-38
gpt_options="${gpt_options}
"
# --deepspeed \
# --deepspeed_config ${config_json} \
run_cmd="${OPTIONS_NCCL} deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_gpt2.py $@ ${gpt_options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
# ==== tutorial settings: =====
# CHECKPOINT_PATH=data/checkpoints/cogview-bird_animal_tutorial-12-1024-1608-10-09-38
# NLAYERS=12
# NHIDDEN=1024
# NATT=16
# CHECKPOINT_PATH=data/checkpoints/cogview-base
CHECKPOINT_PATH=pretrained/cogview/cogview-base
NLAYERS=48
NHIDDEN=2560
NATT=40
MAXSEQLEN=1089
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
MPSIZE=1
#SAMPLING ARGS
TEMP=1
# If TOPK/TOPP are 0, sampling defaults to greedy; top-k also overrides top-p.
TOPK=200
TOPP=0
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
MASTER_PORT=${MASTER_PORT} python generate_samples.py \
--deepspeed \
--model-parallel-size $MPSIZE \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--load $CHECKPOINT_PATH \
--num-attention-heads $NATT \
--max-position-embeddings 1089 \
--fp16 \
--temperature $TEMP \
--top_k $TOPK \
--top_p $TOPP \
--sandwich-ln \
--img-tokenizer-path pretrained/vqvae/vqvae_hard_biggerset_011.pt \
--sparse-type standard \
--max-position-embeddings-finetune $MAXSEQLEN \
--generation-task text2image \
--input-source ./input.txt \
--output-path samples_text2image \
--batch-size 8 \
--max-inference-batch-size 8 \
--device 0 \
--debug \
$@
@@ -11,7 +11,6 @@ import os
 import sys
 import math
 import random
-from tqdm import tqdm
 import numpy as np
 import torch
@@ -11,7 +11,6 @@ import os
 import sys
 import math
 import random
-from tqdm import tqdm
 import numpy as np
 import torch
@@ -11,7 +11,6 @@ import os
 import sys
 import math
 import random
-from tqdm import tqdm
 import numpy as np
 import torch
@@ -29,11 +29,11 @@ import deepspeed
 from .learning_rates import AnnealingLR
 from .model_io import load_checkpoint, save_checkpoint
-from utils import Timers
-from utils import report_memory
-from utils import print_args
-from utils import print_rank_0
-from utils import get_sample_writer
+from .utils import Timers
+from .utils import report_memory
+from .utils import print_args
+from .utils import print_rank_0
+from .utils import get_sample_writer
 import mpu
 from data_utils import make_loaders
@@ -16,7 +16,7 @@ import torch
 import numpy as np
 import mpu
-from utils import print_rank_0
+from .utils import print_rank_0

 def get_checkpoint_name(checkpoints_path, iteration, release=False, zero=False):
     if release:
File moved