Commit 169a0234 authored by Ming Ding's avatar Ming Ding

init commit

.idea/
*_jax*
events.out*
__pycache__/
*.pt
data
core.*
_cache*
.vscode/
samples/
hostfile
pretrained/checkpoints
*.png
*.jpg
*.jpeg
input*.txt
samples*
LICENSE 0 → 100644
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2021 Ming Ding
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""argparser configuration"""
import argparse
import os
import torch
import deepspeed
import json
def add_model_config_args(parser):
"""Model arguments"""
group = parser.add_argument_group('model', 'model configuration')
group.add_argument('--attention-dropout', type=float, default=0.1,
help='dropout probability for attention weights')
group.add_argument('--num-attention-heads', type=int, default=16,
help='num of transformer attention heads')
group.add_argument('--hidden-size', type=int, default=1024,
help='transformer hidden size')
group.add_argument('--num-layers', type=int, default=24,
help='num decoder layers')
group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
help='layer norm epsilon')
group.add_argument('--hidden-dropout', type=float, default=0.1,
help='dropout probability for hidden state transformer')
group.add_argument('--max-position-embeddings', type=int, default=512,
help='maximum number of position embeddings to use')
group.add_argument('--vocab-size', type=int, default=30522,
help='vocab size to use for non-character-level '
'tokenization. This value will only be used when '
'creating a tokenizer')
group.add_argument('--deep-init', action='store_true',
help='initialize bert model similar to gpt2 model.'
'scales initialization of projection layers by a '
'factor of 1/sqrt(2N). Necessary to train bert '
'models larger than BERT-Large.')
group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
help='Pad the vocab size to be divisible by this value.'
'This is added for computational efficiency reasons.')
group.add_argument('--cpu-optimizer', action='store_true',
help='Run optimizer on CPU')
group.add_argument('--cpu_torch_adam', action='store_true',
help='Use Torch Adam as optimizer on CPU.')
group.add_argument('--max-position-embeddings-finetune', type=int, default=-1,
help='maximum number of position embeddings to use in finetune')
return parser
def add_fp16_config_args(parser):
"""Mixed precision arguments."""
group = parser.add_argument_group('fp16', 'fp16 configurations')
group.add_argument('--fp16', action='store_true',
help='Run model in fp16 mode')
group.add_argument('--fp32-embedding', action='store_true',
help='embedding in fp32')
group.add_argument('--fp32-layernorm', action='store_true',
help='layer norm in fp32')
group.add_argument('--fp32-tokentypes', action='store_true',
help='embedding token types in fp32')
group.add_argument('--fp32-allreduce', action='store_true',
help='all-reduce in fp32')
group.add_argument('--hysteresis', type=int, default=2,
help='hysteresis for dynamic loss scaling')
group.add_argument('--loss-scale', type=float, default=None,
help='Static loss scaling, positive power of 2 '
'values can improve fp16 convergence. If None, dynamic'
'loss scaling is used.')
group.add_argument('--loss-scale-window', type=float, default=1000,
help='Window over which to raise/lower dynamic scale')
group.add_argument('--min-scale', type=float, default=1,
help='Minimum loss scale for dynamic loss scale')
return parser
def add_training_args(parser):
"""Training arguments."""
group = parser.add_argument_group('train', 'training configurations')
group.add_argument('--experiment-name', type=str, default="CogView",
help="The experiment name for summary and checkpoint")
group.add_argument('--batch-size', type=int, default=4,
help='Data Loader batch size')
group.add_argument('--weight-decay', type=float, default=0.01,
help='weight decay coefficient for L2 regularization')
group.add_argument('--checkpoint-activations', action='store_true',
help='checkpoint activation to allow for training '
'with larger models and sequences')
group.add_argument('--checkpoint-num-layers', type=int, default=1,
help='chunk size (number of layers) for checkpointing')
group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
help='uses activation checkpointing from deepspeed')
group.add_argument('--clip-grad', type=float, default=1.0,
help='gradient clipping')
group.add_argument('--train-iters', type=int, default=1000000,
help='total number of iterations to train over all training runs')
group.add_argument('--log-interval', type=int, default=50,
help='report interval')
group.add_argument('--exit-interval', type=int, default=None,
help='Exit the program after this many new iterations.')
group.add_argument('--summary-dir', type=str, default="", help="The directory to store the summary")
group.add_argument('--seed', type=int, default=1234,
help='random seed')
group.add_argument('--img-tokenizer-path', type=str, default=None,
help='The checkpoint file path of image tokenizer.')
group.add_argument('--img-tokenizer-num-tokens', type=int, default=None,
help='The number of tokens of the image tokenizer. ONLY used for pretraining when the image tokenizer checkpoint itself is unavailable.')
# Batch producer arguments
group.add_argument('--reset-position-ids', action='store_true',
help='Reset position ids after end-of-document token.')
group.add_argument('--reset-attention-mask', action='store_true',
help='Reset self attention mask after '
'end-of-document token.')
# Learning rate.
group.add_argument('--lr-decay-iters', type=int, default=None,
help='number of iterations to decay LR over,'
' If None defaults to `--train-iters`*`--epochs`')
group.add_argument('--lr-decay-style', type=str, default='linear',
choices=['constant', 'linear', 'cosine', 'exponential'],
help='learning rate decay function')
group.add_argument('--lr-decay-ratio', type=float, default=0.1)
group.add_argument('--lr', type=float, default=1.0e-4,
help='initial learning rate')
group.add_argument('--warmup', type=float, default=0.01,
help='percentage of data to warmup on (.01 = 1% of all '
'training iters). Default 0.01')
# model checkpointing
group.add_argument('--save', type=str, default=None,
help='Output directory to save checkpoints to.')
group.add_argument('--save-interval', type=int, default=5000,
help='number of iterations between saves')
group.add_argument('--no-save-optim', action='store_true',
help='Do not save current optimizer.')
group.add_argument('--no-save-rng', action='store_true',
help='Do not save current rng state.')
group.add_argument('--load', type=str, default=None,
help='Path to a directory containing a model checkpoint.')
group.add_argument('--no-load-optim', action='store_true',
help='Do not load optimizer when loading checkpoint.')
group.add_argument('--no-load-rng', action='store_true',
help='Do not load rng state when loading checkpoint.')
group.add_argument('--finetune', action='store_true',
help='Load model for finetuning. Do not load optimizer '
'or rng state from checkpoint and set iteration to 0. '
'Assumed when loading a release checkpoint.')
group.add_argument('--resume-dataloader', action='store_true',
help='Resume the dataloader when resuming training. '
'Does not apply to tfrecords dataloader; try resuming '
'with a different seed in this case.')
# distributed training args
group.add_argument('--distributed-backend', default='nccl',
help='which backend to use for distributed '
'training. One of [gloo, nccl]')
group.add_argument('--local_rank', type=int, default=None,
help='local rank passed from distributed launcher')
# loss scale
group.add_argument('--txt-loss-scale', type=float, default=1)
group.add_argument('--fast-load', action='store_true',
help='load checkpoints without locks.')
return parser
def add_evaluation_args(parser):
"""Evaluation arguments."""
group = parser.add_argument_group('validation', 'validation configurations')
group.add_argument('--eval-batch-size', type=int, default=None,
help='Data Loader batch size for evaluation datasets. '
'Defaults to `--batch-size`')
group.add_argument('--eval-iters', type=int, default=100,
help='number of iterations to run for evaluation '
'validation/test for')
group.add_argument('--eval-interval', type=int, default=1000,
help='interval between running evaluation on validation set')
return parser
def add_text_generate_args(parser):
"""Text generate arguments."""
group = parser.add_argument_group('Text generation', 'configurations')
group.add_argument("--temperature", type=float, default=1.0)
group.add_argument("--top_p", type=float, default=0.0)
group.add_argument("--top_k", type=int, default=0)
# group.add_argument("--out-seq-length", type=int, default=256)
group.add_argument("--generation-task", type=str,
default='text2image',
choices=['text2image',
'image2text',
'super-resolution',
'low-level super-resolution',
'post-selection',
'raw'
],
help='what type of inference task to use')
group.add_argument('--input-source', type=str, default='interactive',
help='what input mode to use, interactive or path')
group.add_argument('--output-path', type=str, default='./samples',
help='path to place the generated samples')
group.add_argument('--debug', action='store_true',
help='Debug will merge all outputs.')
group.add_argument('--with-id', action='store_true',
help='If each line is prepended with an id.')
group.add_argument('--max-inference-batch-size', type=int, default=12)
return parser
def add_data_args(parser):
"""Train/valid/test data arguments."""
group = parser.add_argument_group('data', 'data configurations')
group.add_argument('--model-parallel-size', type=int, default=1,
help='size of the model parallel.')
group.add_argument('--shuffle', action='store_true',
help='Shuffle data. Shuffling is deterministic '
'based on seed and current epoch.')
group.add_argument('--train-data', nargs='+', default=None,
help='Whitespace separated filenames or corpora names '
'for training.')
group.add_argument('--valid-data', nargs='*', default=None,
help="""Filename for validation data.""")
group.add_argument('--split', default='1000,1,1',
help='comma-separated list of proportions for training,'
' validation, and test split')
group.add_argument('--test-data', nargs='*', default=None,
help="""Filename for testing""")
group.add_argument('--num-workers', type=int, default=2,
help="""Number of workers to use for dataloading""")
group.add_argument('--dataset-type', type=str,
default='TokenizedDataset',
choices=['TokenizedDataset',
'TextCodeDataset'],
help='what type of dataset to use')
group.add_argument('--max-memory-length', type=int, default=2048,
help="max memory buffer for attention")
group.add_argument('--new-dataset-path', type=str, default=None,
help='The folder we will dynamically check for lmdbs during training.')
return parser
def add_generation_api_args(parser):
"""generation api arguments"""
group = parser.add_argument_group('api', 'api configurations')
group.add_argument('--img_folder_path', default='image/')
group.add_argument('--input_folder_path', default='input/')
group.add_argument('--input_rec_path', default='input/')
group.add_argument('--check_mode', default='code')
group.add_argument('--time_interval', default=10)
group.add_argument('--device', default=None)
return parser
def add_sparse_args(parser):
"""sparse attention arguments."""
group = parser.add_argument_group('Sparse Attention', 'sparse configurations')
group.add_argument('--is-sparse', type=int, default=0,
choices=[0, 1, 2],
help='whether to use sparse attention: 0 = no, 1 = training, 2 = inference') # TODO: Temporarily not using is-sparse==2 (not optimized); use 0 for inference.
group.add_argument("--query-window", type=int, default=128)
group.add_argument("--key-window-times", type=int, default=6)
group.add_argument("--num-pivot", type=int, default=768)
return parser
def get_args():
"""Parse all the args."""
parser = argparse.ArgumentParser(description='PyTorch CogView Model')
parser = add_model_config_args(parser)
parser = add_fp16_config_args(parser)
parser = add_training_args(parser)
parser = add_evaluation_args(parser)
parser = add_text_generate_args(parser)
parser = add_data_args(parser)
parser = add_generation_api_args(parser)
parser = add_sparse_args(parser)
# Include DeepSpeed configuration arguments
parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()
if not args.train_data:
print('WARNING: No training data specified')
assert args.is_sparse != 1, 'use is-sparse == 2 for inference'
elif args.is_sparse == 1 and (args.max_position_embeddings - 1) % args.query_window != 0:
raise ValueError('During sparse training, the sequence length (max-position-embeddings - 1) must be divisible by query-window.')
args.cuda = torch.cuda.is_available()
args.rank = int(os.getenv('RANK', '0'))
args.world_size = int(os.getenv("WORLD_SIZE", '1'))
if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
mpi_define_env(args)
elif os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
# We are using (OpenMPI) mpirun for launching distributed data parallel processes
local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
# Possibly running with Slurm
num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
nodeid = int(os.getenv('SLURM_NODEID', '0'))
args.local_rank = local_rank
args.rank = nodeid * local_size + local_rank
args.world_size = num_nodes * local_size
args.model_parallel_size = min(args.model_parallel_size, args.world_size)
if args.rank == 0:
print('using world size: {} and model-parallel size: {} '.format(
args.world_size, args.model_parallel_size))
args.dynamic_loss_scale = False
if args.loss_scale is None:
args.dynamic_loss_scale = True
if args.rank == 0:
print(' > using dynamic loss scaling')
# The fp32_* args only take effect when fp16 is set,
# so they should all default to False otherwise.
if not args.fp16:
args.fp32_embedding = False
args.fp32_tokentypes = False
args.fp32_layernorm = False
if hasattr(args, "deepspeed") and args.deepspeed and args.deepspeed_config is not None:
with open(args.deepspeed_config) as file:
deepspeed_config = json.load(file)
if "train_micro_batch_size_per_gpu" in deepspeed_config:
args.batch_size = deepspeed_config["train_micro_batch_size_per_gpu"]
if "gradient_accumulation_steps" in deepspeed_config:
args.gradient_accumulation_steps = deepspeed_config["gradient_accumulation_steps"]
else:
args.gradient_accumulation_steps = None
if "optimizer" in deepspeed_config:
optimizer_params_config = deepspeed_config["optimizer"].get("params", {})
args.lr = optimizer_params_config.get("lr", args.lr)
args.weight_decay = optimizer_params_config.get("weight_decay", args.weight_decay)
return args
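# A minimal illustrative sketch (hypothetical values): with --deepspeed and a
# config file like the JSON below, the block above would override the defaults
# to args.batch_size = 8, args.gradient_accumulation_steps = 4 and args.lr = 3e-4.
# {
#     "train_micro_batch_size_per_gpu": 8,
#     "gradient_accumulation_steps": 4,
#     "optimizer": {"params": {"lr": 3e-4, "weight_decay": 0.01}}
# }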
def mpi_define_env(args):
''' Set up the distributed connection for training CogView via MPI.
Omit this function when using the basic DeepSpeed pdsh runner.
'''
from mpi4py import MPI
import subprocess
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
world_size = comm.Get_size()
master_addr = None
if rank == 0:
hostname_cmd = ["hostname -I"]
result = subprocess.check_output(hostname_cmd, shell=True)
master_addr = result.decode('utf-8').split()[0]
master_addr = comm.bcast(master_addr, root=0)
# Determine local rank by assuming hostnames are unique
proc_name = MPI.Get_processor_name()
all_procs = comm.allgather(proc_name)
local_rank = sum([i == proc_name for i in all_procs[:rank]])
os.environ['RANK'] = str(rank)
os.environ['WORLD_SIZE'] = str(world_size)
args.local_rank = local_rank
args.world_size = world_size
args.rank = rank
os.environ['MASTER_ADDR'] = master_addr
os.environ['MASTER_PORT'] = "29500" # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
print(
"Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
.format(os.environ['RANK'],
args.local_rank,
os.environ['WORLD_SIZE'],
os.environ['MASTER_ADDR'],
os.environ['MASTER_PORT']))
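# A minimal illustrative sketch of the local_rank computation above: each
# process counts how many lower-ranked processes share its hostname
# (the hostnames below are hypothetical).
_all_procs = ['node0', 'node0', 'node1', 'node1']  # gathered processor names, indexed by rank
_rank = 3
_proc_name = _all_procs[_rank]  # 'node1'
_local_rank = sum(p == _proc_name for p in _all_procs[:_rank])
assert _local_rank == 1  # second process on 'node1'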
assets/coco_new.png (8.67 MiB)
assets/cogviewcase.png (1.55 MiB)
assets/logo.png (21.5 KiB)
# -*- encoding: utf-8 -*-
'''
@File : __init__.py
@Time : 2021/01/11 16:35:24
@Author : Ming Ding
@Contact : dm18@mails.tsinghua.edu.cn
'''
# here put the import lib
from .unified_tokenizer import get_tokenizer
from .templates import *
from .configure_data import make_loaders, detect_new_datasets
# -*- encoding: utf-8 -*-
'''
@File : configure_data.py
@Time : 2021/01/11 23:28:38
@Author : Ming Ding
@Contact : dm18@mails.tsinghua.edu.cn
'''
# here put the import lib
import os
import sys
import math
import random
from tqdm import tqdm
import copy
import numpy as np
import torch
import torch.nn.functional as F
from bisect import bisect_right
from .unified_tokenizer import get_tokenizer
from .datasets import get_dataset_by_type
from torch.utils import data
from .samplers import DistributedBatchSampler
import mpu
def make_data_loader(dataset, batch_size, num_iters, args):
world_size = torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
distributed = world_size > 1
sampler = torch.utils.data.SequentialSampler(dataset)
drop_last = distributed
# the GPUs in the same model parallel group receive the same data
if distributed:
batch_sampler = DistributedBatchSampler(sampler,
batch_size,
drop_last,
rank,
world_size,
gradient_accumulation_steps=args.gradient_accumulation_steps)
else:
batch_sampler = torch.utils.data.BatchSampler(sampler,
batch_size,
drop_last)
data_loader = torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=args.num_workers,
pin_memory=True)
return data_loader
def make_dataset(dataset_type, path, split, args, **kwargs):
"""function to create datasets+tokenizers for common options"""
print('make dataset ...', path)
if split is None:
split = [1.]
assert isinstance(path, list)
# TODO other dsclass, e.g. odps
# ds = [get_dataset_by_type(dataset_type, p, args) for p in path]
# dataset object can be copied N times
ds = []
for p in path:
d = get_dataset_by_type(dataset_type, p, args)
if p.find('t2i') >= 0:
ds.extend([d] * 4)
print(f'Enlarge {p} 4 times...')
elif p.find('i2t') >= 0:
ds.extend([d] * 2)
print(f'Enlarge {p} 2 times...')
else:
ds.append(d)
ds = RandomMappingDataset(ConcatDataset(ds))
if should_split(split):
ds = split_ds(ds, split) # Large dataset, cannot shuffle, randomly mapping
# FIXME this will merge valid set and train set.
return ds
def make_loaders(args):
"""makes training/val/test"""
world_size = torch.distributed.get_world_size(
group=mpu.get_data_parallel_group())
batch_size = args.batch_size * world_size
eval_batch_size = batch_size
if args.eval_batch_size is not None:
eval_batch_size = args.eval_batch_size * world_size
split = get_split(args)
data_set_args = {
'path': args.train_data,
'dataset_type': args.dataset_type,
'split': split,
}
eval_set_args = copy.copy(data_set_args)
eval_set_args['split'] = [1.]
# make datasets splits and tokenizer
train = None
valid = None
test = None
if args.train_data is not None:
train = make_dataset(**data_set_args, args=args)
if should_split(split):
train, valid, test = train
# make training and val dataset if necessary
if valid is None and args.valid_data is not None:
eval_set_args['path'] = args.valid_data
valid = make_dataset(**eval_set_args, args=args)
if test is None and args.test_data is not None:
eval_set_args['path'] = args.test_data
test = make_dataset(**eval_set_args, args=args)
# wrap datasets with data loader
if train is not None and args.batch_size > 0:
train = make_data_loader(train, batch_size, args.train_iters, args)
args.do_train = True
else:
args.do_train = False
eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
if valid is not None:
valid = make_data_loader(valid, eval_batch_size, args.train_iters, args)
args.do_valid = True
else:
args.do_valid = False
if test is not None:
test = make_data_loader(test, eval_batch_size, len(test) // eval_batch_size + 1, args)
args.do_test = True
else:
args.do_test = False
return train, valid, test
def get_split(args):
"""
Get dataset splits from comma separated string list
"""
splits = []
if args.split.find(',') != -1:
splits = [float(s) for s in args.split.split(',')]
elif args.split.find('/') != -1:
splits = [float(s) for s in args.split.split('/')]
else:
splits = [float(args.split)]
split_total = sum(splits)
if split_total < 1.:
splits.append(1-split_total)
while len(splits) < 3:
splits.append(0.)
splits = splits[:3]
if args.valid_data is not None:
splits[1] = 0.
if args.test_data is not None:
splits[2] = 0.
final_sum = sum(splits)
return [s/final_sum for s in splits]
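# A minimal illustrative sketch: the default '--split 1000,1,1' (with no
# explicit --valid-data / --test-data) is parsed into proportions and then
# normalized so the three splits sum to 1.
_splits = [float(s) for s in '1000,1,1'.split(',')]  # [1000.0, 1.0, 1.0]
_normalized = [s / sum(_splits) for s in _splits]    # ~[0.998, 0.001, 0.001]
assert abs(sum(_normalized) - 1.0) < 1e-9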
def should_split(split):
"""
Given split proportions, checks whether an actual split is needed.
Examples:
>>> should_split([10,0,0])
False
>>> should_split([1,.1,.2])
True
"""
return max(split) / sum(split) != 1.
def split_ds(ds, split=[.8,.2,.0]):
"""
Split a dataset into subsets given proportions of how
much to allocate per split. If a split is 0% returns None for that split.
Purpose: Useful for creating train/val/test splits
Arguments:
ds (Dataset or array-like): Data to be split.
split (1D array-like): proportions to split `ds`. `sum(splits) != 0`
"""
split_sum = sum(split)
if split_sum == 0:
raise Exception('Split cannot sum to 0.')
split = np.array(split)
split /= split_sum
ds_len = len(ds)
start_idx = 0
residual_idx = 0
rtn_ds = [None]*len(split)
for i, f in enumerate(split):
if f != 0:
proportion = ds_len*split[i]
residual_idx += proportion % 1
split_ = int(int(proportion) + residual_idx)
split_range = (start_idx, start_idx+max(split_, 1))
rtn_ds[i] = SplitDataset(ds, split_range)
start_idx += split_
residual_idx %= 1
return rtn_ds
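# A minimal illustrative sketch: splitting a 10-element dataset with
# proportions [.8, .2, .0] yields an 8/2 train/valid split and None for the
# empty test split; residual_idx absorbs fractional leftovers.
_parts = split_ds(list(range(10)), split=[.8, .2, .0])
assert [len(p) if p is not None else None for p in _parts] == [8, 2, None]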
class ConcatDataset(data.Dataset):
"""
Dataset to concatenate multiple datasets.
Purpose: useful to assemble different existing datasets, possibly
large-scale datasets as the concatenation operation is done in an
on-the-fly manner.
Arguments:
datasets (sequence): List of datasets to be concatenated.
"""
@staticmethod
def cumsum(sequence):
r, s = [], 0
for e in sequence:
l = len(e)
r.append(l + s)
s += l
return r
def __init__(self, datasets, **kwargs):
super(ConcatDataset, self).__init__()
assert len(datasets) > 0, 'datasets should not be an empty iterable'
self.datasets = list(datasets)
self.cumulative_sizes = self.cumsum(self.datasets)
def __len__(self):
return self.cumulative_sizes[-1]
def __getitem__(self, idx):
dataset_idx = bisect_right(self.cumulative_sizes, idx)
if dataset_idx == 0:
sample_idx = idx
else:
sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
return self.datasets[dataset_idx][sample_idx]
class SplitDataset(data.Dataset):
"""
Dataset wrapper to access a subset of another dataset.
Purpose: useful to index into existing datasets, possibly
large-scale datasets as the subindexing operation is done in an
on-the-fly manner.
Arguments:
ds (Dataset or array-like): List of datasets to be subindexed
split_range (Tuple): (Left, Right)
"""
def __init__(self, ds, split_range, **kwargs):
self.split_range = split_range
self.wrapped_data = ds
def __len__(self):
return self.split_range[1] - self.split_range[0]
def __getitem__(self, index):
index += self.split_range[0]
assert index < self.split_range[1]
return self.wrapped_data[index]
def __iter__(self):
for idx in range(*self.split_range):
yield self.wrapped_data[idx]
class RandomMappingDataset(data.Dataset):
'''
Dataset wrapper that randomly maps indices onto the original dataset.
Also enlarges the effective length (by 60x).
'''
def __init__(self, ds, **kwargs):
self.wrapped_data = ds
def __len__(self):
return len(self.wrapped_data) * 60
def __getitem__(self, index):
rng = random.Random(index)
rng = np.random.RandomState(seed=[rng.randint(0, 2**32-1) for _ in range(16)])
index = rng.randint(len(self.wrapped_data))
return self.wrapped_data[index]
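# A minimal illustrative sketch: the mapping is deterministic per index because
# the numpy RandomState is seeded from the index itself, so the same wrapped
# index always resolves to the same underlying sample.
def _mapped_index(index, ds_len):
    rng = random.Random(index)
    rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
    return rng.randint(ds_len)
assert _mapped_index(7, 1000) == _mapped_index(7, 1000)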
def detect_new_datasets(args):
if args.new_dataset_path is None:
return None
if not os.path.exists(args.new_dataset_path):
print('Warning: new_dataset_path does not exist... skipping detection.')
return None
current_datasets = [str(os.path.abspath(path)) for path in args.train_data]
found = []
for _p in os.listdir(args.new_dataset_path):
p = os.path.join(args.new_dataset_path, _p)
if str(p).endswith('lmdb') and not str(os.path.abspath(p)) in current_datasets:
found.append(p)
if len(found) == 0:
return None
else:
args.train_data = args.train_data + found
return make_loaders(args)
# -*- encoding: utf-8 -*-
'''
@File : datasets.py
@Time : 2021/01/11 21:01:51
@Author : Ming Ding
@Contact : dm18@mails.tsinghua.edu.cn
'''
# here put the import lib
import os
import sys
import math
import random
from tqdm import tqdm
import logging
import numpy as np
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
import pickle
from collections import namedtuple
from torch.utils.data import Dataset
import lmdb
from .unified_tokenizer import get_tokenizer
from .templates import TextCodeTemplate
logger = logging.getLogger(__name__)
class LMDBDataset(Dataset):
def __init__(self, path, process_fn):
self.env = lmdb.open(
path,
max_readers=32,
readonly=True,
lock=False,
readahead=False,
meminit=False,
)
self.process_fn = process_fn
if not self.env:
raise IOError('Cannot open lmdb dataset', path)
with self.env.begin(write=False) as txn:
self.length = int(txn.get('length'.encode('utf-8')).decode('utf-8'))
def __len__(self):
return self.length
def __getitem__(self, idx):
with self.env.begin(write=False) as txn:
key = str(idx).encode('utf-8')
row = pickle.loads(txn.get(key))
return self.process_fn(row)
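# A minimal illustrative sketch (hypothetical writer code): the lmdb database
# is expected to contain a 'length' entry plus one pickled row per stringified
# index, e.g.
#     with env.begin(write=True) as txn:
#         txn.put('length'.encode('utf-8'), str(num_rows).encode('utf-8'))
#         for i, row in enumerate(rows):
#             txn.put(str(i).encode('utf-8'), pickle.dumps(row))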
def get_dataset_by_type(dataset_type, path: str, args, DS_CLASS=LMDBDataset):
tokenizer = get_tokenizer()
if args.finetune and args.max_position_embeddings_finetune > args.max_position_embeddings:
ml = args.max_position_embeddings_finetune
else:
ml = args.max_position_embeddings
def pad_to_len(ret):
if len(ret) < ml: # pad
return np.concatenate((ret,
np.array([tokenizer['[PAD]']] * (ml - len(ret)))),
axis=0), len(ret)
else:
if len(ret) > ml:
logger.warning('Out of max len, truncated.')
return ret[:ml], ml
if dataset_type == 'TokenizedDataset':
# already tokenized when saved
def process_fn(row):
ret, attention_mask_sep = pad_to_len(row.flatten())
return {'text': ret,
'loss_mask': np.array([1] * attention_mask_sep + [0] * (len(ret) - attention_mask_sep))
}
elif dataset_type == 'TextCodeDataset':
def process_fn(row):
text, code = row[0], row[1].flatten()
ret = TextCodeTemplate(text, code)
ret, attention_mask_sep = pad_to_len(ret)
return {'text': ret,
'loss_mask': np.array([1] * attention_mask_sep + [0] * (len(ret) - attention_mask_sep))
}
return DS_CLASS(path, process_fn)
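# A minimal illustrative sketch of what process_fn produces for a
# TokenizedDataset row: with a max length of 8 and a hypothetical [PAD] id of 0,
# a 5-token row is right-padded and the loss_mask is 1 over real tokens only.
_row, _ml, _pad = np.array([11, 12, 13, 14, 15]), 8, 0
_text = np.concatenate((_row, np.array([_pad] * (_ml - len(_row)))), axis=0)
_loss_mask = np.array([1] * len(_row) + [0] * (_ml - len(_row)))
assert _text.tolist() == [11, 12, 13, 14, 15, 0, 0, 0]
assert _loss_mask.tolist() == [1, 1, 1, 1, 1, 0, 0, 0]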
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""batch samplers that work with either random or sequential data samplers"""
import math
import os
import sys
import torch
from torch.utils import data
import numpy as np
class RandomSampler(data.sampler.Sampler):
r"""
Based on PyTorch's RandomSampler and DistributedSampler. Essentially a RandomSampler,
but this class lets the user set an epoch like DistributedSampler
Samples elements randomly. If without replacement, then sample from a shuffled dataset.
If with replacement, then user can specify ``num_samples`` to draw.
Arguments:
data_source (Dataset): dataset to sample from
num_samples (int): number of samples to draw, default=len(dataset)
replacement (bool): samples are drawn with replacement if ``True``, default=False
"""
def __init__(self, data_source, replacement=False, num_samples=None):
self.data_source = data_source
self.replacement = replacement
self._num_samples = num_samples
self.epoch = -1
if self._num_samples is not None and replacement is False:
raise ValueError("With replacement=False, num_samples should not be specified, "
"since a random permute will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples))
if not isinstance(self.replacement, bool):
raise ValueError("replacement should be a boolean value, but got "
"replacement={}".format(self.replacement))
@property
def num_samples(self):
# dataset size might change at runtime
if self._num_samples is None:
return len(self.data_source)
return self._num_samples
def __iter__(self):
n = len(self.data_source)
g = torch.Generator()
if self.epoch >= 0:
g.manual_seed(self.epoch)
if self.replacement:
return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64, generator=g).tolist())
return iter(torch.randperm(n, generator=g).tolist())
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
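# A minimal illustrative sketch (hypothetical training loop): calling set_epoch
# before each epoch re-seeds the generator, so the shuffle order changes across
# epochs but is reproducible for a given epoch number.
#     sampler = RandomSampler(train_dataset)
#     for epoch in range(num_epochs):
#         sampler.set_epoch(epoch)
#         for idx in sampler:
#             ...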
class DistributedSequentialSampler(data.sampler.Sampler):
def __init__(self, num_samples, train_iters, batch_size, rank=-1, world_size=2):
super().__init__(num_samples)
if rank == -1:
rank = 0
world_size = 1
self.num_samples = num_samples
self.rank = rank
self.world_size = world_size
self.start_iter = 0
self.train_iters = train_iters
self.batch_size = batch_size
self.batch_bias = [i * (num_samples // batch_size) for i in range(batch_size)]
def __iter__(self):
for idx in range(self.start_iter, self.train_iters * 10):
batch = [(idx + bias) % self.num_samples for bias in self.batch_bias]
tbatch = self._batch(batch)
yield tbatch
def __len__(self):
return self.train_iters
def _batch(self, batch):
"""extracts samples only pertaining to this worker's batch"""
start = self.rank*self.batch_size//self.world_size
end = (self.rank+1)*self.batch_size//self.world_size
return batch[start:end]
class DistributedBatchSampler(data.sampler.BatchSampler):
"""
similar to normal implementation of distributed sampler, except implementation is at the
batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary
data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
"""
def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False, gradient_accumulation_steps=None):
super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last)
if rank == -1:
assert False, 'should not be here'
self.rank = rank
self.world_size = world_size
self.sampler.wrap_around = 0
self.wrap_around = 0
self.wrap_last = wrap_last
self.start_iter = 0
self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps
def __iter__(self):
batch = []
i = 0
for idx in self.data_iterator(self.sampler, wrap_around=False):
batch.append(idx)
if len(batch) == self.batch_size:
tbatch = self._batch(batch)
if i >= self.start_iter * self.effective_batch_size:
yield tbatch
self.start_iter = 0
i += len(batch)
batch = []
batch_len = len(batch)
if batch_len > 0 and not self.drop_last:
if self.wrap_last:
self.sampler.wrap_around -= (self.batch_size)
self.wrap_around += (len(batch))
self.wrap_around %= self.batch_size
if isinstance(self.sampler, TransposedSampler):
for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)):
if i == 0:
continue
batch.append(idx)
new_batch_len = len(batch)
if len(batch) == self.batch_size:
break
yield self._batch(batch)
if self.wrap_last:
self.sampler.wrap_around += self.batch_size
def data_iterator(self, _iter, wrap_around=False):
"""iterates through data and handles wrap around"""
for i, idx in enumerate(_iter):
if i < self.wrap_around%self.batch_size:
continue
if wrap_around:
self.wrap_around += 1
self.wrap_around %= self.batch_size
yield idx
def _batch(self, batch):
"""extracts samples only pertaining to this worker's batch"""
start = self.rank*self.batch_size//self.world_size
end = (self.rank+1)*self.batch_size//self.world_size
return batch[start:end]
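# A minimal illustrative sketch of the rank slicing in _batch above: a global
# batch of 8 indices split across world_size=4 ranks gives each rank a
# contiguous 2-sample slice.
_batch, _bs, _ws = list(range(8)), 8, 4
_slices = [_batch[r * _bs // _ws:(r + 1) * _bs // _ws] for r in range(_ws)]
assert _slices == [[0, 1], [2, 3], [4, 5], [6, 7]]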
"""
from https://github.com/openai/gpt-2/, changed for Chinese
"""
import json
import os
import sentencepiece as spm
"""
SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation
systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements
subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the
extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end
system that does not depend on language-specific pre/postprocessing.
https://github.com/google/sentencepiece
pip install sentencepiece
or git clone https://github.com/google/sentencepiece.git
python setup.py install
"""
PRETRAINED_MODEL_FILE = "pretrained/chinese_sentencepiece/cog-pretrain.model"
def get_pairs(word):
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class Encoder:
def __init__(self, encoder, bpe_merges):
self.encoder = encoder
self.decoder = {v: k for k, v in self.encoder.items()}
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.max_len = 0
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except ValueError:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def encode(self, text):
return [self.encoder.get(token, 1) for token in self.tokenize(text)]
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
return text
def tokenize(self, text):
bpe_tokens = []
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' '))
return bpe_tokens
def convert_tokens_to_ids(self, tokens):
return [self.encoder.get(token, 1) for token in tokens]
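# A minimal illustrative sketch with a tiny hypothetical vocabulary and merge
# table: bpe() repeatedly applies the highest-priority merge from bpe_merges.
_toy = Encoder(encoder={'lo': 3, 'w': 4, 'low': 5}, bpe_merges=[('l', 'o'), ('lo', 'w')])
assert _toy.bpe('low') == 'low'    # 'l'+'o' -> 'lo', then 'lo'+'w' -> 'low'
assert _toy.bpe('wol') == 'w o l'  # no applicable merges; characters stay separate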
class Encoder_SP:
def __init__(self, model_path):
self.sp = spm.SentencePieceProcessor()
self.sp.Load(model_path)
self.num_tokens = self.sp.vocab_size()
def encode(self, text):
"""
text="...."
"""
return self.sp.EncodeAsIds(text)
def decode(self, tokens):
"""
tokens=[x1,x2,...]
"""
text = [int(token) for token in tokens]
return self.sp.DecodeIds(text)
def tokenize(self, text):
return self.sp.EncodeAsPieces(text)
def convert_tokens_to_ids(self, tokens):
return [self.sp.PieceToId(token) for token in tokens]
def convert_token_to_id(self, token):
return self.sp.PieceToId(token)
def convert_id_to_token(self, idx):
return self.sp.IdToPiece(idx)
def get_encoder(encoder_file, bpe_file):
# The following lets the same entry function also handle SentencePiece models
filepath, filename = os.path.split(encoder_file)
shortname, extension = os.path.splitext(filename)
if (".model" == extension) and (bpe_file == ""):
return Encoder_SP(encoder_file)
else:
with open(encoder_file, 'r', encoding="utf-8") as f:
encoder = json.load(f)
with open(bpe_file, 'r', encoding="utf-8") as f:
bpe_data = f.read()
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,
)
def from_pretrained():
return get_encoder(PRETRAINED_MODEL_FILE, "")
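# A minimal illustrative sketch of the dispatch in get_encoder: a SentencePiece
# ".model" file with an empty bpe_file loads Encoder_SP, anything else is read
# as a GPT-2 style vocabulary plus merges file (the encoder.json / merges.txt
# names below are hypothetical).
#     sp_enc = get_encoder(PRETRAINED_MODEL_FILE, '')
#     bpe_enc = get_encoder('encoder.json', 'merges.txt')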
# -*- encoding: utf-8 -*-
'''
@File : templates.py
@Time : 2021/01/11 22:28:57
@Author : Ming Ding
@Contact : dm18@mails.tsinghua.edu.cn
'''
# here put the import lib
import os
import sys
import math
import random
from tqdm import tqdm
import numpy as np
import torch
import torch.nn.functional as F
from .unified_tokenizer import get_tokenizer
from .vqvae_tokenizer import sqrt_int
def concat_codes(*codes):
is_numpy = is_tensor = False
for code in codes:
if isinstance(code, np.ndarray):
is_numpy = True
if isinstance(code, torch.Tensor):
is_tensor = True
device = code.device
if is_tensor:
return torch.cat(
[
torch.tensor(code, device=device)
for code in codes
]
)
elif is_numpy:
return np.concatenate(
[
np.array(code)
for code in codes
],
axis=0
)
else:
ret = []
for code in codes:
ret = ret + code
return ret
def TextCodeTemplate(text, code):
tokenizer = get_tokenizer()
text_ids = [tokenizer['[ROI1]']] + tokenizer(text)
code = tokenizer.wrap_code(code)
return concat_codes(text_ids, code)
def Code2CodeTemplate(text, code0, code1):
tokenizer = get_tokenizer()
text_ids = tokenizer.parse_query(text) if isinstance(text, str) else text
code0 = tokenizer.wrap_code(code0)
code1 = tokenizer.wrap_code(code1, idx=2)
return concat_codes(text_ids, code0, code1)
def PureTextTemplate(text):
tokenizer = get_tokenizer()
return tokenizer(text) + [tokenizer['[SEP]']]
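# A minimal illustrative sketch of concat_codes: plain lists stay lists, and
# mixing in a numpy array promotes the result to a numpy array.
assert concat_codes([1, 2], [3]) == [1, 2, 3]
assert concat_codes([1, 2], np.array([3, 4])).tolist() == [1, 2, 3, 4]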
# -*- encoding: utf-8 -*-
'''
@File : unified_tokenizer.py
@Time : 2021/01/11 16:36:33
@Author : Ming Ding
@Contact : dm18@mails.tsinghua.edu.cn
'''
# here put the import lib
import os
import sys
import math
import random
from tqdm import tqdm
import numpy as np
import torch
import torch.nn.functional as F
from .sp_tokenizer import from_pretrained
from .vqvae_tokenizer import VQVAETokenizer, sqrt_int
class UnifiedTokenizer(object):
def __init__(self, img_tokenizer_path, device, img_tokenizer_num_tokens=None):
self.device = device
if img_tokenizer_path is None and img_tokenizer_num_tokens is not None:
# pretraining where only the vocab size of the VQVAE is known (the VQVAE itself is still changing rapidly)
self.img_tokenizer = FakeTokenizer(img_tokenizer_num_tokens)
else:
self.img_tokenizer = VQVAETokenizer(model_path=img_tokenizer_path, device=self.device)
self.txt_tokenizer = from_pretrained()
self.num_tokens = self.img_tokenizer.num_tokens + self.txt_tokenizer.num_tokens
self.raw_command_tokens = [
('[PAD]', 0),
('[BOI1]', 1), # Begin
('[BOI2]', 2),
('[BOI3]', 3),
('[EOI1]', 4), # End
('[EOI2]', 5),
('[EOI3]', 6),
('[ROI1]', 7), # Reference
('[ROI2]', 8),
('[ROI3]', 9),
('[SEP]', 10),
('[MASK]', 11),
('[CLS]', 12),
('[ENC]', 13),
('[TINY]', 14), # 8 * 8
('[SMALL]', 15), # 16 * 16
('[BASE]', 16), # 32 * 32
('[BIG]', 17), # 64 * 64
('[POS0]', 18), # 58210
('[POS1]', 19),
('[POS2]', 20),
('[POS3]', 21),
('[POS4]', 22),
('[POS5]', 23),
('[POS6]', 24),
('[POS7]', 25),
('[POS8]', 26)
# Please keep the "size tokens" at the end of the command token list
]
self.command_tokens = {
k: v + self.num_tokens
for k, v in self.raw_command_tokens
}
self.num_tokens += len(self.raw_command_tokens)
def __getitem__(self, command_token):
return self.command_tokens[command_token]
def __len__(self):
"""total number of tokens"""
return self.num_tokens
def __call__(self, inputs, process_fn=None):
"""run preprocessing and encode inputs as Ids
CANNOT contain command tokens"""
if isinstance(inputs, torch.Tensor): # image
if len(inputs.shape) == 3:
inputs = inputs.unsqueeze(0)
return self.img_tokenizer.EncodeAsIds(inputs)
return self.EncodeAsIds(inputs, process_fn=process_fn)
def EncodeAsIds(self, text, process_fn=None):
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
ids = self.txt_tokenizer.encode(processed_text)
return [x + self.img_tokenizer.num_tokens for x in ids]
def DecodeIds(self, ids):
ret, img_buffer, txt_buffer, ret_imgs = [], [], [], []
try:
for x in ids:
if self.num_tokens - len(self.raw_command_tokens) <= x:
# command tokens
token = self.raw_command_tokens[x - (self.num_tokens - len(self.raw_command_tokens))][0]
if token.startswith('[EOI') and len(img_buffer) > 0:
# dump image
ret_imgs.append(self.img_tokenizer.DecodeIds(img_buffer))
img_buffer = []
if len(txt_buffer) > 0:
# dump text
ret.append(self.txt_tokenizer.decode(txt_buffer))
txt_buffer = []
ret.append(token)
elif x < self.img_tokenizer.num_tokens:
img_buffer.append(x)
else:
txt_buffer.append(x - self.img_tokenizer.num_tokens)
if len(img_buffer) > 0:
# dump image
ret_imgs.append(self.img_tokenizer.DecodeIds(img_buffer))
img_buffer = []
if len(txt_buffer) > 0:
# dump text
ret.append(self.txt_tokenizer.decode(txt_buffer))
txt_buffer = []
except ValueError:
print('Value error in tokenization, skipping...')
return ret, ret_imgs
def wrap_code(self, code, idx=1):
s = sqrt_int(len(code))
prefix = {8:'[TINY]', 16:'[SMALL]', 32:'[BASE]', 64:'[BIG]'}[s]
boi = {1:'[BOI1]', 2: '[BOI2]', 3:'[BOI3]'}[idx]
eoi = {1:'[EOI1]', 2: '[EOI2]', 3:'[EOI3]'}[idx]
if isinstance(code, list):
return [self.command_tokens[prefix], self.command_tokens[boi]] + \
code + [self.command_tokens[eoi]]
elif isinstance(code, np.ndarray):
return np.concatenate(
(
np.array([self.command_tokens[prefix], self.command_tokens[boi]]),
code,
np.array([self.command_tokens[eoi]])
),
axis=0
)
elif isinstance(code, torch.Tensor):
return torch.cat(
(
torch.tensor([self.command_tokens[prefix], self.command_tokens[boi]]),
code,
torch.tensor([self.command_tokens[eoi]])  # must be a tensor for torch.cat
)
)
else:
raise ValueError('code must be a list, np.ndarray or torch.Tensor')
def parse_query(self, query, img_size=256):
text_buffer = []
ret = []
for part in query.split(' '):
if part in self.command_tokens:
if len(text_buffer) > 0:
# dump text ids
ret.extend(self.EncodeAsIds(' '.join(text_buffer)))
text_buffer = []
if part == '[MASK]':
ret.append(-1)
else:
ret.append(self.command_tokens[part])
elif part.startswith('[MASK]*'): # special lang *N
c = int(part[7:])
assert c > 0
if len(text_buffer) > 0:
# dump text ids
ret.extend(self.EncodeAsIds(' '.join(text_buffer)))
text_buffer = []
ret.extend([-1] * c)
elif part.startswith('[Image'): # [Image*N]path
c = part[6:]
assert len(c) > 0
num_codes, img_path = c.split(']')
if num_codes == '':
num_codes = 1024
else:
num_codes = int(num_codes)
raw_img = self.img_tokenizer.read_img(img_path, img_size=img_size)
img_codes = self.img_tokenizer.EncodeAsIds(raw_img) # [1, 32*32]
img_codes[0, num_codes:] = -1
img_codes = img_codes[0].tolist()
ret.extend(img_codes)
else:
text_buffer.append(part)
if len(text_buffer) > 0:
# dump text ids
ret.extend(self.EncodeAsIds(' '.join(text_buffer)))
text_buffer = []
return ret
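# A minimal illustrative sketch of the parse_query mini-language (the query
# below is hypothetical): command tokens are written literally, '[MASK]*N'
# expands to N placeholder slots (-1), and '[Image*N]path' inserts the first N
# codes of the image at `path` (the remaining slots become -1).
#     query = '[ROI1] a tiger [BASE] [BOI1] [MASK]*1024 [EOI1]'
#     ids = tokenizer.parse_query(query)
#     # -> [ROI1] id, text ids of 'a tiger', [BASE] and [BOI1] ids,
#     #    1024 placeholders (-1), then the [EOI1] id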
def get_tokenizer(args=None):
if not hasattr(get_tokenizer, 'tokenizer'):
# on the first call, args must be provided to specify img_tokenizer_path
get_tokenizer.tokenizer = UnifiedTokenizer(
args.img_tokenizer_path,
device=torch.cuda.current_device(),
img_tokenizer_num_tokens=args.img_tokenizer_num_tokens
)
return get_tokenizer.tokenizer
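# A minimal illustrative sketch of the unified id space: image codes occupy
# [0, num_img_tokens), text ids are shifted up by num_img_tokens (EncodeAsIds),
# and command tokens such as [PAD] or [ROI1] sit at the very end. With a
# hypothetical 8192-token image codebook and 50000-token text vocabulary:
_num_img, _num_txt = 8192, 50000
_pad_id = _num_img + _num_txt + 0   # '[PAD]'  is raw command index 0
_roi1_id = _num_img + _num_txt + 7  # '[ROI1]' is raw command index 7
assert (_pad_id, _roi1_id) == (58192, 58199)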
class FakeTokenizer(object):
def __init__(self, num_tokens):
self.num_tokens = num_tokens
def __len__(self):
return self.num_tokens
# -*- encoding: utf-8 -*-
'''
@File : vqvae_tokenizer.py
@Time : 2021/01/11 17:57:43
@Author : Ming Ding
@Contact : dm18@mails.tsinghua.edu.cn
'''
# here put the import lib
import os
import sys
import math
import random
from tqdm import tqdm
import numpy as np
import torch
import torch.nn.functional as F
from vqvae import new_model, img2code, code2img
from torchvision import transforms
from PIL import Image
def is_exp2(x):
t = math.log2(x)
return abs(t - int(t)) < 1e-4
def sqrt_int(x):
r = int(math.sqrt(x) + 1e-4)
assert r * r == x
return r
class VQVAETokenizer(object):
def __init__(self,
model_path,
device='cuda'
):
ckpt = torch.load(model_path, map_location=torch.device(device))
model = new_model()
if list(ckpt.keys())[0].startswith('module.'):
ckpt = {k[7:]: v for k, v in ckpt.items()}
model.load_state_dict(ckpt)
model = model.to(device)
model.eval()
self.model = model
self.device = device
self.image_tokens = model.quantize_t.n_embed
self.num_tokens = model.quantize_t.n_embed
def __len__(self):
return self.num_tokens
def EncodeAsIds(self, img):
assert len(img.shape) == 4 # [b, c, h, w]
return img2code(self.model, img)
def DecodeIds(self, code, shape=None):
if shape is None:
if isinstance(code, list):
code = torch.tensor(code, device=self.device)
s = sqrt_int(len(code.view(-1)))
assert s * s == len(code.view(-1))
shape = (1, s, s)
code = code.view(shape)
out = code2img(self.model, code)
return out
def read_img(self, path, img_size=256):
tr = transforms.Compose([
transforms.Resize(img_size),
transforms.CenterCrop(img_size),
transforms.ToTensor(),
])
img = tr(Image.open(path))
if img.shape[0] == 4:
img = img[:-1]
tr_normalize = transforms.Normalize([0.79093, 0.76271, 0.75340], [0.30379, 0.32279, 0.32800])
img = tr_normalize(img)
img = img.unsqueeze(0).float().to(self.device) # size [1, 3, h, w]
return img
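# A minimal illustrative sketch: DecodeIds infers the spatial layout from the
# code length via sqrt_int, so 1024 codes are reshaped into a (1, 32, 32) grid
# before being decoded back into an image, and 64 codes into (1, 8, 8).
assert sqrt_int(1024) == 32
assert sqrt_int(64) == 8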
FROM nvidia/cuda:11.1-devel-ubuntu18.04
##############################################################################
# Temporary Installation Directory
##############################################################################
ENV STAGE_DIR=/tmp
RUN mkdir -p ${STAGE_DIR}
##############################################################################
# Installation/Basic Utilities
##############################################################################
RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN sed -i s@/security.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN rm /etc/apt/sources.list.d/nvidia-ml.list && rm /etc/apt/sources.list.d/cuda.list && apt-get clean
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common build-essential autotools-dev \
nfs-common pdsh \
cmake g++ gcc \
curl wget vim tmux emacs less unzip \
htop iftop iotop ca-certificates openssh-client openssh-server \
rsync iputils-ping net-tools sudo \
llvm-9-dev libsndfile-dev \
libcupti-dev \
libjpeg-dev \
libpng-dev \
screen jq psmisc dnsutils lsof musl-dev systemd
##############################################################################
# Install the Latest Git
##############################################################################
RUN add-apt-repository ppa:git-core/ppa -y && \
apt-get update && \
apt-get install -y git && \
git --version
##############################################################################
# Client Liveness & Uncomment Port 22 for SSH Daemon
##############################################################################
# Keep SSH clients alive from the server side
RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
##############################################################################
# Mellanox OFED
##############################################################################
ENV MLNX_OFED_VERSION=5.1-0.6.6.0
#ENV MLNX_OFED_VERSION=4.6-1.0.1.1
RUN apt-get install -y libnuma-dev
RUN cd ${STAGE_DIR} && \
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
./mlnxofedinstall --user-space-only --without-fw-update --umad-dev-rw --all -q && \
cd ${STAGE_DIR} && \
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*
##############################################################################
# nv_peer_mem
##############################################################################
ENV NV_PEER_MEM_VERSION=1.1
ENV NV_PEER_MEM_TAG=1.1-0
RUN mkdir -p ${STAGE_DIR} && \
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
cd ${STAGE_DIR}/nv_peer_memory && \
./build_module.sh && \
cd ${STAGE_DIR} && \
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
apt-get update && \
apt-get install -y dkms && \
dpkg-buildpackage -us -uc && \
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
##############################################################################
# OPENMPI
##############################################################################
ENV OPENMPI_BASEVERSION=4.0
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
RUN cd ${STAGE_DIR} && \
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
cd openmpi-${OPENMPI_VERSION} && \
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
make -j"$(nproc)" install && \
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
# Sanity check:
test -f /usr/local/mpi/bin/mpic++ && \
cd ${STAGE_DIR} && \
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
ENV PATH=/usr/local/mpi/bin:${PATH} \
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
chmod a+x /usr/local/mpi/bin/mpirun
##############################################################################
# Python
##############################################################################
ARG PYTHON_VERSION=3.8
RUN curl -o ~/miniconda.sh https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ && \
/opt/conda/bin/conda config --set show_channel_urls yes && \
/opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include ninja cython typing && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/bin:$PATH
RUN wget https://tuna.moe/oh-my-tuna/oh-my-tuna.py && python oh-my-tuna.py
RUN pip install --upgrade pip setuptools
##############################################################################
# Some Packages
##############################################################################
RUN pip install psutil \
yappi \
cffi \
ipdb \
h5py \
pandas \
matplotlib \
py3nvml \
pyarrow \
graphviz \
astor \
boto3 \
tqdm \
sentencepiece \
msgpack \
requests \
sphinx \
sphinx_rtd_theme \
nvidia-ml-py3 \
mpi4py \
filelock \
lmdb \
cupy-cuda111 && \
pip cache purge
##############################################################################
# PyTorch
# The NCCL bundled with the prebuilt PyTorch wheel is slower, but downloading the PyTorch source in China is too slow, so we gave up building from source and install the wheel instead.
##############################################################################
# RUN git clone --branch v1.8.1 --recursive https://github.com/pytorch/pytorch /opt/pytorch
# RUN cd /opt/pytorch && \
# git submodule sync && git submodule update --init --recursive
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX 8.0 8.6"
# ENV NCCL_LIBRARY=/usr/lib/x86_64-linux-gnu
# ENV NCCL_INCLUDE_DIR=/usr/include
# RUN conda install -c pytorch magma-cuda111 && \
# cd /opt/pytorch && TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
# CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" USE_SYSTEM_NCCL=1 \
# pip install -v . && rm -rf /opt/pytorch
ENV TENSORBOARDX_VERSION=1.8
RUN pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && \
pip install tensorboardX==${TENSORBOARDX_VERSION} && \
pip cache purge
##############################################################################
# apex
##############################################################################
# RUN git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
COPY apex-master ${STAGE_DIR}/apex
RUN cd ${STAGE_DIR}/apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& rm -rf ${STAGE_DIR}/apex
##############################################################################
# PyYAML build issue
# https://stackoverflow.com/a/53926898
##############################################################################
RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
rm -rf /usr/lib/python3/dist-packages/PyYAML-*
##############################################################################
# DeepSpeed
##############################################################################
# RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
# COPY DeepSpeed ${STAGE_DIR}/DeepSpeed
# RUN cd ${STAGE_DIR}/DeepSpeed && \
# git checkout . && \
# DS_BUILD_OPS=1 ./install.sh -r
# RUN rm -rf ${STAGE_DIR}/DeepSpeed
# RUN python -c "import deepspeed; print(deepspeed.__version__)"
RUN pip install triton==0.2.3 && \
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_SPARSE_ATTN=1 DS_BUILD_UTILS=1 pip install deepspeed --global-option="build_ext" --global-option="-j8" && \
pip cache purge && \
ds_report
##############################################################################
# SSH daemon port inside the container must not conflict with the host OS port
##############################################################################
ARG SSH_PORT=2222
RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
echo "PasswordAuthentication no" >> ${STAGE_DIR}/sshd_config && \
sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
EXPOSE ${SSH_PORT}
# Set SSH KEY
RUN echo "StrictHostKeyChecking no \nUserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
ssh-keygen -t rsa -f ~/.ssh/id_rsa -N "" && cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
chmod og-wx ~/.ssh/authorized_keys
127.0.0.1
172.30.0.214 172.30.0.215 172.30.0.209
# -*- encoding: utf-8 -*-
'''
@File : setup_connection.py
@Time : 2021/01/16 16:50:36
@Author : Ming Ding
@Contact : dm18@mail.tsinghua.edu.cn
'''
# imports
import os
import sys
import math
import random
import base64
if __name__ == "__main__":
    ssh_config = ''
    line_format = 'Host node{}\n\tUser root\n\tPort 2222\n\tHostname {}\n'
    for i, ip in enumerate(sys.argv[1:]):
        ssh_config += line_format.format(i, ip)
    # write the generated SSH config and restrict its permissions
    ret = os.system(f'echo \"{ssh_config}\" > ~/.ssh/config && chmod 600 ~/.ssh/config')
    assert ret == 0
    # the hostfile (used by the distributed launcher) is written at the repository root
    hostfile_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'hostfile')
    with open(hostfile_path, 'w') as fout:
        for i, ip in enumerate(sys.argv[1:]):
            fout.write(f'node{i} slots=8\n')
    print(f'Successfully generated hostfile \'{hostfile_path}\'!')
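# Example of the generated files, assuming the invocation
# `python setup_connection.py 172.30.0.214 172.30.0.215`:
#
#   ~/.ssh/config:
#     Host node0
#         User root
#         Port 2222
#         Hostname 172.30.0.214
#     Host node1
#         User root
#         Port 2222
#         Hostname 172.30.0.215
#
#   hostfile:
#     node0 slots=8
#     node1 slots=8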
#!/bin/bash
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
main_dir=$(dirname $script_dir)
ip_list=$(cat $script_dir/ip_list.txt)
# Launch the training container detached, with host networking, shared IPC and full
# device access, and the repository mounted at /root/cogview.
docker run --gpus all -d --ipc=host --cap-add=IPC_LOCK -v /sys/class/net/:/sys/class/net/ --device=/dev/ --privileged --network=host -v $main_dir:/root/cogview --name bg-cogview cogview/cuda111_torch181_deepspeed040:base bash -c "/etc/init.d/ssh start && python /root/cogview/env/setup_connection.py $ip_list && sleep 365d"
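# Usage note (a sketch; file locations are assumptions, not stated in this commit):
# run this launcher on every machine listed in ip_list.txt. Each container starts
# sshd on port 2222, regenerates ~/.ssh/config and the hostfile via
# setup_connection.py, and then idles, ready for a later multi-node DeepSpeed launch
# that reads the shared hostfile.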