From 049b6ffcdd6b766f83783bf3746f65693547a7ff Mon Sep 17 00:00:00 2001 From: duzx16 <zx-du20@mails.tsinghua.edu.cn> Date: Sun, 5 Dec 2021 23:14:55 +0800 Subject: [PATCH] Add T5 Tokenizer --- SwissArmyTransformer/tokenization/__init__.py | 7 +- .../tokenization/glm/tokenization.py | 20 +++--- .../tokenization/hf_tokenizer.py | 65 +++++++++++++++++++ examples/t5/config/model_t5_large.sh | 4 +- inference_t5.py | 15 +++-- 5 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 SwissArmyTransformer/tokenization/hf_tokenizer.py diff --git a/SwissArmyTransformer/tokenization/__init__.py b/SwissArmyTransformer/tokenization/__init__.py index 7e04852..ffdf5c3 100644 --- a/SwissArmyTransformer/tokenization/__init__.py +++ b/SwissArmyTransformer/tokenization/__init__.py @@ -29,7 +29,8 @@ def _export_vocab_size_to_args(args, original_num_tokens): print_rank_0('> padded vocab (size: {}) with {} dummy ' 'tokens (new size: {})'.format( before, after - before, after)) - args.vocab_size = after + if not args.vocab_size: + args.vocab_size = after print_rank_0("prepare tokenizer done") return tokenizer @@ -63,6 +64,10 @@ def get_tokenizer(args=None, outer_tokenizer=None): elif args.tokenizer_type == "glm_ChineseSPTokenizer": from .glm import ChineseSPTokenizer get_tokenizer.tokenizer = ChineseSPTokenizer(args.tokenizer_model_type, **kwargs) + elif args.tokenizer_type.startswith('hf'): + from .hf_tokenizer import HFT5Tokenizer + if args.tokenizer_type == "hf_T5Tokenizer": + get_tokenizer.tokenizer = HFT5Tokenizer(args.tokenizer_model_type) else: assert args.vocab_size > 0 get_tokenizer.tokenizer = FakeTokenizer(args.vocab_size) diff --git a/SwissArmyTransformer/tokenization/glm/tokenization.py b/SwissArmyTransformer/tokenization/glm/tokenization.py index 67be818..9b9a8ab 100644 --- a/SwissArmyTransformer/tokenization/glm/tokenization.py +++ b/SwissArmyTransformer/tokenization/glm/tokenization.py @@ -312,11 +312,11 @@ class Tokenizer(object): tokenization.tokenization = [self.IdToToken(idx) for idx in tokenization.tokenization] return tokenization - def IdToToken(self, Id): + def IdToToken(self, idx): """convert Id to token accounting for command tokens""" - if isinstance(Id, CommandToken): - return Id.token - return self.tokens[Id] + if isinstance(idx, CommandToken): + return idx.token + return self.tokens[idx] def TokenToId(self, token): """convert token to Id accounting for command tokens""" @@ -324,16 +324,16 @@ class Tokenizer(object): return token.Id return self.vocab[token] - def DecodeIds(self, Ids): + def DecodeIds(self, ids): """ convert Ids to tokens accounting for command tokens, tokens are joined and returned as a string. """ rtn_strs = [] current_str = [] - if isinstance(Ids, Tokenization): - Ids = Ids.tokenization - for Id in Ids: + if isinstance(ids, Tokenization): + ids = ids.tokenization + for Id in ids: if isinstance(Id, CommandToken): rtn_strs.append(self._decode(current_str)) current_str = [] @@ -353,11 +353,11 @@ class Tokenizer(object): output = self.clean_up_tokenization(output) return output - def DecodeTokens(self, Tokens): + def DecodeTokens(self, tokens): """ convert tokens to a string accounting for command and type tokens. """ - Ids = [self.TokenToId(token) for token in Tokens] + Ids = [self.TokenToId(token) for token in tokens] return self.DecodeIds(Ids) diff --git a/SwissArmyTransformer/tokenization/hf_tokenizer.py b/SwissArmyTransformer/tokenization/hf_tokenizer.py new file mode 100644 index 0000000..d671976 --- /dev/null +++ b/SwissArmyTransformer/tokenization/hf_tokenizer.py @@ -0,0 +1,65 @@ +from transformers import T5Tokenizer +from .glm.tokenization import Tokenization, CommandToken + + +class HFTokenizer: + def __init__(self, model_cls, model_type_or_path=None, cache_dir=None, command_tokens=None): + self.text_tokenizer = model_cls.from_pretrained(model_type_or_path, cache_dir=cache_dir) + self.num_tokens = len(self.text_tokenizer) + self._command_tokens = [] + self.command_name_map = {} + self.command_token_map = {} + self.command_id_map = {} + + @property + def command_tokens(self): + return self._command_tokens + + @command_tokens.setter + def command_tokens(self, command_tokens): + self._command_tokens = command_tokens + self.command_name_map = {tok.name: tok for tok in self.command_tokens} + self.command_token_map = {tok.token: tok for tok in self.command_tokens} + self.command_id_map = {tok.Id: tok for tok in self.command_tokens} + + def get_command(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name] + + def EncodeAsIds(self, text, process_fn=None): + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + ids = self.text_tokenizer.encode(processed_text, add_special_tokens=False) + tokenization = Tokenization(ids, processed_text, text) + return tokenization + + def DecodeIds(self, ids): + if isinstance(ids, Tokenization): + ids = ids.tokenization + return self.text_tokenizer.decode(ids) + + def DecodeTokens(self, tokens): + return self.text_tokenizer.convert_tokens_to_string(tokens) + + def IdToToken(self, Id): + if isinstance(Id, CommandToken): + return Id.token + return self.text_tokenizer.convert_ids_to_tokens(Id) + + def TokenToId(self, token): + if isinstance(token, CommandToken): + return token.Id + return self.text_tokenizer.convert_tokens_to_ids(token) + + +class HFT5Tokenizer(HFTokenizer): + def __init__(self, model_type_or_path=None, cache_dir=None): + super().__init__(T5Tokenizer, model_type_or_path=model_type_or_path, cache_dir=cache_dir) + command_tokens = [ + CommandToken('eos', '</s>', self.TokenToId("</s>")), + CommandToken('pad', '<pad>', self.TokenToId("<pad>")), + ] + for i in range(100): + command_tokens.append(CommandToken(f'MASK{i}', f'<extra_id_{i}>', self.TokenToId(f'<extra_id_{i}>'))) + self.command_tokens = command_tokens diff --git a/examples/t5/config/model_t5_large.sh b/examples/t5/config/model_t5_large.sh index ce4e277..0c20d55 100644 --- a/examples/t5/config/model_t5_large.sh +++ b/examples/t5/config/model_t5_large.sh @@ -10,6 +10,6 @@ MODEL_ARGS="--block-lm \ --max-sequence-length 513 \ --relative-attention-num-buckets 32 \ --layernorm-epsilon 1e-6 \ - --tokenizer-model-type roberta \ - --tokenizer-type glm_GPT2BPETokenizer \ + --tokenizer-type hf_T5Tokenizer \ + --tokenizer-model-type t5-large \ --load ${CHECKPOINT_PATH}/glm-large-en-blank" \ No newline at end of file diff --git a/inference_t5.py b/inference_t5.py index 8e938f8..c2af4e7 100644 --- a/inference_t5.py +++ b/inference_t5.py @@ -48,7 +48,7 @@ def get_masks_and_position_ids_glm(seq, mask_position, context_length): def main(args): args.do_train = False initialize_distributed(args) - # tokenizer = get_tokenizer(args) + tokenizer = get_tokenizer(args) # build model model = T5Model(args) if args.fp16: @@ -60,9 +60,16 @@ def main(args): torch.load("/dataset/fd5061f6/yanan/huggingface_models/t5-large/model_states.pt")["module"]) from SwissArmyTransformer.model.encoder_decoder_model import EncoderFinalMixin model.eval() - input_ids = torch.cuda.LongTensor([[37, 32099, 10681, 16, 32098, 2447, 1]]) - decoder_input_ids = torch.cuda.LongTensor([[32099, 5295, 1782, 32098, 8, 32097, 1]]) - output = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + input_ids = tokenizer.EncodeAsIds("The <extra_id_0> walks in <extra_id_1> park").tokenization + input_ids = input_ids + [tokenizer.get_command("eos").Id] + input_ids = torch.cuda.LongTensor([input_ids]) + # input_ids = torch.cuda.LongTensor([[37, 32099, 10681, 16, 32098, 2447, 1]]) + decoder_input_ids = tokenizer.EncodeAsIds('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>').tokenization + decoder_input_ids = decoder_input_ids + [tokenizer.get_command("eos").Id] + decoder_input_ids = torch.cuda.LongTensor([decoder_input_ids]) + # decoder_input_ids = torch.cuda.LongTensor([[32099, 5295, 1782, 32098, 8, 32097, 1]]) + breakpoint() + output = model(enc_input_ids=input_ids, dec_input_ids=decoder_input_ids) print(output) end_tokens = [tokenizer.get_command('eop').Id, tokenizer.get_command('eos').Id] # define function for each query -- GitLab