# Reconstructed from a whitespace-collapsed patch that added two files:
#   SwissArmyTransformer/tokenization/base_tokenizer.py
#   SwissArmyTransformer/tokenization/utils.py
# Both are kept in this single unit, separated by the banners below.
import os

import numpy as np
import torch

try:
    import tensorflow as tf
except ImportError:
    # TensorFlow is referenced only by `_is_tensorflow`. Importing it
    # unconditionally would make every tokenizer unusable on installs that
    # do not ship TensorFlow, so treat it as an optional dependency.
    tf = None


# --- SwissArmyTransformer/tokenization/base_tokenizer.py -------------------
# NOTE: the original module started with `from .utils import *`. That wildcard
# import is not repeated here: the helpers it pointed at are defined further
# down in this same unit, and a wildcard import skips underscore-prefixed
# names anyway, so it never exposed `_is_list` and friends.


class BaseTokenizer:
    """Common interface for tokenizers.

    The base class implements no real tokenization. The four
    ``Encode*``/``Decode*`` methods raise ``NotImplementedError`` and are the
    hooks a concrete tokenizer overrides; the remaining members return
    neutral placeholder values (``0``, ``None``).
    """

    def __init__(self, **kwargs):
        """Accept arbitrary keyword arguments and ignore them."""
        pass

    def __call__(self, text, **kwargs):
        """Encode ``text`` as ids by delegating to :meth:`EncodeAsIds`.

        All keyword arguments are forwarded unchanged.
        """
        return self.EncodeAsIds(text, **kwargs)

    def __len__(self):
        """Return the total number of tokens (``self.num_tokens``)."""
        return self.num_tokens

    def __repr__(self):
        """Return a fixed, human-readable name for this tokenizer."""
        return "Base Tokenizer for SAT"

    @property
    def command_tokens(self):
        """Command tokens of the tokenizer; the base class has none (``None``)."""
        return None

    @property
    def num_tokens(self):
        """Total number of tokens; the base class reports ``0``."""
        return 0

    def from_pretrained(self, **kwargs):
        """Load tokenizer parameters from a pretrained source.

        The base implementation does nothing and returns ``None``.
        """
        pass

    def EncodeAsIds(self, text, **kwargs):
        """Encode ``text`` to ids.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def EncodeAsTokens(self, text, **kwargs):
        """Encode ``text`` to tokens.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def DecodeIds(self, ids, **kwargs):
        """Decode ``ids`` back to their original form.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def DecodeTokens(self, tokens, **kwargs):
        """Decode ``tokens`` back to their original form.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError


# --- SwissArmyTransformer/tokenization/utils.py ----------------------------


def _is_list(x):
    """Return True if ``x`` is a ``list`` (or an instance of a subclass)."""
    return isinstance(x, list)


def _is_numpy(x):
    """Return True if ``x`` is a ``numpy.ndarray``."""
    return isinstance(x, np.ndarray)


def _is_torch(x):
    """Return True if ``x`` is a ``torch.Tensor``."""
    return isinstance(x, torch.Tensor)


def _is_tensorflow(x):
    """Return True if ``x`` is a ``tensorflow.Tensor``.

    Returns False when TensorFlow is not installed, since no object can be a
    TensorFlow tensor in that case.
    """
    if tf is None:
        return False
    return isinstance(x, tf.Tensor)