Skip to content
Snippets Groups Projects
Commit b43d3553 authored by Ming Ding
Browse files

Merge branch 'package' of github.com:THUDM/SwissArmyTransformer into package

parents 91faa63a 7192cdac
No related branches found
No related tags found
No related merge requests found
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)""" """Utilities for using and training tokenizers (char, wordpiece, sentencepiece)"""
from collections import namedtuple from collections import namedtuple
import random import random
import os import os
...@@ -440,12 +441,12 @@ class ChineseSPTokenizer(Tokenizer): ...@@ -440,12 +441,12 @@ class ChineseSPTokenizer(Tokenizer):
CommandToken('sop', '<|startofpiece|>', num_tokens + 1), CommandToken('sop', '<|startofpiece|>', num_tokens + 1),
CommandToken('eop', '<|endofpiece|>', num_tokens + 2) CommandToken('eop', '<|endofpiece|>', num_tokens + 2)
]) ])
if model_type_or_path == 'glm-large': if model_type_or_path != 'glm-10b':
num_tokens += 3 num_tokens += 3
else: else:
num_tokens += 2 num_tokens += 2
if add_task_mask: if add_task_mask:
if model_type_or_path == 'glm-large': if model_type_or_path != 'glm-10b':
command_tokens.extend([ command_tokens.extend([
CommandToken('sMASK', '[sMASK]', num_tokens, lstrip=True), CommandToken('sMASK', '[sMASK]', num_tokens, lstrip=True),
CommandToken('gMASK', '[gMASK]', num_tokens + 1, lstrip=True) CommandToken('gMASK', '[gMASK]', num_tokens + 1, lstrip=True)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment