Skip to content
Snippets Groups Projects
Commit e554c1c8 authored by Yuanhao, committed by Matthias Reso
Browse files

The tokenizer will not add eos_token by default

parent 3038020a
No related branches found
No related tags found
No related merge requests found
...@@ -44,7 +44,7 @@ def format_tokens(dialogs, tokenizer): ...@@ -44,7 +44,7 @@ def format_tokens(dialogs, tokenizer):
[ [
tokenizer.encode( tokenizer.encode(
f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} ", f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} ",
) ) + [tokenizer.eos_token_id]
for prompt, answer in zip(dialog[::2], dialog[1::2]) for prompt, answer in zip(dialog[::2], dialog[1::2])
], ],
[], [],
...@@ -62,4 +62,4 @@ def format_tokens(dialogs, tokenizer): ...@@ -62,4 +62,4 @@ def format_tokens(dialogs, tokenizer):
def read_dialogs_from_file(file_path): def read_dialogs_from_file(file_path):
with open(file_path, 'r') as file: with open(file_path, 'r') as file:
dialogs = json.load(file) dialogs = json.load(file)
return dialogs return dialogs
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment