diff --git a/inference/chat_completion.py b/inference/chat_completion.py
index 274f381f9c0f88c75b11c671665c452ec39b4483..bc5311d62ac735ced4043357ebae0144cf335884 100644
--- a/inference/chat_completion.py
+++ b/inference/chat_completion.py
@@ -62,13 +62,11 @@ def main(
     tokenizer = LlamaTokenizer.from_pretrained(model_name)
     tokenizer.add_special_tokens(
         {
-            "eos_token": "</s>",
-            "bos_token": "</s>",
-            "unk_token": "</s>",
-            "pad_token": "[PAD]",
+
+            "pad_token": "<PAD>",
         }
     )
-
+
     chats = format_tokens(dialogs, tokenizer)
 
     with torch.no_grad():
diff --git a/inference/inference.py b/inference/inference.py
index b6398d407f0cf95e9c7d22ee91689bff0ec3d620..58c094a5f4c2adc362a45a6a253378295ecdfe7d 100644
--- a/inference/inference.py
+++ b/inference/inference.py
@@ -7,6 +7,7 @@ import fire
 import torch
 import os
 import sys
+import time
 from typing import List
 
 from transformers import LlamaTokenizer
@@ -49,15 +50,13 @@ def main(
     # Set the seeds for reproducibility
     torch.cuda.manual_seed(seed)
     torch.manual_seed(seed)
+
     model = load_model(model_name, quantization)
-
     tokenizer = LlamaTokenizer.from_pretrained(model_name)
     tokenizer.add_special_tokens(
         {
-            "eos_token": "</s>",
-            "bos_token": "</s>",
-            "unk_token": "</s>",
-            "pad_token": "[PAD]",
+
+            "pad_token": "<PAD>",
         }
     )
 
@@ -88,7 +87,7 @@ def main(
     batch = tokenizer(user_prompt, return_tensors="pt")
 
     batch = {k: v.to("cuda") for k, v in batch.items()}
-
+    start = time.perf_counter()
     with torch.no_grad():
         outputs = model.generate(
             **batch,
@@ -103,7 +102,8 @@ def main(
             length_penalty=length_penalty,
             **kwargs
         )
-
+    e2e_inference_time = (time.perf_counter()-start)*1000
+    print(f"the inference time is {e2e_inference_time} ms")
     output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     # Safety check of the model output
diff --git a/llama_finetuning.py b/llama_finetuning.py
index 02aebdecb721776659cdaadd6de93e7ca4ef907b..4e324140781dabc6fd31a8915fe6293951346b9f 100644
--- a/llama_finetuning.py
+++ b/llama_finetuning.py
@@ -109,13 +109,11 @@ def main(**kwargs):
     # Load the tokenizer and add special tokens
     tokenizer = LlamaTokenizer.from_pretrained(train_config.model_name)
    tokenizer.add_special_tokens(
-        {
-            "eos_token": "</s>",
-            "bos_token": "</s>",
-            "unk_token": "</s>",
-            "pad_token": '[PAD]',
-        }
-    )
+        {
+
+            "pad_token": "<PAD>",
+        }
+    )
     if train_config.use_peft:
         peft_config = generate_peft_config(train_config, kwargs)
         model = get_peft_model(model, peft_config)