From 30bc536d5f4fbca2824c65f1b165207f0e69dbe7 Mon Sep 17 00:00:00 2001
From: Kai Wu <kaiwu@meta.com>
Date: Mon, 13 Jan 2025 17:49:28 -0800
Subject: [PATCH] remove double BOS manually during fine-tuning

---
 recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py b/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py
index dabb7302..f5948e15 100644
--- a/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py
+++ b/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py
@@ -26,11 +26,11 @@ def replace_target(target, seq):
 
 def tokenize_dialogs(dialogs, images, processor):
     text_prompt = processor.apply_chat_template(dialogs)
+    text_prompt = [prompt.replace('<|begin_of_text|>','') for prompt in text_prompt]
     batch = processor(
         images=images,
         text=text_prompt,
         padding=True,
-        text_kwargs={"add_special_tokens": False},
         return_tensors="pt",
     )
     label_list = []
@@ -137,3 +137,4 @@ class OCRVQADataCollator:
 
 def get_data_collator(processor):
     return OCRVQADataCollator(processor)
+
-- 
GitLab