From 30bc536d5f4fbca2824c65f1b165207f0e69dbe7 Mon Sep 17 00:00:00 2001
From: Kai Wu <kaiwu@meta.com>
Date: Mon, 13 Jan 2025 17:49:28 -0800
Subject: [PATCH] Manually remove duplicate BOS token during fine-tuning

---
 recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py b/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py
index dabb7302..f5948e15 100644
--- a/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py
+++ b/recipes/quickstart/finetuning/datasets/ocrvqa_dataset.py
@@ -26,11 +26,11 @@ def replace_target(target, seq):
 
 def tokenize_dialogs(dialogs, images, processor):
     text_prompt = processor.apply_chat_template(dialogs)
+    text_prompt = [prompt.replace('<|begin_of_text|>','') for prompt in text_prompt]
     batch = processor(
         images=images,
         text=text_prompt,
         padding=True,
-        text_kwargs={"add_special_tokens": False},
         return_tensors="pt",
     )
     label_list = []
@@ -137,3 +137,4 @@ class OCRVQADataCollator:
 
 def get_data_collator(processor):
     return OCRVQADataCollator(processor)
+
-- 
GitLab