Skip to content
Snippets Groups Projects
Commit 41a46d81 authored by Kai Wu, committed by Matthias Reso
Browse files

fix alpaca dataset by using 5% of the data as eval and make sure len(eval_loader) > 0

parent d24ea27b
No related branches found
No related tags found
No related merge requests found
......@@ -26,10 +26,12 @@ PROMPT_DICT = {
class InstructionDataset(Dataset):
def __init__(self, dataset_config, tokenizer, partition="train"):
self.ann = json.load(open(dataset_config.data_path))
# Use 5% of the dataset for evaluation
eval_length = int(len(self.ann)/20)
if partition == "train":
self.ann = self.ann[200:]
self.ann = self.ann[eval_length:]
else:
self.ann = self.ann[:200]
self.ann = self.ann[:eval_length]
self.tokenizer = tokenizer
......
......@@ -250,6 +250,10 @@ def main(**kwargs):
pin_memory=True,
**val_dl_kwargs,
)
if len(eval_dataloader) == 0:
raise ValueError("The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set.")
else:
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
# Initialize the optimizer and learning rate scheduler
if fsdp_config.pure_bf16 and fsdp_config.optimizer == "anyprecision":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment