diff --git a/src/llama_recipes/datasets/alpaca_dataset.py b/src/llama_recipes/datasets/alpaca_dataset.py
index 21bd9643f8d23ef82be6440d24889909734d04a2..396551d1bdd98881a998337c8ef0bca88c3c644f 100644
--- a/src/llama_recipes/datasets/alpaca_dataset.py
+++ b/src/llama_recipes/datasets/alpaca_dataset.py
@@ -26,10 +26,12 @@ PROMPT_DICT = {
 class InstructionDataset(Dataset):
     def __init__(self, dataset_config, tokenizer, partition="train"):
         self.ann = json.load(open(dataset_config.data_path))
+        # Use 5% of the dataset for evaluation
+        eval_length = int(len(self.ann)/20)
         if partition == "train":
-            self.ann = self.ann[200:]
+            self.ann = self.ann[eval_length:]
         else:
-            self.ann = self.ann[:200]
+            self.ann = self.ann[:eval_length]
 
         self.tokenizer = tokenizer
 
diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py
index 3fef7222b486b8087a29d7b65710a03dfbcc424d..0bfea28839646b92d184902d5b796e99abe3620a 100644
--- a/src/llama_recipes/finetuning.py
+++ b/src/llama_recipes/finetuning.py
@@ -250,6 +250,10 @@ def main(**kwargs):
             pin_memory=True,
             **val_dl_kwargs,
         )
+        if len(eval_dataloader) == 0:
+            raise ValueError("The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set.")
+        else:
+            print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
 
     # Initialize the optimizer and learning rate scheduler
     if fsdp_config.pure_bf16 and fsdp_config.optimizer == "anyprecision":