diff --git a/.github/scripts/spellcheck_conf/wordlist.txt b/.github/scripts/spellcheck_conf/wordlist.txt
index f9ff571b50a336ee39f383580e051cf60475abbe..c81eefa01d20b101ce0642f4a633f7a91120fc0a 100644
--- a/.github/scripts/spellcheck_conf/wordlist.txt
+++ b/.github/scripts/spellcheck_conf/wordlist.txt
@@ -1484,3 +1484,4 @@ uv
 8xL40S
 xL
 EDA
+DeepLearningai
diff --git a/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md b/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md
index 0f27972e66dd8c687e54cd049684f7a8626c7c19..deeee9a9cdd1317c0f406ecfa410701305891719 100644
--- a/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md
+++ b/recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md
@@ -2,10 +2,10 @@
 The folder here contains the Llama 3 ported notebooks of the DLAI short course [Building Agentic RAG with Llamaindex](https://www.deeplearning.ai/short-courses/building-agentic-rag-with-llamaindex/).
-1. [Building Agentic RAG with Llamaindex L1 Router Engine](../../../quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb) shows how to implement a simple agentic RAG, a router that will pick up one of several query tools (question answering or summarization) to execute a query on a single document. Note this notebook is located in the `quickstart` folder.
+1. [Building Agentic RAG with Llamaindex L1 Router Engine](../../../quickstart/agents/DeepLearningai_Course_Notebooks/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb) shows how to implement a simple agentic RAG, a router that will pick one of several query tools (question answering or summarization) to execute a query on a single document. Note this notebook is located in the `quickstart` folder.
 2. [Building Agentic RAG with Llamaindex L2 Tool Calling](Building_Agentic_RAG_with_Llamaindex_L2_Tool_Calling.ipynb) shows how to use Llama 3 to not only pick a function to execute, but also infer an argument to pass to the function.
 3. [Building Agentic RAG with Llamaindex L3 Building an Agent Reasoning Loop](Building_Agentic_RAG_with_Llamaindex_L3_Building_an_Agent_Reasoning_Loop.ipynb) shows how to define a complete agent reasoning loop to reason over tools and multiple steps on a complex question the user asks about a single document while maintaining memory.
-3. [Building Agentic RAG with Llamaindex L4 Building a Multi-Document Agent](Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb) shows how to use an agent to handle multiple documents and increasing degrees of complexity.
\ No newline at end of file
+4. [Building Agentic RAG with Llamaindex L4 Building a Multi-Document Agent](Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb) shows how to use an agent to handle multiple documents and increasing degrees of complexity.
diff --git a/recipes/experimental/long_context/H2O/README.md b/recipes/experimental/long_context/H2O/README.md
index b73d8706a11235a95f2f1194dffbab91c58346b6..20167f50db59c3c0963f116515b758f0bffc2eb6 100644
--- a/recipes/experimental/long_context/H2O/README.md
+++ b/recipes/experimental/long_context/H2O/README.md
@@ -8,7 +8,7 @@ Besides, LLMs usually have poor generation to long sequence during inference. H2
 Current implementation supports llama-1/2/3, from 7B to 70B. Since H2O only maintains the most important KV pairs, it might miss some important information in the middle content for some knowledge-intensive tasks.
-More details please refer to Paper: **https://arxiv.org/pdf/2306.14048**; Blog: **https://allenz.work/?p=11**.
+For more details, please refer to the paper: **https://arxiv.org/pdf/2306.14048**.
 
 **Note: this implementation is tested with transformers == 4.39.0**
@@ -21,7 +21,7 @@ python run_summarization.py \
 --input-path data/summarization/xsum.jsonl \
 --output-path summarization_output/xsum_h2o.jsonl \
 --model-name meta-llama/Meta-Llama-3-8B \
---enable_h2o_generation
+--enable_h2o_generation
 ```
 
 ##### **Results**
diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py
index dab7fa25c3902dcccb809362eca1fb5a4c17ded4..548184e6ab85be6d473defd4d401afb9c9f1a093 100644
--- a/src/llama_recipes/finetuning.py
+++ b/src/llama_recipes/finetuning.py
@@ -1,61 +1,68 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-from collections import Counter
+import dataclasses
 import os
+import random
+from collections import Counter
+from warnings import warn
 
-import dataclasses
 import fire
-import random
+import numpy as np
 import torch
 import torch.optim as optim
-import numpy as np
-from peft import get_peft_model, PeftModel
-from torch.distributed.fsdp import (
-    FullyShardedDataParallel as FSDP,
-    ShardingStrategy
-)
-from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
-from torch.optim.lr_scheduler import StepLR
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    BitsAndBytesConfig,
-    AutoProcessor,
-    LlamaForCausalLM,
-    MllamaForConditionalGeneration,
-)
-from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-from transformers.models.mllama.modeling_mllama import MllamaSelfAttentionDecoderLayer,MllamaCrossAttentionDecoderLayer,MllamaVisionEncoderLayer
+from accelerate.utils import is_xpu_available
 
-from llama_recipes.configs import fsdp_config as FSDP_CONFIG
-from llama_recipes.configs import train_config as TRAIN_CONFIG
-from llama_recipes.configs import quantization_config as QUANTIZATION_CONFIG
+from llama_recipes.configs import (
+    fsdp_config as FSDP_CONFIG,
+    quantization_config as QUANTIZATION_CONFIG,
+    train_config as TRAIN_CONFIG,
+)
 from llama_recipes.data.concatenator import ConcatDataset
 from llama_recipes.policies import AnyPrecisionAdamW, apply_fsdp_checkpointing
 from llama_recipes.utils import fsdp_auto_wrap_policy
 from llama_recipes.utils.config_utils import (
-    update_config,
-    generate_peft_config,
+    check_fsdp_config,
     generate_dataset_config,
+    generate_peft_config,
     get_dataloader_kwargs,
-    check_fsdp_config,
+    update_config,
+)
+from llama_recipes.utils.dataset_utils import (
+    get_custom_data_collator,
+    get_preprocessed_dataset,
 )
-from llama_recipes.utils.dataset_utils import get_preprocessed_dataset,get_custom_data_collator
 from llama_recipes.utils.fsdp_utils import hsdp_device_mesh
 from llama_recipes.utils.train_utils import (
-    train,
+    clear_gpu_cache,
     freeze_transformer_layers,
+    get_policies,
+    print_model_size,
     setup,
     setup_environ_flags,
-    clear_gpu_cache,
-    print_model_size,
-    get_policies,
+    train,
+)
+from peft import get_peft_model, PeftModel
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy
+from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
+from torch.optim.lr_scheduler import StepLR
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    LlamaForCausalLM,
+    MllamaForConditionalGeneration,
+)
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.mllama.modeling_mllama import (
+    MllamaCrossAttentionDecoderLayer,
+    MllamaSelfAttentionDecoderLayer,
+    MllamaVisionEncoderLayer,
 )
-from accelerate.utils import is_xpu_available
-from warnings import warn
+
 
 def setup_wandb(train_config, fsdp_config, **kwargs):
     try:
@@ -66,6 +73,7 @@ def setup_wandb(train_config, fsdp_config, **kwargs):
             "Please install it using pip install wandb"
         )
     from llama_recipes.configs import wandb_config as WANDB_CONFIG
+
     wandb_config = WANDB_CONFIG()
     update_config(wandb_config, **kwargs)
     init_dict = dataclasses.asdict(wandb_config)
@@ -74,6 +82,7 @@ def setup_wandb(train_config, fsdp_config, **kwargs):
         run.config.update(fsdp_config, allow_val_change=True)
     return run
 
+
 def main(**kwargs):
     # Update the configuration for the training and sharding process
     train_config, fsdp_config = TRAIN_CONFIG(), FSDP_CONFIG()
@@ -103,18 +112,23 @@ def main(**kwargs):
 
     wandb_run = None
     if train_config.use_wandb:
-        if not train_config.enable_fsdp or rank==0:
+        if not train_config.enable_fsdp or rank == 0:
             wandb_run = setup_wandb(train_config, fsdp_config, **kwargs)
-
-    #setting quantization configs
+
+    # setting quantization configs
    bnb_config = None
     if train_config.quantization:
         if type(train_config.quantization) == type(True):
-            warn("Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", FutureWarning)
+            warn(
+                "Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.",
+                FutureWarning,
+            )
             train_config.quantization = "8bit"
 
         if train_config.quantization == "8bit" and train_config.enable_fsdp:
-            raise ValueError("8bit quantization is not supported with FSDP, please use 4bit quantization")
+            raise ValueError(
+                "8bit quantization is not supported with FSDP, please use 4bit quantization"
+            )
 
         quant_config = QUANTIZATION_CONFIG()
         update_config(quant_config, **kwargs)
@@ -126,14 +140,22 @@ def main(**kwargs):
     if config.model_type == "mllama":
         is_vision = True
         model = MllamaForConditionalGeneration.from_pretrained(
-            train_config.model_name,
-            quantization_config=bnb_config,
-            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
-            device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
-            torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
-        )
-        processor = AutoProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-        processor.tokenizer.padding_side='right'
+            train_config.model_name,
+            quantization_config=bnb_config,
+            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
+            device_map=(
+                "auto"
+                if train_config.quantization and not train_config.enable_fsdp
+                else None
+            ),
+            torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
+        )
+        processor = AutoProcessor.from_pretrained(
+            train_config.model_name
+            if train_config.tokenizer_name is None
+            else train_config.tokenizer_name
+        )
+        processor.tokenizer.padding_side = "right"
         model.supports_gradient_checkpointing = True
         model.language_model.supports_gradient_checkpointing = True
     elif config.model_type == "llama":
@@ -143,32 +165,50 @@ def main(**kwargs):
             quantization_config=bnb_config,
             use_cache=use_cache,
             attn_implementation="sdpa" if train_config.use_fast_kernels else None,
-            device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
+            device_map=(
+                "auto"
+                if train_config.quantization and not train_config.enable_fsdp
+                else None
+            ),
             torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
         )
     else:
-        raise ValueError(f"Model type {config.model_type} is not supported. Please use llama or mllama model.")
+        raise ValueError(
+            f"Model type {config.model_type} is not supported. Please use llama or mllama model."
+        )
 
     # Load the tokenizer and add special tokens
-    tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-    if not tokenizer.pad_token_id:
+    tokenizer = AutoTokenizer.from_pretrained(
+        train_config.model_name
+        if train_config.tokenizer_name is None
+        else train_config.tokenizer_name
+    )
+    if not tokenizer.pad_token_id:
         tokenizer.pad_token_id = tokenizer.eos_token_id
-
+
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
+        print(
+            "WARNING: Resizing the embedding matrix to match the tokenizer vocab size."
+        )
         model.resize_token_embeddings(len(tokenizer))
 
     print_model_size(model, train_config, rank if train_config.enable_fsdp else 0)
 
     # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled
-    if train_config.enable_fsdp and fsdp_config.pure_bf16 and not train_config.quantization:
+    if (
+        train_config.enable_fsdp
+        and fsdp_config.pure_bf16
+        and not train_config.quantization
+    ):
         model.to(torch.bfloat16)
-
+
     if train_config.use_peft:
         # Load the pre-trained peft model checkpoint and setup its configuration
         if train_config.from_peft_checkpoint:
-            model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True)
+            model = PeftModel.from_pretrained(
+                model, train_config.from_peft_checkpoint, is_trainable=True
+            )
             peft_config = model.peft_config
         # Generate the peft config and start fine-tuning from original model
         else:
@@ -179,23 +219,36 @@ def main(**kwargs):
         model.print_trainable_parameters()
 
     hsdp_device_mesh_plan = None
-    if fsdp_config.hsdp and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD:
-        hsdp_device_mesh_plan = hsdp_device_mesh(replica_group_size=fsdp_config.replica_group_size, sharding_group_size=fsdp_config.sharding_group_size)
+    if (
+        fsdp_config.hsdp
+        and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD
+    ):
+        hsdp_device_mesh_plan = hsdp_device_mesh(
+            replica_group_size=fsdp_config.replica_group_size,
+            sharding_group_size=fsdp_config.sharding_group_size,
+        )
         print("HSDP device mesh is ready")
 
-    #setting up FSDP if enable_fsdp is enabled
+    # setting up FSDP if enable_fsdp is enabled
     if train_config.enable_fsdp:
         check_fsdp_config(fsdp_config)
-
+
         if not train_config.use_peft and train_config.freeze_layers:
             freeze_transformer_layers(model, train_config.num_freeze_layers)
 
         mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank)
         # Create the FSDP wrapper for MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer and MllamaVisionEncoderLayer in vision models
         if is_vision:
-            my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [MllamaSelfAttentionDecoderLayer,MllamaSelfAttentionDecoderLayer,MllamaVisionEncoderLayer])
+            my_auto_wrapping_policy = fsdp_auto_wrap_policy(
+                model,
+                [
+                    MllamaSelfAttentionDecoderLayer,
+                    MllamaCrossAttentionDecoderLayer,
+                    MllamaVisionEncoderLayer,
+                ],
+            )
         else:
-            # Create the FSDP wrapper for LlamaDecoderLayer in text models
+            # Create the FSDP wrapper for LlamaDecoderLayer in text models
             my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, [LlamaDecoderLayer])
         device_id = 0
         if is_xpu_available():
@@ -204,21 +257,36 @@ def main(**kwargs):
             device_id = torch.xpu.current_device()
         else:
             device_id = torch.cuda.current_device()
         model = FSDP(
             model,
-            auto_wrap_policy= my_auto_wrapping_policy if train_config.use_peft else wrapping_policy,
-            cpu_offload=CPUOffload(offload_params=True) if fsdp_config.fsdp_cpu_offload else None,
-            mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None,
+            auto_wrap_policy=(
+                my_auto_wrapping_policy if train_config.use_peft else wrapping_policy
+            ),
+            cpu_offload=(
+                CPUOffload(offload_params=True)
+                if fsdp_config.fsdp_cpu_offload
+                else None
+            ),
+            mixed_precision=(
+                mixed_precision_policy if not fsdp_config.pure_bf16 else None
+            ),
             sharding_strategy=fsdp_config.sharding_strategy,
             device_mesh=hsdp_device_mesh_plan,
             device_id=device_id,
             limit_all_gathers=True,
             sync_module_states=train_config.low_cpu_fsdp,
-            param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False))
-            if train_config.low_cpu_fsdp and rank != 0 else None,
+            param_init_fn=(
+                (
+                    lambda module: module.to_empty(
+                        device=torch.device("cuda"), recurse=False
+                    )
+                )
+                if train_config.low_cpu_fsdp and rank != 0
+                else None
+            ),
         )
-        if fsdp_config.fsdp_activation_checkpointing:
+        if fsdp_config.fsdp_activation_checkpointing:
             model.enable_input_require_grads()
             model.gradient_checkpointing_enable()
-            apply_fsdp_checkpointing(model)
+            apply_fsdp_checkpointing(model)
     elif not train_config.quantization and not train_config.enable_fsdp:
         if is_xpu_available():
             model.to("xpu:0")
@@ -252,11 +320,15 @@ def main(**kwargs):
         if is_vision:
             raise ValueError("Packing is not supported for vision datasets")
         else:
-            dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)
+            dataset_train = ConcatDataset(
+                dataset_train, chunk_size=train_config.context_length
+            )
 
-    train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
+    train_dl_kwargs = get_dataloader_kwargs(
+        train_config, dataset_train, dataset_processer, "train"
+    )
     print("length of dataset_train", len(dataset_train))
-    custom_data_collator = get_custom_data_collator(dataset_processer,dataset_config)
+    custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
     if custom_data_collator:
         print("custom_data_collator is used")
         train_dl_kwargs["collate_fn"] = custom_data_collator
@@ -275,9 +347,13 @@ def main(**kwargs):
         if is_vision:
             raise ValueError("Packing is not supported for vision datasets")
         else:
-            dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)
+            dataset_val = ConcatDataset(
+                dataset_val, chunk_size=train_config.context_length
+            )
 
-        val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, dataset_processer, "val")
+        val_dl_kwargs = get_dataloader_kwargs(
+            train_config, dataset_val, dataset_processer, "val"
+        )
         if custom_data_collator:
             val_dl_kwargs["collate_fn"] = custom_data_collator
 
@@ -289,7 +365,9 @@ def main(**kwargs):
         )
         print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
         if len(eval_dataloader) == 0:
-            raise ValueError(f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})")
+            raise ValueError(
+                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
+            )
         else:
             print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
 
@@ -324,11 +402,12 @@ def main(**kwargs):
         rank if train_config.enable_fsdp else None,
         wandb_run,
     )
-    if not train_config.enable_fsdp or rank==0:
-        [print(f'Key: {k}, Value: {v}') for k, v in results.items()]
+    if not train_config.enable_fsdp or rank == 0:
+        [print(f"Key: {k}, Value: {v}") for k, v in results.items()]
         if train_config.use_wandb:
-            for k,v in results.items():
+            for k, v in results.items():
                 wandb_run.summary[k] = v
+
 
 if __name__ == "__main__":
     fire.Fire(main)
diff --git a/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py b/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py
index a8c5e646f7f64f248e2ad47a6fcfabe446e27a46..642459edda3f14f6a175912531492d5e87696f98 100644
--- a/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py
+++ b/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py
@@ -3,14 +3,15 @@
 
 # from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
-import fire
 import os
 import sys
+
+import fire
 import yaml
 
-from transformers import AutoTokenizer
+from llama_recipes.inference.model_utils import load_llama_from_config
 
-from llama_recipes.inference.model_utils import load_llama_from_config
+from transformers import AutoConfig, AutoTokenizer, MllamaProcessor
 
 # Get the current file's directory
 current_directory = os.path.dirname(os.path.abspath(__file__))
@@ -22,23 +23,24 @@ parent_directory = os.path.dirname(current_directory)
 sys.path.append(parent_directory)
 from model_checkpointing import load_sharded_model_single_gpu
 
+
 def main(
-    fsdp_checkpoint_path="", # Path to FSDP Sharded model checkpoints
-    consolidated_model_path="", # Path to save the HF converted model checkpoints
-    HF_model_path_or_name="" # Path/ name of the HF model that include config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf)
-    ):
-    
+    fsdp_checkpoint_path="",  # Path to FSDP sharded model checkpoints
+    consolidated_model_path="",  # Path to save the HF converted model checkpoints
+    HF_model_path_or_name="",  # Path/name of the HF model that includes config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf)
+):
+
     try:
-        file_name = 'train_params.yaml'
+        file_name = "train_params.yaml"
         # Combine the directory and file name to create the full path
         train_params_path = os.path.join(fsdp_checkpoint_path, file_name)
         # Open the file
-        with open(train_params_path, 'r') as file:
+        with open(train_params_path, "r") as file:
             # Load the YAML data
             data = yaml.safe_load(file)
 
         # Access the 'model_name' field
-        HF_model_path_or_name = data.get('model_name')
+        HF_model_path_or_name = data.get("model_name")
 
         print(f"Model name: {HF_model_path_or_name}")
     except FileNotFoundError:
@@ -47,19 +49,33 @@ def main(
             print(f"Model name: {HF_model_path_or_name}")
     except Exception as e:
         print(f"An error occurred: {e}")
-
-
-    #load the HF model definition from config
+
+    # load the HF model definition from config
     model_def = load_llama_from_config(HF_model_path_or_name)
     print("model is loaded from config")
-    #load the FSDP sharded checkpoints into the model
+    # load the FSDP sharded checkpoints into the model
     model = load_sharded_model_single_gpu(model_def, fsdp_checkpoint_path)
     print("model is loaded from FSDP checkpoints")
-    #loading the tokenizer form the model_path
-    tokenizer = AutoTokenizer.from_pretrained(HF_model_path_or_name)
-    tokenizer.save_pretrained(consolidated_model_path)
-    #save the FSDP sharded checkpoints in HF format
+    # load the model config from the model path
+    config = AutoConfig.from_pretrained(HF_model_path_or_name)
+    # save the processor and config for mllama models
+    if config.model_type == "mllama":
+        processor = MllamaProcessor.from_pretrained(HF_model_path_or_name)
+        processor.save_pretrained(consolidated_model_path)
+        print(
+            f"HuggingFace mllama processor has been saved in {consolidated_model_path}"
+        )
+    else:
+        # save the tokenizer for llama models
+        tokenizer = AutoTokenizer.from_pretrained(HF_model_path_or_name)
+        tokenizer.save_pretrained(consolidated_model_path)
+        print(
+            f"HuggingFace llama tokenizer has been saved in {consolidated_model_path}"
+        )
+    # save the FSDP sharded checkpoints in HF format
    model.save_pretrained(consolidated_model_path)
     print(f"HuggingFace model checkpoints have been saved in {consolidated_model_path}")
+
+
 if __name__ == "__main__":
     fire.Fire(main)
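
As a quick smoke test of the reformatted `finetuning.py` entry point, here is a minimal sketch of a PEFT run. The flag values are illustrative placeholders rather than anything prescribed by this PR; note that `--quantization` now expects the string `4bit` or `8bit` instead of a boolean.

```bash
# Illustrative invocation only; the model name and output path are placeholders.
python -m llama_recipes.finetuning \
    --model_name meta-llama/Meta-Llama-3-8B \
    --use_peft \
    --peft_method lora \
    --quantization 4bit \
    --output_dir PATH/to/save/peft_model
```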
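The updated checkpoint converter can be exercised the same way; a minimal sketch, assuming the package is installed and the placeholder paths are replaced with real ones (`fire` exposes the keyword arguments of `main()` as CLI flags). For an `mllama` checkpoint the script now saves the `MllamaProcessor`; otherwise it saves the tokenizer as before.

```bash
# Illustrative invocation only; checkpoint and output paths are placeholders.
python -m llama_recipes.inference.checkpoint_converter_fsdp_hf \
    --fsdp_checkpoint_path PATH/to/fsdp_sharded_checkpoints \
    --consolidated_model_path PATH/to/save/hf_model \
    --HF_model_path_or_name meta-llama/Llama-2-7b-chat-hf
```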