diff --git a/recipes/quickstart/inference/local_inference/multi_modal_infer.py b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
index a92482c3c884f5392c08926e973e84347cf557e4..1090610b65150dc83de42aaa55133d22aa7a68c4 100644
--- a/recipes/quickstart/inference/local_inference/multi_modal_infer.py
+++ b/recipes/quickstart/inference/local_inference/multi_modal_infer.py
@@ -7,6 +7,7 @@ from PIL import Image as PIL_Image
 from transformers import MllamaForConditionalGeneration, MllamaProcessor
 from peft import PeftModel
 import gradio as gr
+from huggingface_hub import login
 
 # Initialize accelerator
 accelerator = Accelerator()
@@ -17,9 +18,24 @@ DEFAULT_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 MAX_OUTPUT_TOKENS = 2048
 MAX_IMAGE_SIZE = (1120, 1120)
 
-def load_model_and_processor(model_name: str, hf_token: str = None, finetuning_path: str = None):
+def get_hf_token():
+    """Retrieve Hugging Face token from environment or local auth."""
+    token = os.getenv("HUGGINGFACE_TOKEN")
+    if token:
+        return token
+
+    # Check if the user is logged in via huggingface-cli
+    try:
+        login()  # Will use local authentication cache if available
+    except Exception as e:
+        print("Unable to authenticate with Hugging Face. Ensure you are logged in via `huggingface-cli login`.")
+        sys.exit(1)
+    return None
+
+def load_model_and_processor(model_name: str, finetuning_path: str = None):
     """Load model and processor with optional LoRA adapter"""
     print(f"Loading model: {model_name}")
+    hf_token = get_hf_token()
     model = MllamaForConditionalGeneration.from_pretrained(
         model_name,
         torch_dtype=torch.bfloat16,
@@ -60,7 +76,7 @@ def generate_text_from_image(model, processor, image, prompt_text: str, temperat
     output = model.generate(**inputs, temperature=temperature, top_p=top_p, max_new_tokens=MAX_OUTPUT_TOKENS)
     return processor.decode(output[0])[len(prompt):]
 
-def gradio_interface(model_name: str, hf_token: str):
+def gradio_interface(model_name: str):
     """Create Gradio UI with LoRA support"""
     # Initialize model state
     current_model = {"model": None, "processor": None}
@@ -68,7 +84,6 @@ def gradio_interface(model_name: str, hf_token: str):
     def load_or_reload_model(enable_lora: bool, lora_path: str = None):
         current_model["model"], current_model["processor"] = load_model_and_processor(
             model_name,
-            hf_token,
             lora_path if enable_lora else None
         )
         return "Model loaded successfully" + (" with LoRA" if enable_lora else "")
@@ -159,12 +174,11 @@ def gradio_interface(model_name: str, hf_token: str):
 def main(args):
     """Main execution flow"""
     if args.gradio_ui:
-        demo = gradio_interface(args.model_name, args.hf_token)
+        demo = gradio_interface(args.model_name)
         demo.launch()
     else:
         model, processor = load_model_and_processor(
             args.model_name,
-            args.hf_token,
             args.finetuning_path
         )
         image = process_image(image_path=args.image_path)
@@ -183,9 +197,8 @@ if __name__ == "__main__":
     parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
     parser.add_argument("--top_p", type=float, default=0.9, help="Top-p sampling")
     parser.add_argument("--model_name", type=str, default=DEFAULT_MODEL, help="Model name")
-    parser.add_argument("--hf_token", type=str, help="Hugging Face API token")
     parser.add_argument("--finetuning_path", type=str, help="Path to LoRA weights")
     parser.add_argument("--gradio_ui", action="store_true", help="Launch Gradio UI")
     args = parser.parse_args()
 
-    main(args)
\ No newline at end of file
+    main(args)