# Adapted from https://huggingface.co/HuggingFaceM4/idefics2-8b
# This has support for all the IDEFICS2/3 models
import torch
from tqdm import tqdm
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image


def generate_response(queries, model_path, device=None):
    """Answer every query with a vision-to-sequence model, in place.

    For each key in ``queries`` the entry's ``'figure_path'`` is loaded as an
    image, paired with the entry's ``'question'`` text in a single-turn user
    message, and passed to the model. The decoded answer is stored back on
    the same entry under ``'response'``.

    Args:
        queries: Container that is iterated for keys and indexed by them;
            each entry must provide ``'question'`` and ``'figure_path'`` and
            accept item assignment for ``'response'``.
        model_path: Identifier or path handed to ``from_pretrained`` for both
            the model and its processor.
        device: Device the model and inputs are moved to. When ``None``
            (the default), ``'cuda'`` is used if available, else ``'cpu'``.

    Returns:
        None. Results are written into ``queries``.
    """
    if device is None:
        # Previously hard-coded to 'cuda', which fails on CPU-only hosts.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForVision2Seq.from_pretrained(model_path).to(device)
    processor = AutoProcessor.from_pretrained(model_path)

    for key in tqdm(queries):
        entry = queries[key]
        query = entry["question"]
        image = load_image(entry["figure_path"])

        # Single user turn: one image placeholder followed by the question.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": f"{query}"},
                ],
            }
        ]
        prompt = processor.apply_chat_template(
            messages, add_generation_prompt=True
        )
        inputs = processor(text=prompt, images=[image], return_tensors="pt")
        # Distinct names here so the outer loop key is never confused with
        # the processor's output field names.
        inputs = {name: value.to(device) for name, value in inputs.items()}

        # Greedy decoding (do_sample=False), capped at 500 new tokens.
        generated_ids = model.generate(
            **inputs, max_new_tokens=500, do_sample=False
        )
        decoded = processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )
        # The decoded text is split on the "Assistant: " marker and only the
        # last segment is kept as the answer.
        # NOTE(review): an answer that itself contains "Assistant: " would be
        # truncated to the text after its last occurrence.
        entry["response"] = decoded[0].split("Assistant: ")[-1]