# (review: removed stray "Newer"/"Older" paste artifacts — as bare names they
# would raise NameError at runtime and are not part of the script)
# This has support for all the IDEFICS2/3 models
from transformers.image_utils import load_image
from transformers import AutoProcessor, AutoModelForVision2Seq
from tqdm import tqdm
# Single definition of the target device (was hard-coded as 'cuda' twice).
device = "cuda"

# Load the vision-to-text model and its matching processor once, up front.
# NOTE(review): `model_path` and `queries` must be defined earlier in the
# file (not visible in this chunk) — confirm.
model = AutoModelForVision2Seq.from_pretrained(model_path).to(device)
processor = AutoProcessor.from_pretrained(model_path)

# Collect every answer keyed by query id. The original loop overwrote
# `response` on each iteration, so only the final query's answer survived.
responses = {}
for key in tqdm(queries):
    query = queries[key]["question"]
    image = load_image(queries[key]["figure_path"])

    # Chat-style prompt: one image placeholder followed by the question text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": f"{query}"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    # Renamed comprehension variables so they no longer shadow the loop key.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Greedy decoding (do_sample=False) keeps the output deterministic.
    generated_ids = model.generate(**inputs, max_new_tokens=500, do_sample=False)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # Keep only the assistant's part of the decoded conversation.
    response = decoded[0].split("Assistant: ")[-1]
    responses[key] = response