Commit e832f28a authored by zwcolin

update support for the following models:

- o1
- llama 3.2
- llava onevision
- molmo
- nvlm
- phi 3.5
- pixtral
- qwen2vl
parent 63ee484a
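Most backends below expose the same entry point, generate_response(queries, model_path), and write the model output back into the queries dict; the API-based o1/GPT backend instead takes a single image path and query (see its file below). The dispatcher diff at the end of the commit maps checkpoint names onto these modules. A minimal sketch of the shared queries contract (field names taken from the code; the example key and paths are hypothetical):

# Sketch of the input/output contract shared by the local backends in this commit.
# "question" and "figure_path" are read; "response" is written back in place.
queries = {
    "example_0": {                                              # hypothetical key
        "question": "What is the peak value in the figure?",    # hypothetical question
        "figure_path": "figures/example_0.png",                 # hypothetical path
    },
}
# After generate_response(queries, model_path) returns:
# queries["example_0"]["response"] holds the model's answer string.

llama32 backend — Llama 3.2 Vision inference (module names here and below follow the imports in the dispatcher diff):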
import requests
import torch
from PIL import Image
from tqdm import tqdm
from transformers import MllamaForConditionalGeneration, AutoProcessor
def generate_response(queries, model_path):
    model = MllamaForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto")
    processor = AutoProcessor.from_pretrained(model_path)
    for k in tqdm(queries):
        query = queries[k]['question']
        image = queries[k]["figure_path"]
        image = Image.open(image).convert('RGB')
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": query}
            ]}
        ]
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt"
        ).to(model.device)
        output = model.generate(**inputs, max_new_tokens=1024)
        response = processor.decode(output[0])
        response = response.split("<|start_header_id|>assistant<|end_header_id|>")[1].replace("<|eot_id|>", "").strip()
        queries[k]['response'] = response
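llavaov backend — LLaVA-OneVision inference via the LLaVA-NeXT codebase: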
# pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle
from PIL import Image
import requests
import copy
import torch
import sys
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")
def generate_response(queries, model_path):
    model_name = "llava_qwen"
    device = "cuda"
    device_map = "auto"
    tokenizer, model, image_processor, max_length = load_pretrained_model(model_path, None, model_name, device_map=device_map)  # Add any other thing you want to pass in llava_model_args
    model.eval()
    for k in tqdm(queries):
        query = queries[k]['question']
        image = queries[k]["figure_path"]
        image = Image.open(image).convert('RGB')
        image_tensor = process_images([image], image_processor, model.config)
        image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]
        conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models
        question = DEFAULT_IMAGE_TOKEN + "\n{}".format(query)
        conv = copy.deepcopy(conv_templates[conv_template])
        conv.append_message(conv.roles[0], question)
        conv.append_message(conv.roles[1], None)
        prompt_question = conv.get_prompt()
        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
        image_sizes = [image.size]
        cont = model.generate(
            input_ids,
            images=image_tensor,
            image_sizes=image_sizes,
            do_sample=False,
            temperature=0,
            max_new_tokens=4096,
        )
        text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
        queries[k]['response'] = text_outputs
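molmo backend — Molmo inference: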
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
from tqdm import tqdm
def generate_response(queries, model_path):
    processor = AutoProcessor.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype='auto',
        device_map='auto'
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype='auto',
        device_map='auto'
    )
    for k in tqdm(queries):
        query = queries[k]['question']
        image = queries[k]["figure_path"]
        image = Image.open(image).convert('RGB')
        inputs = processor.process(
            images=[image],
            text=query
        )
        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
        output = model.generate_from_batch(
            inputs,
            GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>"),
            tokenizer=processor.tokenizer
        )
        generated_tokens = output[0, inputs['input_ids'].size(1):]
        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        queries[k]['response'] = generated_text
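nvlm backend — NVLM-D-72B inference (dynamic image tiling plus a manual per-GPU layer split):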
import torch
from transformers import AutoTokenizer, AutoModel
import math
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from tqdm import tqdm
def split_model():
    device_map = {}
    world_size = torch.cuda.device_count()
    num_layers = 80
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images
def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values
def generate_response(queries, model_path):
    device_map = split_model()
    model = AutoModel.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=False,
        trust_remote_code=True,
        device_map=device_map).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
    generation_config = dict(max_new_tokens=1024, do_sample=False)
    for k in tqdm(queries):
        query = queries[k]['question']
        image = queries[k]["figure_path"]
        query = f'<image>\n{query}'
        pixel_values = load_image(image, max_num=12).to(torch.bfloat16)
        response = model.chat(tokenizer, pixel_values, query, generation_config)
        queries[k]['response'] = response
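o1 backend — o1/GPT models through the OpenAI Chat Completions API; note it takes a single image path and query rather than the whole queries dict: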
import base64
import requests
def get_client_model(model_path, api_key):
    assert api_key is not None, "API key is required for using GPT"
    assert model_path is not None, "Model name is required for using GPT"
    model = model_path
    client = None
    return client, model
def generate_response(image_path, query, model, media_type="image/jpeg", api_key=None, client=None, random_baseline=False):
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    # Getting the base64 string
    base64_image = encode_image(image_path)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    if not random_baseline:
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": query
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            "temperature": 1.0,
            "top_p": 1.0,
            "seed": 42
        }
    else:
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": query
                        }
                    ]
                }
            ],
            "temperature": 1.0,
            "top_p": 1.0,
            "seed": 42
        }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response = response.json()
    return response['choices'][0]['message']['content']
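phi35 backend — Phi-3.5-vision-instruct inference: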
from PIL import Image
import requests
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor
from tqdm import tqdm
def generate_response(queries, model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 device_map="cuda",
                                                 trust_remote_code=True,
                                                 torch_dtype="auto",
                                                 _attn_implementation='flash_attention_2')
    processor = AutoProcessor.from_pretrained(model_path,
                                              trust_remote_code=True,
                                              num_crops=16)
    for k in tqdm(queries):
        query = queries[k]['question']
        image = queries[k]["figure_path"]
        image = Image.open(image).convert('RGB')
        images = [image]
        query = f"<|image_1|>\n{query}"
        messages = [
            {"role": "user", "content": query}
        ]
        prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
        generation_args = {
            "max_new_tokens": 1000,
            "temperature": 0.0,
            "do_sample": False
        }
        generate_ids = model.generate(**inputs,
                                      eos_token_id=processor.tokenizer.eos_token_id,
                                      **generation_args)
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = processor.batch_decode(generate_ids,
                                          skip_special_tokens=True,
                                          clean_up_tokenization_spaces=False)[0]
        print(response)
        queries[k]['response'] = response
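pixtral backend — Pixtral-12B-2409 inference via mistral_inference: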
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk, ImageChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from tqdm import tqdm
def generate_response(queries, model_path):
    tokenizer = MistralTokenizer.from_file(f"{model_path}/tekken.json")
    model = Transformer.from_folder(model_path)
    for k in tqdm(queries):
        query = queries[k]['question']
        image = queries[k]["figure_path"]
        image = Image.open(image).convert('RGB')
        completion_request = ChatCompletionRequest(messages=[UserMessage(content=[ImageChunk(image=image), TextChunk(text=query)])])
        encoded = tokenizer.encode_chat_completion(completion_request)
        images = encoded.images
        tokens = encoded.tokens
        out_tokens, _ = generate([tokens], model, images=[images], max_tokens=1024, temperature=0., eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
        response = tokenizer.decode(out_tokens[0])
        queries[k]['response'] = response
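qwen2 backend — Qwen2-VL inference: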
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
from tqdm import tqdm
import torch
def generate_response(queries, model_path):
    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
    )
    # default processor
    processor = AutoProcessor.from_pretrained(model_path)
    for k in tqdm(queries):
        query = queries[k]["question"]
        image = queries[k]["figure_path"]
        image = Image.open(image).convert("RGB")
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": query},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")
        generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        response = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        queries[k]["response"] = response
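Dispatcher diff — get_client_fn and get_generate_fn gain branches for the new backends: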
@@ -44,6 +44,11 @@ def get_client_fn(model_path):
                        'gpt-4-turbo-2024-04-09',
                        'gpt-4o-mini-2024-07-18']:
        from .gpt import get_client_model
    # o1
    elif model_path in ['o1-preview',
                        'o1-mini',
                        'o1-2024-12-17']:
        from .o1 import get_client_model
    # reka
    elif model_path in ['reka-core-20240415',
                        'reka-flash-20240226',
@@ -74,6 +79,35 @@ def get_generate_fn(model_path):
                        'claude-3-haiku-20240307',
                        'claude-3-5-sonnet-20240620']:
        from .claude import generate_response
    # llama 3.2
    elif model_name in ['Llama-3.2-11B-Vision-Instruct',
                        'Llama-3.2-90B-Vision-Instruct']:
        from .llama32 import generate_response
    # llavaov
    elif model_name in ['llava-onevision-qwen2-0.5b-ov',
                        'llava-onevision-qwen2-7b-ov',
                        'llava-onevision-qwen2-72b-ov-chat']:
        from .llavaov import generate_response
    # molmo
    elif model_name in ['Molmo-7B-D-0924',
                        'Molmo-7B-O-0924',
                        'Molmo-72B-0924',
                        'MolmoE-1B-0924',]:
        from .molmo import generate_response
    # nvlm
    elif model_name in ['NVLM-D-72B']:
        from .nvlm import generate_response
    # phi35
    elif model_name in ['Phi-3.5-vision-instruct']:
        from .phi35 import generate_response
    # pixtral
    elif model_name in ['Pixtral-12B-2409']:
        from .pixtral import generate_response
    # qwen2
    elif model_name in ['Qwen2-VL-2B-Instruct',
                        'Qwen2-VL-7B-Instruct',
                        'Qwen2-VL-72B-Instruct']:
        from .qwen2 import generate_response
    # deepseekvl
    elif model_name in ['deepseek-vl-7b-chat']:
        from .deepseekvl import generate_response
@@ -90,6 +124,11 @@ def get_generate_fn(model_path):
                        'gpt-4-turbo-2024-04-09',
                        'gpt-4o-mini-2024-07-18']:
        from .gpt import generate_response
    # o1
    elif model_name in ['o1-preview',
                        'o1-mini',
                        'o1-2024-12-17']:
        from .o1 import generate_response
    # idefics2
    elif model_name in ['idefics2-8b',
                        'idefics2-8b-chatty',
......