diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
index 08634baf92d05d979ec9f7dbf4d3c1f084843462..6aa065f14ceedcad5fb5ef3360ca71656c6c26bb 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
@@ -541,25 +541,30 @@ class LlamaMultiModal(HuggingFaceMultiModal):
         """
         Prepares the input messages and images for Llama3.2 models.
         Images are appended in a custom format.
         """
+        prompt = messages[0].content
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {"type": "image"},
-                    {"type": "text", "text": messages[0].content},
-                ],
+                "content": [],
             }
         ]
 
         images = []
         for img_doc in image_documents:
+            messages[0]["content"].append({"type": "image"})
             images.append(Image.open(img_doc.image_path))
+        messages[0]["content"].append({"type": "text", "text": prompt})
+
         # Apply a chat template to format the message with the processor
         input_text = self._processor.tokenizer.apply_chat_template(
             messages, add_generation_prompt=True, tokenize=False
         )
 
+        # If no images are present then we should pass None to deactivate image processing in the processor
+        if len(images) == 0:
+            images = None
+
         # Prepare the model inputs (text + images) and convert to tensor
         inputs = self._processor(images, input_text, return_tensors="pt")
         return inputs.to(self.device)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
index 5a344a9ccd1785f781a71b5757290e9945a25f97..c5552b6cbc7cb29405ef509c7fce4dfd22612fc2 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
@@ -30,7 +30,7 @@ license = "MIT"
name = "llama-index-multi-modal-llms-huggingface"
packages = [{include = "llama_index/"}]
readme = "README.md"
-version = "0.4.1"
+version = "0.4.2"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
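
Reviewer note: the previous code always emitted exactly one {"type": "image"} placeholder, so the number of image slots in the templated prompt could disagree with the number of images actually handed to the processor (zero, or more than one). The patch appends one placeholder per image document, keeps the text block last, and passes images=None when there are no images so the processor runs text-only. Below is a minimal standalone sketch of that message layout for reference; build_llama32_inputs and image_paths are illustrative names, not part of the llama-index API.

# Minimal sketch of the patched message-building logic (illustrative only,
# not llama-index API): one image placeholder per image, text block last,
# and None instead of an empty image list so image processing is skipped.
from typing import Any, Dict, List, Optional, Tuple


def build_llama32_inputs(
    prompt: str, image_paths: List[str]
) -> Tuple[List[Dict[str, Any]], Optional[List[str]]]:
    # One {"type": "image"} entry per attached image document.
    content: List[Dict[str, Any]] = [{"type": "image"} for _ in image_paths]
    # The user's text prompt always comes after the image placeholders.
    content.append({"type": "text", "text": prompt})
    messages = [{"role": "user", "content": content}]
    # Empty list -> None, which deactivates image processing downstream.
    images = image_paths or None
    return messages, images


# Text-only: content holds just the text block and images is None.
print(build_llama32_inputs("Describe the scene.", []))
# Two images: two image placeholders precede the single text block.
print(build_llama32_inputs("Compare these.", ["a.png", "b.png"]))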