From d4c17a78a1ebf81b096638cdec5d769199ff97bd Mon Sep 17 00:00:00 2001 From: Harsh Jaykumar Jalan <harshjalan27@yahoo.com> Date: Wed, 26 Feb 2025 19:28:50 -0800 Subject: [PATCH] Support text-only prompts for LlamaMultiModal class (#17855) --- .../multi_modal_llms/huggingface/base.py | 13 +++++++++---- .../pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py index 08634baf92..6aa065f14c 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py @@ -541,25 +541,30 @@ class LlamaMultiModal(HuggingFaceMultiModal): """ Prepares the input messages and images for Llama3.2 models. Images are appended in a custom format. """ + prompt = messages[0].content messages = [ { "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": messages[0].content}, - ], + "content": [], } ] images = [] for img_doc in image_documents: + messages[0]["content"].append({"type": "image"}) images.append(Image.open(img_doc.image_path)) + messages[0]["content"].append({"type": "text", "text": prompt}) + # Apply a chat template to format the message with the processor input_text = self._processor.tokenizer.apply_chat_template( messages, add_generation_prompt=True, tokenize=False ) + # If no images are present then we should pass None to deactivate image processing in the processor + if len(images) == 0: + images = None + # Prepare the model inputs (text + images) and convert to tensor inputs = self._processor(images, input_text, return_tensors="pt") return inputs.to(self.device) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml index 5a344a9ccd..c5552b6cbc 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml @@ -30,7 +30,7 @@ license = "MIT" name = "llama-index-multi-modal-llms-huggingface" packages = [{include = "llama_index/"}] readme = "README.md" -version = "0.4.1" +version = "0.4.2" [tool.poetry.dependencies] python = ">=3.9,<4.0" -- GitLab