From d4c17a78a1ebf81b096638cdec5d769199ff97bd Mon Sep 17 00:00:00 2001
From: Harsh Jaykumar Jalan <harshjalan27@yahoo.com>
Date: Wed, 26 Feb 2025 19:28:50 -0800
Subject: [PATCH] Support text-only prompts for LlamaMultiModal class (#17855)

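Text-only prompts previously failed for Llama 3.2 vision models:
_prepare_messages always injected a single {"type": "image"} content entry
and handed the (possibly empty) image list to the processor. The message
content is now built per image document, so the number of image entries
matches the images actually supplied, and None is passed to the processor
when there are no images, which deactivates image preprocessing.

A minimal usage sketch (the model name is illustrative, and the
from_model_name helper is assumed to dispatch to LlamaMultiModal as in the
rest of this integration; instantiating LlamaMultiModal directly with
model_name should behave the same):

    from llama_index.multi_modal_llms.huggingface import HuggingFaceMultiModal

    model = HuggingFaceMultiModal.from_model_name(
        "meta-llama/Llama-3.2-11B-Vision-Instruct"
    )

    # With this patch, an empty image_documents list no longer errors out:
    # the processor receives images=None and skips image preprocessing.
    response = model.complete(
        prompt="Summarize the Llama 3.2 model family.",
        image_documents=[],
    )
    print(response.text)
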
---
 .../multi_modal_llms/huggingface/base.py            | 13 +++++++++----
 .../pyproject.toml                                  |  2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
index 08634baf92..6aa065f14c 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
@@ -541,25 +541,30 @@ class LlamaMultiModal(HuggingFaceMultiModal):
         """
         Prepares the input messages and images for Llama3.2 models. Images are appended in a custom format.
         """
+        prompt = messages[0].content
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {"type": "image"},
-                    {"type": "text", "text": messages[0].content},
-                ],
+                "content": [],
             }
         ]
         images = []
 
         for img_doc in image_documents:
+            messages[0]["content"].append({"type": "image"})
             images.append(Image.open(img_doc.image_path))
 
+        messages[0]["content"].append({"type": "text", "text": prompt})
+
         # Apply a chat template to format the message with the processor
         input_text = self._processor.tokenizer.apply_chat_template(
             messages, add_generation_prompt=True, tokenize=False
         )
 
+        # If no images are present, pass None to deactivate image processing in the processor
+        if len(images) == 0:
+            images = None
+
         # Prepare the model inputs (text + images) and convert to tensor
         inputs = self._processor(images, input_text, return_tensors="pt")
         return inputs.to(self.device)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
index 5a344a9ccd..c5552b6cbc 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
@@ -30,7 +30,7 @@ license = "MIT"
 name = "llama-index-multi-modal-llms-huggingface"
 packages = [{include = "llama_index/"}]
 readme = "README.md"
-version = "0.4.1"
+version = "0.4.2"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
-- 
GitLab