diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
index 08634baf92d05d979ec9f7dbf4d3c1f084843462..6aa065f14ceedcad5fb5ef3360ca71656c6c26bb 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/llama_index/multi_modal_llms/huggingface/base.py
@@ -541,25 +541,33 @@ class LlamaMultiModal(HuggingFaceMultiModal):
         """
         Prepares the input messages and images for Llama3.2 models. Images are appended in a custom format.
         """
+        prompt = messages[0].content
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {"type": "image"},
-                    {"type": "text", "text": messages[0].content},
-                ],
+                "content": [],
             }
         ]
         images = []
 
         for img_doc in image_documents:
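+            # One image placeholder per document; the chat template turns each into an image token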
+            messages[0]["content"].append({"type": "image"})
             images.append(Image.open(img_doc.image_path))
 
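+        # Append the text prompt after all image placeholders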
+        messages[0]["content"].append({"type": "text", "text": prompt})
+
         # Apply a chat template to format the message with the processor
         input_text = self._processor.tokenizer.apply_chat_template(
             messages, add_generation_prompt=True, tokenize=False
         )
 
+        # If no images are present, pass None so the processor skips image processing
+        if len(images) == 0:
+            images = None
+
         # Prepare the model inputs (text + images) and convert to tensor
         inputs = self._processor(images, input_text, return_tensors="pt")
         return inputs.to(self.device)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
index 5a344a9ccd1785f781a71b5757290e9945a25f97..c5552b6cbc7cb29405ef509c7fce4dfd22612fc2 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml
@@ -30,7 +30,7 @@ license = "MIT"
 name = "llama-index-multi-modal-llms-huggingface"
 packages = [{include = "llama_index/"}]
 readme = "README.md"
-version = "0.4.1"
+version = "0.4.2"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"