From 97ca61aacddaa9457f8111dbbdfef2562212df1f Mon Sep 17 00:00:00 2001
From: Logan <logan.markewich@live.com>
Date: Thu, 27 Feb 2025 15:54:26 -0600
Subject: [PATCH] refactor openai multimodal (#17951)
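
Consolidate the multi-modal OpenAI and Azure OpenAI integrations on top of
the regular `OpenAI`/`AzureOpenAI` LLMs. `OpenAIMultiModal` and
`AzureOpenAIMultiModal` no longer manage their own OpenAI clients; they
build a single user `ChatMessage` whose `ImageBlock`s are derived from the
supplied `ImageNode`s and delegate to the underlying chat endpoints. The
deprecated `max_new_tokens` kwarg is mapped to `max_tokens` in
`OpenAI.__init__` for backwards compatibility.

A minimal usage sketch of the refactored class (the API key, model name and
image URL below are illustrative placeholders, not part of this patch):

    from llama_index.core.schema import ImageNode
    from llama_index.multi_modal_llms.openai import OpenAIMultiModal

    # placeholder credentials/model; any chat-capable multimodal model works
    llm = OpenAIMultiModal(model="gpt-4o", max_tokens=300, api_key="sk-...")

    # an ImageNode may carry an inline image, a local path, or a URL;
    # each one becomes an ImageBlock on the user ChatMessage
    image_docs = [ImageNode(image_url="https://example.com/diagram.png")]

    # complete() now routes through OpenAI.chat() and converts the chat
    # response back into a CompletionResponse
    resp = llm.complete(prompt="Describe this image", image_documents=image_docs)
    print(resp.text)

AzureOpenAIMultiModal exposes the same complete / stream_complete /
acomplete / astream_complete surface on top of AzureOpenAI, so existing
callers only need the usual `engine` / endpoint configuration.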

---
 .../llama_index/llms/openai/base.py           |   5 +
 .../llama-index-llms-openai/pyproject.toml    |   2 +-
 .../multi_modal_llms/azure_openai/base.py     | 250 ++++----
 .../pyproject.toml                            |   3 +-
 .../test_multi-modal-llms_azure_openai.py     |   8 +-
 .../pyproject.toml                            |   4 +-
 .../tests/test_multi_modal_llms_nebius.py     |   4 +-
 .../multi_modal_llms/openai/base.py           | 546 +++---------------
 .../multi_modal_llms/openai/utils.py          |  82 ---
 .../pyproject.toml                            |   4 +-
 .../tests/test_multi-modal-llms_openai.py     |   4 +-
 11 files changed, 194 insertions(+), 718 deletions(-)
 delete mode 100644 llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/utils.py

diff --git a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py
index 8c6cb28947..d32203ad45 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py
+++ b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py
@@ -266,6 +266,11 @@ class OpenAI(FunctionCallingLLM):
         audio_config: Optional[Dict[str, Any]] = None,
         **kwargs: Any,
     ) -> None:
+        # TODO: Support deprecated max_new_tokens
+        if "max_new_tokens" in kwargs:
+            max_tokens = kwargs["max_new_tokens"]
+            del kwargs["max_new_tokens"]
+
         additional_kwargs = additional_kwargs or {}
 
         api_key, api_base, api_version = resolve_openai_credentials(
diff --git a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
index caa348458a..2670759798 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
@@ -29,7 +29,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-openai"
 readme = "README.md"
-version = "0.3.23"
+version = "0.3.24"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py
index b02c38b051..8ba378507f 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py
@@ -1,158 +1,116 @@
-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Any, Optional, Sequence
+from pathlib import Path
 
-import httpx
-from llama_index.core.bridge.pydantic import Field, PrivateAttr
-from llama_index.core.callbacks import CallbackManager
-from llama_index.core.constants import (
-    DEFAULT_CONTEXT_WINDOW,
-    DEFAULT_NUM_OUTPUTS,
-    DEFAULT_TEMPERATURE,
+from llama_index.core.base.llms.generic_utils import (
+    chat_response_to_completion_response,
+    stream_chat_response_to_completion_response,
+    astream_chat_response_to_completion_response,
 )
-from llama_index.core.base.llms.generic_utils import get_from_param_or_env
-from llama_index.core.multi_modal_llms import MultiModalLLMMetadata
-from llama_index.llms.azure_openai.utils import (
-    refresh_openai_azuread_token,
-    resolve_from_aliases,
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    MessageRole,
+    ImageBlock,
 )
-from llama_index.multi_modal_llms.openai import OpenAIMultiModal
-from openai.lib.azure import AsyncAzureOpenAI
-from openai.lib.azure import AzureOpenAI as SyncAzureOpenAI
+from llama_index.core.schema import ImageNode
+from llama_index.llms.azure_openai import AzureOpenAI
 
 
-class AzureOpenAIMultiModal(OpenAIMultiModal):
-    """
-    Azure OpenAI.
-
-    To use this, you must first deploy a model on Azure OpenAI.
-    Unlike OpenAI, you need to specify a `engine` parameter to identify
-    your deployment (called "model deployment name" in Azure portal).
-
-    - model: Name of the model (e.g. `text-davinci-003`)
-        This in only used to decide completion vs. chat endpoint.
-    - engine: This will correspond to the custom name you chose
-        for your deployment when you deployed a model.
-
-    You must have the following environment variables set:
-    - `OPENAI_API_VERSION`: set this to `2023-05-15`
-        This may change in the future.
-    - `AZURE_OPENAI_ENDPOINT`: your endpoint should look like the following
-        https://YOUR_RESOURCE_NAME.openai.azure.com/
-    - `AZURE_OPENAI_API_KEY`: your API key if the api type is `azure`
-
-    More information can be found here:
-        https://learn.microsoft.com/en-us/azure/cognitive-services/openai/quickstart?tabs=command-line&pivots=programming-language-python
-    """
-
-    engine: str = Field(description="The name of the deployed azure engine.")
-    azure_endpoint: Optional[str] = Field(
-        default=None, description="The Azure endpoint to use."
-    )
-    azure_deployment: Optional[str] = Field(
-        default=None, description="The Azure deployment to use."
-    )
-    use_azure_ad: bool = Field(
-        description="Indicates if Microsoft Entra ID (former Azure AD) is used for token authentication"
-    )
-
-    _azure_ad_token: Any = PrivateAttr(default=None)
+class AzureOpenAIMultiModal(AzureOpenAI):
+    @classmethod
+    def class_name(cls) -> str:
+        return "azure_openai_multi_modal_llm"
 
-    def __init__(
+    def _get_multi_modal_chat_message(
         self,
-        model: str = "gpt-4-vision-preview",
-        engine: Optional[str] = None,
-        temperature: float = DEFAULT_TEMPERATURE,
-        max_new_tokens: Optional[int] = 300,
-        additional_kwargs: Optional[Dict[str, Any]] = None,
-        context_window: Optional[int] = DEFAULT_CONTEXT_WINDOW,
-        max_retries: int = 3,
-        timeout: float = 60.0,
-        image_detail: str = "low",
-        api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-        api_version: Optional[str] = None,
-        # azure specific
-        azure_endpoint: Optional[str] = None,
-        azure_deployment: Optional[str] = None,
-        use_azure_ad: bool = False,
-        # aliases for engine
-        deployment_name: Optional[str] = None,
-        deployment_id: Optional[str] = None,
-        deployment: Optional[str] = None,
-        messages_to_prompt: Optional[Callable] = None,
-        completion_to_prompt: Optional[Callable] = None,
-        callback_manager: Optional[CallbackManager] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        http_client: Optional[httpx.Client] = None,
+        prompt: str,
+        role: str,
+        image_documents: Sequence[ImageNode],
+        image_detail: Optional[str] = "low",
         **kwargs: Any,
-    ) -> None:
-        engine = resolve_from_aliases(
-            engine, deployment_name, deployment_id, deployment, azure_deployment
+    ) -> ChatMessage:
+        chat_msg = ChatMessage(role=role, content=prompt)
+        if not image_documents:
+            # if image_documents is empty, return text only chat message
+            return chat_msg
+
+        for image_document in image_documents:
+            # Create the appropriate ContentBlock depending on the document content
+            if image_document.image:
+                chat_msg.blocks.append(
+                    ImageBlock(
+                        image=bytes(image_document.image, encoding="utf-8"),
+                        detail=image_detail,
+                    )
+                )
+            elif image_document.image_url:
+                chat_msg.blocks.append(
+                    ImageBlock(url=image_document.image_url, detail=image_detail)
+                )
+            elif image_document.image_path:
+                chat_msg.blocks.append(
+                    ImageBlock(
+                        path=Path(image_document.image_path),
+                        detail=image_detail,
+                        image_mimetype=image_document.image_mimetype
+                        or image_document.metadata.get("file_type"),
+                    )
+                )
+            elif f_path := image_document.metadata.get("file_path"):
+                chat_msg.blocks.append(
+                    ImageBlock(
+                        path=Path(f_path),
+                        detail=image_detail,
+                        image_mimetype=image_document.metadata.get("file_type"),
+                    )
+                )
+
+        return chat_msg
+
+    def complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponse:
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
         )
-
-        if engine is None:
-            raise ValueError("You must specify an `engine` parameter.")
-
-        azure_endpoint = get_from_param_or_env(
-            "azure_endpoint", azure_endpoint, "AZURE_OPENAI_ENDPOINT", ""
+        chat_response = self.chat([chat_message], **kwargs)
+        return chat_response_to_completion_response(chat_response)
+
+    def stream_complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponseGen:
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
         )
-        super().__init__(
-            engine=engine,
-            model=model,
-            temperature=temperature,
-            max_new_tokens=max_new_tokens,
-            additional_kwargs=additional_kwargs,
-            context_window=context_window,
-            max_retries=max_retries,
-            timeout=timeout,
-            image_detail=image_detail,
-            api_key=api_key,
-            api_base=api_base,
-            api_version=api_version,
-            messages_to_prompt=messages_to_prompt,
-            completion_to_prompt=completion_to_prompt,
-            callback_manager=callback_manager,
-            azure_endpoint=azure_endpoint,
-            azure_deployment=azure_deployment,
-            use_azure_ad=use_azure_ad,
-            default_headers=default_headers,
-            http_client=http_client,
-            **kwargs,
+        chat_response = self.stream_chat([chat_message], **kwargs)
+        return stream_chat_response_to_completion_response(chat_response)
+
+    # ===== Async Endpoints =====
+
+    async def acomplete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponse:
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
         )
-
-    def _get_clients(self, **kwargs: Any) -> Tuple[SyncAzureOpenAI, AsyncAzureOpenAI]:
-        client = SyncAzureOpenAI(**self._get_credential_kwargs())
-        aclient = AsyncAzureOpenAI(**self._get_credential_kwargs())
-        return client, aclient
-
-    @classmethod
-    def class_name(cls) -> str:
-        return "azure_openai_multi_modal_llm"
-
-    @property
-    def metadata(self) -> MultiModalLLMMetadata:
-        """Multi Modal LLM metadata."""
-        return MultiModalLLMMetadata(
-            num_output=self.max_new_tokens or DEFAULT_NUM_OUTPUTS,
-            model_name=self.engine,
+        chat_response = await self.achat([chat_message], **kwargs)
+        return chat_response_to_completion_response(chat_response)
+
+    async def astream_complete(
+        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
+    ) -> CompletionResponseAsyncGen:
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
         )
-
-    def _get_credential_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
-        if self.use_azure_ad:
-            self._azure_ad_token = refresh_openai_azuread_token(self._azure_ad_token)
-            self.api_key = self._azure_ad_token.token
-
-        return {
-            "api_key": self.api_key or None,
-            "max_retries": self.max_retries,
-            "azure_endpoint": self.azure_endpoint,
-            "azure_deployment": self.azure_deployment,
-            "api_version": self.api_version,
-            "default_headers": self.default_headers,
-            "http_client": self._http_client,
-            "timeout": self.timeout,
-        }
-
-    def _get_model_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
-        model_kwargs = super()._get_model_kwargs(**kwargs)
-        model_kwargs["model"] = self.engine
-        return model_kwargs
+        chat_response = await self.astream_chat([chat_message], **kwargs)
+        return astream_chat_response_to_completion_response(chat_response)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml
index 382df96fba..4aba750aaf 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml
@@ -27,12 +27,11 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-multi-modal-llms-azure-openai"
 readme = "README.md"
-version = "0.3.1"
+version = "0.4.0"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
 llama-index-llms-azure-openai = "^0.3.0"
-llama-index-multi-modal-llms-openai = "^0.4.0"
 llama-index-core = "^0.12.0"
 
 [tool.poetry.group.dev.dependencies]
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py
index 81071c93f0..1d8c33feae 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py
@@ -1,12 +1,12 @@
-from llama_index.core.multi_modal_llms.base import MultiModalLLM
+from llama_index.llms.azure_openai import AzureOpenAI
 from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal
 
 
 def test_embedding_class():
     names_of_base_classes = [b.__name__ for b in AzureOpenAIMultiModal.__mro__]
-    assert MultiModalLLM.__name__ in names_of_base_classes
+    assert AzureOpenAI.__name__ in names_of_base_classes
 
 
 def test_init():
-    m = AzureOpenAIMultiModal(max_new_tokens=400, engine="fake", api_key="fake")
-    assert m.max_new_tokens == 400
+    m = AzureOpenAIMultiModal(max_tokens=400, engine="fake", api_key="fake")
+    assert m.max_tokens == 400
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml
index 0349c44a9e..24fcb199eb 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml
@@ -27,11 +27,11 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-multi-modal-llms-nebius"
 readme = "README.md"
-version = "0.3.1"
+version = "0.4.0"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
-llama-index-multi-modal-llms-openai = "^0.4.0"
+llama-index-multi-modal-llms-openai = "^0.5.0"
 llama-index-core = "^0.12.0"
 
 [tool.poetry.group.dev.dependencies]
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py
index 074f4811a8..d6fd00ee95 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py
@@ -1,7 +1,7 @@
-from llama_index.core.multi_modal_llms.base import MultiModalLLM
+from llama_index.multi_modal_llms.openai import OpenAIMultiModal
 from llama_index.multi_modal_llms.nebius import NebiusMultiModal
 
 
 def test_multi_modal_class():
     names_of_base_classes = [b.__name__ for b in NebiusMultiModal.__mro__]
-    assert MultiModalLLM.__name__ in names_of_base_classes
+    assert OpenAIMultiModal.__name__ in names_of_base_classes
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py
index 0245343a0b..25d7bf3800 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py
@@ -1,520 +1,116 @@
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast
+from typing import Any, Optional, Sequence
+from pathlib import Path
 
-import httpx
 from llama_index.core.base.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
+    chat_response_to_completion_response,
+    stream_chat_response_to_completion_response,
+    astream_chat_response_to_completion_response,
 )
 from llama_index.core.base.llms.types import (
     ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
     CompletionResponse,
     CompletionResponseAsyncGen,
     CompletionResponseGen,
     MessageRole,
+    ImageBlock,
 )
-from llama_index.core.bridge.pydantic import Field, PrivateAttr
-from llama_index.core.callbacks import CallbackManager
-from llama_index.core.constants import (
-    DEFAULT_CONTEXT_WINDOW,
-    DEFAULT_NUM_OUTPUTS,
-    DEFAULT_TEMPERATURE,
-)
-from llama_index.core.llms.callbacks import llm_chat_callback, llm_completion_callback
-from llama_index.core.multi_modal_llms import MultiModalLLM, MultiModalLLMMetadata
 from llama_index.core.schema import ImageNode
-from llama_index.llms.openai.utils import (
-    from_openai_message,
-    resolve_openai_credentials,
-    to_openai_message_dicts,
-    update_tool_calls,
-)
-from openai import AsyncOpenAI
-from openai import OpenAI as SyncOpenAI
-from openai.types.chat import ChatCompletionMessageParam
-from openai.types.chat.chat_completion_chunk import (
-    ChatCompletionChunk,
-    ChoiceDelta,
-    ChoiceDeltaToolCall,
-)
-
-from llama_index.multi_modal_llms.openai.utils import (
-    GPT4V_MODELS,
-    generate_openai_multi_modal_chat_message,
-)
-
-
-class OpenAIMultiModal(MultiModalLLM):
-    model: str = Field(description="The Multi-Modal model to use from OpenAI.")
-    temperature: float = Field(description="The temperature to use for sampling.")
-    max_new_tokens: Optional[int] = Field(
-        description=" The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt",
-        gt=0,
-    )
-    context_window: Optional[int] = Field(
-        description="The maximum number of context tokens for the model.",
-        gt=0,
-    )
-    image_detail: str = Field(
-        description="The level of details for image in API calls. Can be low, high, or auto"
-    )
-    max_retries: int = Field(
-        default=3,
-        description="Maximum number of retries.",
-        ge=0,
-    )
-    timeout: float = Field(
-        default=60.0,
-        description="The timeout, in seconds, for API requests.",
-        ge=0,
-    )
-    api_key: str = Field(default=None, description="The OpenAI API key.", exclude=True)
-    api_base: str = Field(default=None, description="The base URL for OpenAI API.")
-    api_version: str = Field(description="The API version for OpenAI API.")
-    additional_kwargs: Dict[str, Any] = Field(
-        default_factory=dict, description="Additional kwargs for the OpenAI API."
-    )
-    default_headers: Optional[Dict[str, str]] = Field(
-        default=None, description="The default headers for API requests."
-    )
+from llama_index.llms.openai import OpenAI
 
-    _messages_to_prompt: Callable = PrivateAttr()
-    _completion_to_prompt: Callable = PrivateAttr()
-    _client: SyncOpenAI = PrivateAttr()
-    _aclient: AsyncOpenAI = PrivateAttr()
-    _http_client: Optional[httpx.Client] = PrivateAttr()
-
-    def __init__(
-        self,
-        model: str = "gpt-4-vision-preview",
-        temperature: float = DEFAULT_TEMPERATURE,
-        max_new_tokens: Optional[int] = 300,
-        additional_kwargs: Optional[Dict[str, Any]] = None,
-        context_window: Optional[int] = DEFAULT_CONTEXT_WINDOW,
-        max_retries: int = 3,
-        timeout: float = 60.0,
-        image_detail: str = "low",
-        api_key: Optional[str] = None,
-        api_base: Optional[str] = None,
-        api_version: Optional[str] = None,
-        messages_to_prompt: Optional[Callable] = None,
-        completion_to_prompt: Optional[Callable] = None,
-        callback_manager: Optional[CallbackManager] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        http_client: Optional[httpx.Client] = None,
-        **kwargs: Any,
-    ) -> None:
-        api_key, api_base, api_version = resolve_openai_credentials(
-            api_key=api_key,
-            api_base=api_base,
-            api_version=api_version,
-        )
-
-        super().__init__(
-            model=model,
-            temperature=temperature,
-            max_new_tokens=max_new_tokens,
-            additional_kwargs=additional_kwargs or {},
-            context_window=context_window,
-            image_detail=image_detail,
-            max_retries=max_retries,
-            timeout=timeout,
-            api_key=api_key,
-            api_base=api_base,
-            api_version=api_version,
-            callback_manager=callback_manager,
-            default_headers=default_headers,
-            **kwargs,
-        )
-        self._messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
-        self._completion_to_prompt = completion_to_prompt or (lambda x: x)
-        self._http_client = http_client
-        self._client, self._aclient = self._get_clients(**kwargs)
-
-    def _get_clients(self, **kwargs: Any) -> Tuple[SyncOpenAI, AsyncOpenAI]:
-        client = SyncOpenAI(**self._get_credential_kwargs())
-        aclient = AsyncOpenAI(**self._get_credential_kwargs())
-        return client, aclient
 
+class OpenAIMultiModal(OpenAI):
     @classmethod
     def class_name(cls) -> str:
         return "openai_multi_modal_llm"
 
-    @property
-    def metadata(self) -> MultiModalLLMMetadata:
-        """Multi Modal LLM metadata."""
-        return MultiModalLLMMetadata(
-            num_output=self.max_new_tokens or DEFAULT_NUM_OUTPUTS,
-            model_name=self.model,
-        )
-
-    def _get_credential_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
-        return {
-            "api_key": self.api_key,
-            "base_url": self.api_base,
-            "max_retries": self.max_retries,
-            "default_headers": self.default_headers,
-            "http_client": self._http_client,
-            "timeout": self.timeout,
-            **kwargs,
-        }
-
-    def _get_multi_modal_chat_messages(
+    def _get_multi_modal_chat_message(
         self,
         prompt: str,
         role: str,
         image_documents: Sequence[ImageNode],
+        image_detail: Optional[str] = "low",
         **kwargs: Any,
-    ) -> List[ChatCompletionMessageParam]:
-        return to_openai_message_dicts(
-            [
-                generate_openai_multi_modal_chat_message(
-                    prompt=prompt,
-                    role=role,
-                    image_documents=image_documents,
-                    image_detail=self.image_detail,
+    ) -> ChatMessage:
+        chat_msg = ChatMessage(role=role, content=prompt)
+        if not image_documents:
+            # if image_documents is empty, return text only chat message
+            return chat_msg
+
+        for image_document in image_documents:
+            # Create the appropriate ContentBlock depending on the document content
+            if image_document.image:
+                chat_msg.blocks.append(
+                    ImageBlock(
+                        image=bytes(image_document.image, encoding="utf-8"),
+                        detail=image_detail,
+                    )
                 )
-            ]
-        )
-
-    # Model Params for OpenAI GPT4V model.
-    def _get_model_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
-        if self.model not in GPT4V_MODELS:
-            raise ValueError(
-                f"Invalid model {self.model}. "
-                f"Available models are: {list(GPT4V_MODELS.keys())}"
-            )
-        base_kwargs = {"model": self.model, "temperature": self.temperature, **kwargs}
-        if self.max_new_tokens is not None:
-            # If max_tokens is None, don't include in the payload:
-            # https://platform.openai.com/docs/api-reference/chat
-            # https://platform.openai.com/docs/api-reference/completions
-            base_kwargs["max_tokens"] = self.max_new_tokens
-        return {**base_kwargs, **self.additional_kwargs}
-
-    def _get_response_token_counts(self, raw_response: Any) -> dict:
-        """Get the token usage reported by the response."""
-        if not isinstance(raw_response, dict):
-            return {}
-
-        usage = raw_response.get("usage", {})
-        # NOTE: other model providers that use the OpenAI client may not report usage
-        if usage is None:
-            return {}
-
-        return {
-            "prompt_tokens": usage.get("prompt_tokens", 0),
-            "completion_tokens": usage.get("completion_tokens", 0),
-            "total_tokens": usage.get("total_tokens", 0),
-        }
-
-    @llm_completion_callback()
-    def _complete(
-        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
-    ) -> CompletionResponse:
-        all_kwargs = self._get_model_kwargs(**kwargs)
-        message_dict = self._get_multi_modal_chat_messages(
-            prompt=prompt, role=MessageRole.USER, image_documents=image_documents
-        )
-        response = self._client.chat.completions.create(
-            messages=message_dict,
-            stream=False,
-            **all_kwargs,
-        )
-
-        return CompletionResponse(
-            text=response.choices[0].message.content,
-            raw=response,
-            additional_kwargs=self._get_response_token_counts(response),
-        )
-
-    @llm_chat_callback()
-    def _chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
-        all_kwargs = self._get_model_kwargs(**kwargs)
-        message_dicts = to_openai_message_dicts(messages)
-        response = self._client.chat.completions.create(
-            messages=message_dicts,
-            stream=False,
-            **all_kwargs,
-        )
-        openai_message = response.choices[0].message
-        message = from_openai_message(openai_message)
-
-        return ChatResponse(
-            message=message,
-            raw=response,
-            additional_kwargs=self._get_response_token_counts(response),
-        )
-
-    @llm_completion_callback()
-    def _stream_complete(
-        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
-    ) -> CompletionResponseGen:
-        all_kwargs = self._get_model_kwargs(**kwargs)
-        message_dict = self._get_multi_modal_chat_messages(
-            prompt=prompt, role=MessageRole.USER, image_documents=image_documents
-        )
-
-        def gen() -> CompletionResponseGen:
-            text = ""
-
-            for response in self._client.chat.completions.create(
-                messages=message_dict,
-                stream=True,
-                **all_kwargs,
-            ):
-                response = cast(ChatCompletionChunk, response)
-                if len(response.choices) > 0:
-                    delta = response.choices[0].delta
-                else:
-                    delta = ChoiceDelta()
-
-                if delta is None:
-                    continue
-
-                # update using deltas
-                content_delta = delta.content or ""
-                text += content_delta
-
-                yield CompletionResponse(
-                    delta=content_delta,
-                    text=text,
-                    raw=response,
-                    additional_kwargs=self._get_response_token_counts(response),
+            elif image_document.image_url:
+                chat_msg.blocks.append(
+                    ImageBlock(url=image_document.image_url, detail=image_detail)
                 )
-
-        return gen()
-
-    @llm_chat_callback()
-    def _stream_chat(
-        self, messages: Sequence[ChatMessage], **kwargs: Any
-    ) -> ChatResponseGen:
-        message_dicts = to_openai_message_dicts(messages)
-
-        def gen() -> ChatResponseGen:
-            content = ""
-            tool_calls: List[ChoiceDeltaToolCall] = []
-
-            is_function = False
-            for response in self._client.chat.completions.create(
-                messages=message_dicts,
-                stream=True,
-                **self._get_model_kwargs(**kwargs),
-            ):
-                response = cast(ChatCompletionChunk, response)
-                if len(response.choices) > 0:
-                    delta = response.choices[0].delta
-                else:
-                    delta = ChoiceDelta()
-
-                if delta is None:
-                    continue
-
-                # check if this chunk is the start of a function call
-                if delta.tool_calls:
-                    is_function = True
-
-                # update using deltas
-                role = delta.role or MessageRole.ASSISTANT
-                content_delta = delta.content or ""
-                content += content_delta
-
-                additional_kwargs = {}
-                if is_function:
-                    tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
-                    additional_kwargs["tool_calls"] = tool_calls
-
-                yield ChatResponse(
-                    message=ChatMessage(
-                        role=role,
-                        content=content,
-                        additional_kwargs=additional_kwargs,
-                    ),
-                    delta=content_delta,
-                    raw=response,
-                    additional_kwargs=self._get_response_token_counts(response),
+            elif image_document.image_path:
+                chat_msg.blocks.append(
+                    ImageBlock(
+                        path=Path(image_document.image_path),
+                        detail=image_detail,
+                        image_mimetype=image_document.image_mimetype
+                        or image_document.metadata.get("file_type"),
+                    )
+                )
+            elif f_path := image_document.metadata.get("file_path"):
+                chat_msg.blocks.append(
+                    ImageBlock(
+                        path=Path(f_path),
+                        detail=image_detail,
+                        image_mimetype=image_document.metadata.get("file_type"),
+                    )
                 )
 
-        return gen()
+        return chat_msg
 
     def complete(
         self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
     ) -> CompletionResponse:
-        return self._complete(prompt, image_documents, **kwargs)
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
+        )
+        chat_response = self.chat([chat_message], **kwargs)
+        return chat_response_to_completion_response(chat_response)
 
     def stream_complete(
         self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
     ) -> CompletionResponseGen:
-        return self._stream_complete(prompt, image_documents, **kwargs)
-
-    def chat(
-        self,
-        messages: Sequence[ChatMessage],
-        **kwargs: Any,
-    ) -> ChatResponse:
-        return self._chat(messages, **kwargs)
-
-    def stream_chat(
-        self,
-        messages: Sequence[ChatMessage],
-        **kwargs: Any,
-    ) -> ChatResponseGen:
-        return self._stream_chat(messages, **kwargs)
-
-    # ===== Async Endpoints =====
-
-    @llm_completion_callback()
-    async def _acomplete(
-        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
-    ) -> CompletionResponse:
-        all_kwargs = self._get_model_kwargs(**kwargs)
-        message_dict = self._get_multi_modal_chat_messages(
-            prompt=prompt, role=MessageRole.USER, image_documents=image_documents
-        )
-        response = await self._aclient.chat.completions.create(
-            messages=message_dict,
-            stream=False,
-            **all_kwargs,
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
         )
+        chat_response = self.stream_chat([chat_message], **kwargs)
+        return stream_chat_response_to_completion_response(chat_response)
 
-        return CompletionResponse(
-            text=response.choices[0].message.content,
-            raw=response,
-            additional_kwargs=self._get_response_token_counts(response),
-        )
+    # ===== Async Endpoints =====
 
     async def acomplete(
         self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
     ) -> CompletionResponse:
-        return await self._acomplete(prompt, image_documents, **kwargs)
-
-    @llm_completion_callback()
-    async def _astream_complete(
-        self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
-    ) -> CompletionResponseAsyncGen:
-        all_kwargs = self._get_model_kwargs(**kwargs)
-        message_dict = self._get_multi_modal_chat_messages(
-            prompt=prompt, role=MessageRole.USER, image_documents=image_documents
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
         )
-
-        async def gen() -> CompletionResponseAsyncGen:
-            text = ""
-
-            async for response in await self._aclient.chat.completions.create(
-                messages=message_dict,
-                stream=True,
-                **all_kwargs,
-            ):
-                response = cast(ChatCompletionChunk, response)
-                if len(response.choices) > 0:
-                    delta = response.choices[0].delta
-                else:
-                    delta = ChoiceDelta()
-
-                if delta is None:
-                    continue
-
-                # update using deltas
-                content_delta = delta.content or ""
-                text += content_delta
-
-                yield CompletionResponse(
-                    delta=content_delta,
-                    text=text,
-                    raw=response,
-                    additional_kwargs=self._get_response_token_counts(response),
-                )
-
-        return gen()
-
-    @llm_chat_callback()
-    async def _achat(
-        self, messages: Sequence[ChatMessage], **kwargs: Any
-    ) -> ChatResponse:
-        all_kwargs = self._get_model_kwargs(**kwargs)
-        message_dicts = to_openai_message_dicts(messages)
-        response = await self._aclient.chat.completions.create(
-            messages=message_dicts,
-            stream=False,
-            **all_kwargs,
-        )
-        openai_message = response.choices[0].message
-        message = from_openai_message(openai_message)
-
-        return ChatResponse(
-            message=message,
-            raw=response,
-            additional_kwargs=self._get_response_token_counts(response),
-        )
-
-    @llm_chat_callback()
-    async def _astream_chat(
-        self, messages: Sequence[ChatMessage], **kwargs: Any
-    ) -> ChatResponseAsyncGen:
-        message_dicts = to_openai_message_dicts(messages)
-
-        async def gen() -> ChatResponseAsyncGen:
-            content = ""
-            tool_calls: List[ChoiceDeltaToolCall] = []
-
-            is_function = False
-            async for response in await self._aclient.chat.completions.create(
-                messages=message_dicts,
-                stream=True,
-                **self._get_model_kwargs(**kwargs),
-            ):
-                response = cast(ChatCompletionChunk, response)
-                if len(response.choices) > 0:
-                    delta = response.choices[0].delta
-                else:
-                    delta = ChoiceDelta()
-
-                if delta is None:
-                    continue
-
-                # check if this chunk is the start of a function call
-                if delta.tool_calls:
-                    is_function = True
-
-                # update using deltas
-                role = delta.role or MessageRole.ASSISTANT
-                content_delta = delta.content or ""
-                content += content_delta
-
-                additional_kwargs = {}
-                if is_function:
-                    tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
-                    additional_kwargs["tool_calls"] = tool_calls
-
-                yield ChatResponse(
-                    message=ChatMessage(
-                        role=role,
-                        content=content,
-                        additional_kwargs=additional_kwargs,
-                    ),
-                    delta=content_delta,
-                    raw=response,
-                    additional_kwargs=self._get_response_token_counts(response),
-                )
-
-        return gen()
+        chat_response = await self.achat([chat_message], **kwargs)
+        return chat_response_to_completion_response(chat_response)
 
     async def astream_complete(
         self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any
     ) -> CompletionResponseAsyncGen:
-        return await self._astream_complete(prompt, image_documents, **kwargs)
-
-    async def achat(
-        self,
-        messages: Sequence[ChatMessage],
-        **kwargs: Any,
-    ) -> ChatResponse:
-        return await self._achat(messages, **kwargs)
-
-    async def astream_chat(
-        self,
-        messages: Sequence[ChatMessage],
-        **kwargs: Any,
-    ) -> ChatResponseAsyncGen:
-        return await self._astream_chat(messages, **kwargs)
+        chat_message = self._get_multi_modal_chat_message(
+            prompt=prompt,
+            role=MessageRole.USER,
+            image_documents=image_documents,
+        )
+        chat_response = await self.astream_chat([chat_message], **kwargs)
+        return astream_chat_response_to_completion_response(chat_response)
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/utils.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/utils.py
deleted file mode 100644
index b77685687a..0000000000
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/utils.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import logging
-from pathlib import Path
-from typing import Optional, Sequence
-
-from llama_index.core.base.llms.types import ImageBlock
-from llama_index.core.multi_modal_llms.base import ChatMessage, ImageNode
-
-DEFAULT_OPENAI_API_TYPE = "open_ai"
-DEFAULT_OPENAI_API_BASE = "https://api.openai.com/v1"
-
-
-GPT4V_MODELS = {
-    "gpt-4-vision-preview": 128000,
-    "gpt-4-turbo-2024-04-09": 128000,
-    "gpt-4-turbo": 128000,
-    "gpt-4o": 128000,
-    "gpt-4o-2024-05-13": 128000,
-    "gpt-4o-2024-08-06": 128000,
-    "gpt-4o-2024-11-20": 128000,
-    "gpt-4o-mini": 128000,
-    "gpt-4o-mini-2024-07-18": 128000,
-    "o1": 200000,
-    "o1-2024-12-17": 200000,
-    "o3-mini": 200000,
-    "o3-mini-2025-01-31": 200000,
-}
-
-
-MISSING_API_KEY_ERROR_MESSAGE = """No API key found for OpenAI.
-Please set either the OPENAI_API_KEY environment variable or \
-openai.api_key prior to initialization.
-API keys can be found or created at \
-https://platform.openai.com/account/api-keys
-"""
-
-logger = logging.getLogger(__name__)
-
-
-def generate_openai_multi_modal_chat_message(
-    prompt: str,
-    role: str,
-    image_documents: Optional[Sequence[ImageNode]] = None,
-    image_detail: Optional[str] = "low",
-) -> ChatMessage:
-    """Create a ChatMessage to be used in a multimodal query."""
-    chat_msg = ChatMessage(role=role, content=prompt)
-    if image_documents is None:
-        # if image_documents is empty, return text only chat message
-        return chat_msg
-
-    for image_document in image_documents:
-        # Create the appropriate ContentBlock depending on the document content
-        if image_document.image:
-            chat_msg.blocks.append(
-                ImageBlock(
-                    image=bytes(image_document.image, encoding="utf-8"),
-                    detail=image_detail,
-                )
-            )
-        elif image_document.image_url:
-            chat_msg.blocks.append(
-                ImageBlock(url=image_document.image_url, detail=image_detail)
-            )
-        elif image_document.image_path:
-            chat_msg.blocks.append(
-                ImageBlock(
-                    path=Path(image_document.image_path),
-                    detail=image_detail,
-                    image_mimetype=image_document.image_mimetype
-                    or image_document.metadata.get("file_type"),
-                )
-            )
-        elif f_path := image_document.metadata.get("file_path"):
-            chat_msg.blocks.append(
-                ImageBlock(
-                    path=Path(f_path),
-                    detail=image_detail,
-                    image_mimetype=image_document.metadata.get("file_type"),
-                )
-            )
-
-    return chat_msg
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml
index cd6e99421c..45a8c2c9d7 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml
@@ -27,11 +27,11 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-multi-modal-llms-openai"
 readme = "README.md"
-version = "0.4.3"
+version = "0.5.0"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
-llama-index-llms-openai = "^0.3.0"
+llama-index-llms-openai = "^0.3.22"
 llama-index-core = "^0.12.3"
 
 [tool.poetry.group.dev.dependencies]
diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py
index 2327bdd18c..2e6b60e721 100644
--- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py
+++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py
@@ -1,7 +1,7 @@
-from llama_index.core.multi_modal_llms.base import MultiModalLLM
+from llama_index.llms.openai import OpenAI
 from llama_index.multi_modal_llms.openai import OpenAIMultiModal
 
 
 def test_embedding_class():
     names_of_base_classes = [b.__name__ for b in OpenAIMultiModal.__mro__]
-    assert MultiModalLLM.__name__ in names_of_base_classes
+    assert OpenAI.__name__ in names_of_base_classes
-- 
GitLab