diff --git a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py index 8c6cb28947c5c16d053c7b6c3d626cdd925b8f0b..d32203ad45038aa75f6dc476c56fd943f179572a 100644 --- a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py +++ b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py @@ -266,6 +266,11 @@ class OpenAI(FunctionCallingLLM): audio_config: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> None: + # TODO: Support deprecated max_new_tokens + if "max_new_tokens" in kwargs: + max_tokens = kwargs["max_new_tokens"] + del kwargs["max_new_tokens"] + additional_kwargs = additional_kwargs or {} api_key, api_base, api_version = resolve_openai_credentials( diff --git a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml index caa348458ad5021f9b97dce0c654bba88ecc4265..2670759798b90b98f1241f5f0e87fab4cf1e35d8 100644 --- a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml +++ b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml @@ -29,7 +29,7 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-llms-openai" readme = "README.md" -version = "0.3.23" +version = "0.3.24" [tool.poetry.dependencies] python = ">=3.9,<4.0" diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py index b02c38b051c589fcde092008593aadc80b4836f4..8ba378507f5da54cd103df0a190e39cd666229a5 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/llama_index/multi_modal_llms/azure_openai/base.py @@ -1,158 +1,116 @@ -from typing import Any, Callable, Dict, Optional, Tuple +from typing import Any, Optional, Sequence +from pathlib import Path -import httpx -from llama_index.core.bridge.pydantic import Field, PrivateAttr -from llama_index.core.callbacks import CallbackManager -from llama_index.core.constants import ( - DEFAULT_CONTEXT_WINDOW, - DEFAULT_NUM_OUTPUTS, - DEFAULT_TEMPERATURE, +from llama_index.core.base.llms.generic_utils import ( + chat_response_to_completion_response, + stream_chat_response_to_completion_response, + astream_chat_response_to_completion_response, ) -from llama_index.core.base.llms.generic_utils import get_from_param_or_env -from llama_index.core.multi_modal_llms import MultiModalLLMMetadata -from llama_index.llms.azure_openai.utils import ( - refresh_openai_azuread_token, - resolve_from_aliases, +from llama_index.core.base.llms.types import ( + ChatMessage, + CompletionResponse, + CompletionResponseAsyncGen, + CompletionResponseGen, + MessageRole, + ImageBlock, ) -from llama_index.multi_modal_llms.openai import OpenAIMultiModal -from openai.lib.azure import AsyncAzureOpenAI -from openai.lib.azure import AzureOpenAI as SyncAzureOpenAI +from llama_index.core.schema import ImageNode +from llama_index.llms.azure_openai import AzureOpenAI -class AzureOpenAIMultiModal(OpenAIMultiModal): - """ - Azure OpenAI. - - To use this, you must first deploy a model on Azure OpenAI. 
- Unlike OpenAI, you need to specify a `engine` parameter to identify - your deployment (called "model deployment name" in Azure portal). - - - model: Name of the model (e.g. `text-davinci-003`) - This in only used to decide completion vs. chat endpoint. - - engine: This will correspond to the custom name you chose - for your deployment when you deployed a model. - - You must have the following environment variables set: - - `OPENAI_API_VERSION`: set this to `2023-05-15` - This may change in the future. - - `AZURE_OPENAI_ENDPOINT`: your endpoint should look like the following - https://YOUR_RESOURCE_NAME.openai.azure.com/ - - `AZURE_OPENAI_API_KEY`: your API key if the api type is `azure` - - More information can be found here: - https://learn.microsoft.com/en-us/azure/cognitive-services/openai/quickstart?tabs=command-line&pivots=programming-language-python - """ - - engine: str = Field(description="The name of the deployed azure engine.") - azure_endpoint: Optional[str] = Field( - default=None, description="The Azure endpoint to use." - ) - azure_deployment: Optional[str] = Field( - default=None, description="The Azure deployment to use." - ) - use_azure_ad: bool = Field( - description="Indicates if Microsoft Entra ID (former Azure AD) is used for token authentication" - ) - - _azure_ad_token: Any = PrivateAttr(default=None) +class AzureOpenAIMultiModal(AzureOpenAI): + @classmethod + def class_name(cls) -> str: + return "azure_openai_multi_modal_llm" - def __init__( + def _get_multi_modal_chat_message( self, - model: str = "gpt-4-vision-preview", - engine: Optional[str] = None, - temperature: float = DEFAULT_TEMPERATURE, - max_new_tokens: Optional[int] = 300, - additional_kwargs: Optional[Dict[str, Any]] = None, - context_window: Optional[int] = DEFAULT_CONTEXT_WINDOW, - max_retries: int = 3, - timeout: float = 60.0, - image_detail: str = "low", - api_key: Optional[str] = None, - api_base: Optional[str] = None, - api_version: Optional[str] = None, - # azure specific - azure_endpoint: Optional[str] = None, - azure_deployment: Optional[str] = None, - use_azure_ad: bool = False, - # aliases for engine - deployment_name: Optional[str] = None, - deployment_id: Optional[str] = None, - deployment: Optional[str] = None, - messages_to_prompt: Optional[Callable] = None, - completion_to_prompt: Optional[Callable] = None, - callback_manager: Optional[CallbackManager] = None, - default_headers: Optional[Dict[str, str]] = None, - http_client: Optional[httpx.Client] = None, + prompt: str, + role: str, + image_documents: Sequence[ImageNode], + image_detail: Optional[str] = "low", **kwargs: Any, - ) -> None: - engine = resolve_from_aliases( - engine, deployment_name, deployment_id, deployment, azure_deployment + ) -> ChatMessage: + chat_msg = ChatMessage(role=role, content=prompt) + if not image_documents: + # if image_documents is empty, return text only chat message + return chat_msg + + for image_document in image_documents: + # Create the appropriate ContentBlock depending on the document content + if image_document.image: + chat_msg.blocks.append( + ImageBlock( + image=bytes(image_document.image, encoding="utf-8"), + detail=image_detail, + ) + ) + elif image_document.image_url: + chat_msg.blocks.append( + ImageBlock(url=image_document.image_url, detail=image_detail) + ) + elif image_document.image_path: + chat_msg.blocks.append( + ImageBlock( + path=Path(image_document.image_path), + detail=image_detail, + image_mimetype=image_document.image_mimetype + or image_document.metadata.get("file_type"), + ) 
+ ) + elif f_path := image_document.metadata.get("file_path"): + chat_msg.blocks.append( + ImageBlock( + path=Path(f_path), + detail=image_detail, + image_mimetype=image_document.metadata.get("file_type"), + ) + ) + + return chat_msg + + def complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, ) - - if engine is None: - raise ValueError("You must specify an `engine` parameter.") - - azure_endpoint = get_from_param_or_env( - "azure_endpoint", azure_endpoint, "AZURE_OPENAI_ENDPOINT", "" + chat_response = self.chat([chat_message], **kwargs) + return chat_response_to_completion_response(chat_response) + + def stream_complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponseGen: + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, ) - super().__init__( - engine=engine, - model=model, - temperature=temperature, - max_new_tokens=max_new_tokens, - additional_kwargs=additional_kwargs, - context_window=context_window, - max_retries=max_retries, - timeout=timeout, - image_detail=image_detail, - api_key=api_key, - api_base=api_base, - api_version=api_version, - messages_to_prompt=messages_to_prompt, - completion_to_prompt=completion_to_prompt, - callback_manager=callback_manager, - azure_endpoint=azure_endpoint, - azure_deployment=azure_deployment, - use_azure_ad=use_azure_ad, - default_headers=default_headers, - http_client=http_client, - **kwargs, + chat_response = self.stream_chat([chat_message], **kwargs) + return stream_chat_response_to_completion_response(chat_response) + + # ===== Async Endpoints ===== + + async def acomplete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, ) - - def _get_clients(self, **kwargs: Any) -> Tuple[SyncAzureOpenAI, AsyncAzureOpenAI]: - client = SyncAzureOpenAI(**self._get_credential_kwargs()) - aclient = AsyncAzureOpenAI(**self._get_credential_kwargs()) - return client, aclient - - @classmethod - def class_name(cls) -> str: - return "azure_openai_multi_modal_llm" - - @property - def metadata(self) -> MultiModalLLMMetadata: - """Multi Modal LLM metadata.""" - return MultiModalLLMMetadata( - num_output=self.max_new_tokens or DEFAULT_NUM_OUTPUTS, - model_name=self.engine, + chat_response = await self.achat([chat_message], **kwargs) + return chat_response_to_completion_response(chat_response) + + async def astream_complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponseAsyncGen: + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, ) - - def _get_credential_kwargs(self, **kwargs: Any) -> Dict[str, Any]: - if self.use_azure_ad: - self._azure_ad_token = refresh_openai_azuread_token(self._azure_ad_token) - self.api_key = self._azure_ad_token.token - - return { - "api_key": self.api_key or None, - "max_retries": self.max_retries, - "azure_endpoint": self.azure_endpoint, - "azure_deployment": self.azure_deployment, - "api_version": self.api_version, - "default_headers": self.default_headers, - "http_client": self._http_client, - "timeout": self.timeout, - } - - 
def _get_model_kwargs(self, **kwargs: Any) -> Dict[str, Any]: - model_kwargs = super()._get_model_kwargs(**kwargs) - model_kwargs["model"] = self.engine - return model_kwargs + chat_response = await self.astream_chat([chat_message], **kwargs) + return astream_chat_response_to_completion_response(chat_response) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml index 382df96fba76d22b6f2e2b57922d08969a75e022..4aba750aaf65099b21a4073a3650e78ac819284e 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/pyproject.toml @@ -27,12 +27,11 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-multi-modal-llms-azure-openai" readme = "README.md" -version = "0.3.1" +version = "0.4.0" [tool.poetry.dependencies] python = ">=3.9,<4.0" llama-index-llms-azure-openai = "^0.3.0" -llama-index-multi-modal-llms-openai = "^0.4.0" llama-index-core = "^0.12.0" [tool.poetry.group.dev.dependencies] diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py index 81071c93f0f769d1ead0a44c1b76a0056bd16f84..1d8c33feaef9acc944c76281f441a0f94814bea6 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-azure-openai/tests/test_multi-modal-llms_azure_openai.py @@ -1,12 +1,12 @@ -from llama_index.core.multi_modal_llms.base import MultiModalLLM +from llama_index.llms.azure_openai import AzureOpenAI from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal def test_embedding_class(): names_of_base_classes = [b.__name__ for b in AzureOpenAIMultiModal.__mro__] - assert MultiModalLLM.__name__ in names_of_base_classes + assert AzureOpenAI.__name__ in names_of_base_classes def test_init(): - m = AzureOpenAIMultiModal(max_new_tokens=400, engine="fake", api_key="fake") - assert m.max_new_tokens == 400 + m = AzureOpenAIMultiModal(max_tokens=400, engine="fake", api_key="fake") + assert m.max_tokens == 400 diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml index 0349c44a9ebef80ed6be3613245790e8250ed5ed..24fcb199ebeee082dfbcd6fb13eb1456dae68a78 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/pyproject.toml @@ -27,11 +27,11 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-multi-modal-llms-nebius" readme = "README.md" -version = "0.3.1" +version = "0.4.0" [tool.poetry.dependencies] python = ">=3.9,<4.0" -llama-index-multi-modal-llms-openai = "^0.4.0" +llama-index-multi-modal-llms-openai = "^0.5.0" llama-index-core = "^0.12.0" [tool.poetry.group.dev.dependencies] diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py 
b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py index 074f4811a8d9da557f15798daaf20c8a3f863d8e..d6fd00ee956aeb7e5b765086692c4591d555bdd1 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-nebius/tests/test_multi_modal_llms_nebius.py @@ -1,7 +1,7 @@ -from llama_index.core.multi_modal_llms.base import MultiModalLLM +from llama_index.multi_modal_llms.openai import OpenAIMultiModal from llama_index.multi_modal_llms.nebius import NebiusMultiModal def test_multi_modal_class(): names_of_base_classes = [b.__name__ for b in NebiusMultiModal.__mro__] - assert MultiModalLLM.__name__ in names_of_base_classes + assert OpenAIMultiModal.__name__ in names_of_base_classes diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py index 0245343a0bc9d7992ab0eacb4eb113c83ce71738..25d7bf380039e30ba963f6ad60f99f1776f55894 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/base.py @@ -1,520 +1,116 @@ -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast +from typing import Any, Optional, Sequence +from pathlib import Path -import httpx from llama_index.core.base.llms.generic_utils import ( - messages_to_prompt as generic_messages_to_prompt, + chat_response_to_completion_response, + stream_chat_response_to_completion_response, + astream_chat_response_to_completion_response, ) from llama_index.core.base.llms.types import ( ChatMessage, - ChatResponse, - ChatResponseAsyncGen, - ChatResponseGen, CompletionResponse, CompletionResponseAsyncGen, CompletionResponseGen, MessageRole, + ImageBlock, ) -from llama_index.core.bridge.pydantic import Field, PrivateAttr -from llama_index.core.callbacks import CallbackManager -from llama_index.core.constants import ( - DEFAULT_CONTEXT_WINDOW, - DEFAULT_NUM_OUTPUTS, - DEFAULT_TEMPERATURE, -) -from llama_index.core.llms.callbacks import llm_chat_callback, llm_completion_callback -from llama_index.core.multi_modal_llms import MultiModalLLM, MultiModalLLMMetadata from llama_index.core.schema import ImageNode -from llama_index.llms.openai.utils import ( - from_openai_message, - resolve_openai_credentials, - to_openai_message_dicts, - update_tool_calls, -) -from openai import AsyncOpenAI -from openai import OpenAI as SyncOpenAI -from openai.types.chat import ChatCompletionMessageParam -from openai.types.chat.chat_completion_chunk import ( - ChatCompletionChunk, - ChoiceDelta, - ChoiceDeltaToolCall, -) - -from llama_index.multi_modal_llms.openai.utils import ( - GPT4V_MODELS, - generate_openai_multi_modal_chat_message, -) - - -class OpenAIMultiModal(MultiModalLLM): - model: str = Field(description="The Multi-Modal model to use from OpenAI.") - temperature: float = Field(description="The temperature to use for sampling.") - max_new_tokens: Optional[int] = Field( - description=" The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt", - gt=0, - ) - context_window: Optional[int] = Field( - description="The maximum 
number of context tokens for the model.", - gt=0, - ) - image_detail: str = Field( - description="The level of details for image in API calls. Can be low, high, or auto" - ) - max_retries: int = Field( - default=3, - description="Maximum number of retries.", - ge=0, - ) - timeout: float = Field( - default=60.0, - description="The timeout, in seconds, for API requests.", - ge=0, - ) - api_key: str = Field(default=None, description="The OpenAI API key.", exclude=True) - api_base: str = Field(default=None, description="The base URL for OpenAI API.") - api_version: str = Field(description="The API version for OpenAI API.") - additional_kwargs: Dict[str, Any] = Field( - default_factory=dict, description="Additional kwargs for the OpenAI API." - ) - default_headers: Optional[Dict[str, str]] = Field( - default=None, description="The default headers for API requests." - ) +from llama_index.llms.openai import OpenAI - _messages_to_prompt: Callable = PrivateAttr() - _completion_to_prompt: Callable = PrivateAttr() - _client: SyncOpenAI = PrivateAttr() - _aclient: AsyncOpenAI = PrivateAttr() - _http_client: Optional[httpx.Client] = PrivateAttr() - - def __init__( - self, - model: str = "gpt-4-vision-preview", - temperature: float = DEFAULT_TEMPERATURE, - max_new_tokens: Optional[int] = 300, - additional_kwargs: Optional[Dict[str, Any]] = None, - context_window: Optional[int] = DEFAULT_CONTEXT_WINDOW, - max_retries: int = 3, - timeout: float = 60.0, - image_detail: str = "low", - api_key: Optional[str] = None, - api_base: Optional[str] = None, - api_version: Optional[str] = None, - messages_to_prompt: Optional[Callable] = None, - completion_to_prompt: Optional[Callable] = None, - callback_manager: Optional[CallbackManager] = None, - default_headers: Optional[Dict[str, str]] = None, - http_client: Optional[httpx.Client] = None, - **kwargs: Any, - ) -> None: - api_key, api_base, api_version = resolve_openai_credentials( - api_key=api_key, - api_base=api_base, - api_version=api_version, - ) - - super().__init__( - model=model, - temperature=temperature, - max_new_tokens=max_new_tokens, - additional_kwargs=additional_kwargs or {}, - context_window=context_window, - image_detail=image_detail, - max_retries=max_retries, - timeout=timeout, - api_key=api_key, - api_base=api_base, - api_version=api_version, - callback_manager=callback_manager, - default_headers=default_headers, - **kwargs, - ) - self._messages_to_prompt = messages_to_prompt or generic_messages_to_prompt - self._completion_to_prompt = completion_to_prompt or (lambda x: x) - self._http_client = http_client - self._client, self._aclient = self._get_clients(**kwargs) - - def _get_clients(self, **kwargs: Any) -> Tuple[SyncOpenAI, AsyncOpenAI]: - client = SyncOpenAI(**self._get_credential_kwargs()) - aclient = AsyncOpenAI(**self._get_credential_kwargs()) - return client, aclient +class OpenAIMultiModal(OpenAI): @classmethod def class_name(cls) -> str: return "openai_multi_modal_llm" - @property - def metadata(self) -> MultiModalLLMMetadata: - """Multi Modal LLM metadata.""" - return MultiModalLLMMetadata( - num_output=self.max_new_tokens or DEFAULT_NUM_OUTPUTS, - model_name=self.model, - ) - - def _get_credential_kwargs(self, **kwargs: Any) -> Dict[str, Any]: - return { - "api_key": self.api_key, - "base_url": self.api_base, - "max_retries": self.max_retries, - "default_headers": self.default_headers, - "http_client": self._http_client, - "timeout": self.timeout, - **kwargs, - } - - def _get_multi_modal_chat_messages( + def 
_get_multi_modal_chat_message( self, prompt: str, role: str, image_documents: Sequence[ImageNode], + image_detail: Optional[str] = "low", **kwargs: Any, - ) -> List[ChatCompletionMessageParam]: - return to_openai_message_dicts( - [ - generate_openai_multi_modal_chat_message( - prompt=prompt, - role=role, - image_documents=image_documents, - image_detail=self.image_detail, + ) -> ChatMessage: + chat_msg = ChatMessage(role=role, content=prompt) + if not image_documents: + # if image_documents is empty, return text only chat message + return chat_msg + + for image_document in image_documents: + # Create the appropriate ContentBlock depending on the document content + if image_document.image: + chat_msg.blocks.append( + ImageBlock( + image=bytes(image_document.image, encoding="utf-8"), + detail=image_detail, + ) ) - ] - ) - - # Model Params for OpenAI GPT4V model. - def _get_model_kwargs(self, **kwargs: Any) -> Dict[str, Any]: - if self.model not in GPT4V_MODELS: - raise ValueError( - f"Invalid model {self.model}. " - f"Available models are: {list(GPT4V_MODELS.keys())}" - ) - base_kwargs = {"model": self.model, "temperature": self.temperature, **kwargs} - if self.max_new_tokens is not None: - # If max_tokens is None, don't include in the payload: - # https://platform.openai.com/docs/api-reference/chat - # https://platform.openai.com/docs/api-reference/completions - base_kwargs["max_tokens"] = self.max_new_tokens - return {**base_kwargs, **self.additional_kwargs} - - def _get_response_token_counts(self, raw_response: Any) -> dict: - """Get the token usage reported by the response.""" - if not isinstance(raw_response, dict): - return {} - - usage = raw_response.get("usage", {}) - # NOTE: other model providers that use the OpenAI client may not report usage - if usage is None: - return {} - - return { - "prompt_tokens": usage.get("prompt_tokens", 0), - "completion_tokens": usage.get("completion_tokens", 0), - "total_tokens": usage.get("total_tokens", 0), - } - - @llm_completion_callback() - def _complete( - self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any - ) -> CompletionResponse: - all_kwargs = self._get_model_kwargs(**kwargs) - message_dict = self._get_multi_modal_chat_messages( - prompt=prompt, role=MessageRole.USER, image_documents=image_documents - ) - response = self._client.chat.completions.create( - messages=message_dict, - stream=False, - **all_kwargs, - ) - - return CompletionResponse( - text=response.choices[0].message.content, - raw=response, - additional_kwargs=self._get_response_token_counts(response), - ) - - @llm_chat_callback() - def _chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: - all_kwargs = self._get_model_kwargs(**kwargs) - message_dicts = to_openai_message_dicts(messages) - response = self._client.chat.completions.create( - messages=message_dicts, - stream=False, - **all_kwargs, - ) - openai_message = response.choices[0].message - message = from_openai_message(openai_message) - - return ChatResponse( - message=message, - raw=response, - additional_kwargs=self._get_response_token_counts(response), - ) - - @llm_completion_callback() - def _stream_complete( - self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any - ) -> CompletionResponseGen: - all_kwargs = self._get_model_kwargs(**kwargs) - message_dict = self._get_multi_modal_chat_messages( - prompt=prompt, role=MessageRole.USER, image_documents=image_documents - ) - - def gen() -> CompletionResponseGen: - text = "" - - for response in 
self._client.chat.completions.create( - messages=message_dict, - stream=True, - **all_kwargs, - ): - response = cast(ChatCompletionChunk, response) - if len(response.choices) > 0: - delta = response.choices[0].delta - else: - delta = ChoiceDelta() - - if delta is None: - continue - - # update using deltas - content_delta = delta.content or "" - text += content_delta - - yield CompletionResponse( - delta=content_delta, - text=text, - raw=response, - additional_kwargs=self._get_response_token_counts(response), + elif image_document.image_url: + chat_msg.blocks.append( + ImageBlock(url=image_document.image_url, detail=image_detail) ) - - return gen() - - @llm_chat_callback() - def _stream_chat( - self, messages: Sequence[ChatMessage], **kwargs: Any - ) -> ChatResponseGen: - message_dicts = to_openai_message_dicts(messages) - - def gen() -> ChatResponseGen: - content = "" - tool_calls: List[ChoiceDeltaToolCall] = [] - - is_function = False - for response in self._client.chat.completions.create( - messages=message_dicts, - stream=True, - **self._get_model_kwargs(**kwargs), - ): - response = cast(ChatCompletionChunk, response) - if len(response.choices) > 0: - delta = response.choices[0].delta - else: - delta = ChoiceDelta() - - if delta is None: - continue - - # check if this chunk is the start of a function call - if delta.tool_calls: - is_function = True - - # update using deltas - role = delta.role or MessageRole.ASSISTANT - content_delta = delta.content or "" - content += content_delta - - additional_kwargs = {} - if is_function: - tool_calls = update_tool_calls(tool_calls, delta.tool_calls) - additional_kwargs["tool_calls"] = tool_calls - - yield ChatResponse( - message=ChatMessage( - role=role, - content=content, - additional_kwargs=additional_kwargs, - ), - delta=content_delta, - raw=response, - additional_kwargs=self._get_response_token_counts(response), + elif image_document.image_path: + chat_msg.blocks.append( + ImageBlock( + path=Path(image_document.image_path), + detail=image_detail, + image_mimetype=image_document.image_mimetype + or image_document.metadata.get("file_type"), + ) + ) + elif f_path := image_document.metadata.get("file_path"): + chat_msg.blocks.append( + ImageBlock( + path=Path(f_path), + detail=image_detail, + image_mimetype=image_document.metadata.get("file_type"), + ) ) - return gen() + return chat_msg def complete( self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any ) -> CompletionResponse: - return self._complete(prompt, image_documents, **kwargs) + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, + ) + chat_response = self.chat([chat_message], **kwargs) + return chat_response_to_completion_response(chat_response) def stream_complete( self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any ) -> CompletionResponseGen: - return self._stream_complete(prompt, image_documents, **kwargs) - - def chat( - self, - messages: Sequence[ChatMessage], - **kwargs: Any, - ) -> ChatResponse: - return self._chat(messages, **kwargs) - - def stream_chat( - self, - messages: Sequence[ChatMessage], - **kwargs: Any, - ) -> ChatResponseGen: - return self._stream_chat(messages, **kwargs) - - # ===== Async Endpoints ===== - - @llm_completion_callback() - async def _acomplete( - self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any - ) -> CompletionResponse: - all_kwargs = self._get_model_kwargs(**kwargs) - message_dict = self._get_multi_modal_chat_messages( - 
prompt=prompt, role=MessageRole.USER, image_documents=image_documents - ) - response = await self._aclient.chat.completions.create( - messages=message_dict, - stream=False, - **all_kwargs, + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, ) + chat_response = self.stream_chat([chat_message], **kwargs) + return stream_chat_response_to_completion_response(chat_response) - return CompletionResponse( - text=response.choices[0].message.content, - raw=response, - additional_kwargs=self._get_response_token_counts(response), - ) + # ===== Async Endpoints ===== async def acomplete( self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any ) -> CompletionResponse: - return await self._acomplete(prompt, image_documents, **kwargs) - - @llm_completion_callback() - async def _astream_complete( - self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any - ) -> CompletionResponseAsyncGen: - all_kwargs = self._get_model_kwargs(**kwargs) - message_dict = self._get_multi_modal_chat_messages( - prompt=prompt, role=MessageRole.USER, image_documents=image_documents + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, ) - - async def gen() -> CompletionResponseAsyncGen: - text = "" - - async for response in await self._aclient.chat.completions.create( - messages=message_dict, - stream=True, - **all_kwargs, - ): - response = cast(ChatCompletionChunk, response) - if len(response.choices) > 0: - delta = response.choices[0].delta - else: - delta = ChoiceDelta() - - if delta is None: - continue - - # update using deltas - content_delta = delta.content or "" - text += content_delta - - yield CompletionResponse( - delta=content_delta, - text=text, - raw=response, - additional_kwargs=self._get_response_token_counts(response), - ) - - return gen() - - @llm_chat_callback() - async def _achat( - self, messages: Sequence[ChatMessage], **kwargs: Any - ) -> ChatResponse: - all_kwargs = self._get_model_kwargs(**kwargs) - message_dicts = to_openai_message_dicts(messages) - response = await self._aclient.chat.completions.create( - messages=message_dicts, - stream=False, - **all_kwargs, - ) - openai_message = response.choices[0].message - message = from_openai_message(openai_message) - - return ChatResponse( - message=message, - raw=response, - additional_kwargs=self._get_response_token_counts(response), - ) - - @llm_chat_callback() - async def _astream_chat( - self, messages: Sequence[ChatMessage], **kwargs: Any - ) -> ChatResponseAsyncGen: - message_dicts = to_openai_message_dicts(messages) - - async def gen() -> ChatResponseAsyncGen: - content = "" - tool_calls: List[ChoiceDeltaToolCall] = [] - - is_function = False - async for response in await self._aclient.chat.completions.create( - messages=message_dicts, - stream=True, - **self._get_model_kwargs(**kwargs), - ): - response = cast(ChatCompletionChunk, response) - if len(response.choices) > 0: - delta = response.choices[0].delta - else: - delta = ChoiceDelta() - - if delta is None: - continue - - # check if this chunk is the start of a function call - if delta.tool_calls: - is_function = True - - # update using deltas - role = delta.role or MessageRole.ASSISTANT - content_delta = delta.content or "" - content += content_delta - - additional_kwargs = {} - if is_function: - tool_calls = update_tool_calls(tool_calls, delta.tool_calls) - additional_kwargs["tool_calls"] = tool_calls - - yield 
ChatResponse( - message=ChatMessage( - role=role, - content=content, - additional_kwargs=additional_kwargs, - ), - delta=content_delta, - raw=response, - additional_kwargs=self._get_response_token_counts(response), - ) - - return gen() + chat_response = await self.achat([chat_message], **kwargs) + return chat_response_to_completion_response(chat_response) async def astream_complete( self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any ) -> CompletionResponseAsyncGen: - return await self._astream_complete(prompt, image_documents, **kwargs) - - async def achat( - self, - messages: Sequence[ChatMessage], - **kwargs: Any, - ) -> ChatResponse: - return await self._achat(messages, **kwargs) - - async def astream_chat( - self, - messages: Sequence[ChatMessage], - **kwargs: Any, - ) -> ChatResponseAsyncGen: - return await self._astream_chat(messages, **kwargs) + chat_message = self._get_multi_modal_chat_message( + prompt=prompt, + role=MessageRole.USER, + image_documents=image_documents, + ) + chat_response = await self.astream_chat([chat_message], **kwargs) + return astream_chat_response_to_completion_response(chat_response) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/utils.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/utils.py deleted file mode 100644 index b77685687a8cde79fe6cc4398b63df057e1210d4..0000000000000000000000000000000000000000 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/llama_index/multi_modal_llms/openai/utils.py +++ /dev/null @@ -1,82 +0,0 @@ -import logging -from pathlib import Path -from typing import Optional, Sequence - -from llama_index.core.base.llms.types import ImageBlock -from llama_index.core.multi_modal_llms.base import ChatMessage, ImageNode - -DEFAULT_OPENAI_API_TYPE = "open_ai" -DEFAULT_OPENAI_API_BASE = "https://api.openai.com/v1" - - -GPT4V_MODELS = { - "gpt-4-vision-preview": 128000, - "gpt-4-turbo-2024-04-09": 128000, - "gpt-4-turbo": 128000, - "gpt-4o": 128000, - "gpt-4o-2024-05-13": 128000, - "gpt-4o-2024-08-06": 128000, - "gpt-4o-2024-11-20": 128000, - "gpt-4o-mini": 128000, - "gpt-4o-mini-2024-07-18": 128000, - "o1": 200000, - "o1-2024-12-17": 200000, - "o3-mini": 200000, - "o3-mini-2025-01-31": 200000, -} - - -MISSING_API_KEY_ERROR_MESSAGE = """No API key found for OpenAI. -Please set either the OPENAI_API_KEY environment variable or \ -openai.api_key prior to initialization. 
-API keys can be found or created at \ -https://platform.openai.com/account/api-keys -""" - -logger = logging.getLogger(__name__) - - -def generate_openai_multi_modal_chat_message( - prompt: str, - role: str, - image_documents: Optional[Sequence[ImageNode]] = None, - image_detail: Optional[str] = "low", -) -> ChatMessage: - """Create a ChatMessage to be used in a multimodal query.""" - chat_msg = ChatMessage(role=role, content=prompt) - if image_documents is None: - # if image_documents is empty, return text only chat message - return chat_msg - - for image_document in image_documents: - # Create the appropriate ContentBlock depending on the document content - if image_document.image: - chat_msg.blocks.append( - ImageBlock( - image=bytes(image_document.image, encoding="utf-8"), - detail=image_detail, - ) - ) - elif image_document.image_url: - chat_msg.blocks.append( - ImageBlock(url=image_document.image_url, detail=image_detail) - ) - elif image_document.image_path: - chat_msg.blocks.append( - ImageBlock( - path=Path(image_document.image_path), - detail=image_detail, - image_mimetype=image_document.image_mimetype - or image_document.metadata.get("file_type"), - ) - ) - elif f_path := image_document.metadata.get("file_path"): - chat_msg.blocks.append( - ImageBlock( - path=Path(f_path), - detail=image_detail, - image_mimetype=image_document.metadata.get("file_type"), - ) - ) - - return chat_msg diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml index cd6e99421c3a77dd2cc65226f66c51b143005299..45a8c2c9d70bdc6f0bce569695e7acb6a76cd3af 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/pyproject.toml @@ -27,11 +27,11 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-multi-modal-llms-openai" readme = "README.md" -version = "0.4.3" +version = "0.5.0" [tool.poetry.dependencies] python = ">=3.9,<4.0" -llama-index-llms-openai = "^0.3.0" +llama-index-llms-openai = "^0.3.22" llama-index-core = "^0.12.3" [tool.poetry.group.dev.dependencies] diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py index 2327bdd18c4c750f390ff33b9a4e0de1b3ed87b1..2e6b60e721f7e9a208456b322b175ce1d5033824 100644 --- a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-openai/tests/test_multi-modal-llms_openai.py @@ -1,7 +1,7 @@ -from llama_index.core.multi_modal_llms.base import MultiModalLLM +from llama_index.llms.openai import OpenAI from llama_index.multi_modal_llms.openai import OpenAIMultiModal def test_embedding_class(): names_of_base_classes = [b.__name__ for b in OpenAIMultiModal.__mro__] - assert MultiModalLLM.__name__ in names_of_base_classes + assert OpenAI.__name__ in names_of_base_classes
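
Illustrative notes on the change (not part of the patch). First, a minimal sketch of the max_new_tokens shim added to OpenAI.__init__ above, assuming the deprecated keyword is simply popped from **kwargs and forwarded as max_tokens; the model name and key below are placeholders:

from llama_index.llms.openai import OpenAI

# The deprecated kwarg is intercepted by the new shim and assigned to
# max_tokens, so call sites written against the old multi-modal classes
# keep working after the refactor.
llm = OpenAI(model="gpt-4o", api_key="fake", max_new_tokens=400)
assert llm.max_tokens == 400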
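
Second, a usage sketch of the refactored OpenAIMultiModal: complete() now wraps the prompt and image documents into a single user ChatMessage of ImageBlocks via _get_multi_modal_chat_message() and delegates to the inherited chat() method, converting the chat response back to a CompletionResponse. The model name and image URL are placeholders, and a valid OPENAI_API_KEY is required for the request to succeed.

from llama_index.core.schema import ImageNode
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

llm = OpenAIMultiModal(model="gpt-4o")  # reads OPENAI_API_KEY from the environment

# An ImageNode may carry raw image data, a local path, or a URL; the
# conversion picks the matching ImageBlock field (image, path, or url).
image = ImageNode(image_url="https://example.com/cat.png")

response = llm.complete(prompt="Describe the image.", image_documents=[image])
print(response.text)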
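
Finally, because OpenAIMultiModal now inherits from the plain OpenAI LLM (see the updated base-class tests above), a caller can also build the multi-modal message directly with content blocks and send it through chat(); a sketch under the same assumptions as the previous example:

from llama_index.core.base.llms.types import (
    ChatMessage,
    ImageBlock,
    MessageRole,
    TextBlock,
)

# Hand-built equivalent of what _get_multi_modal_chat_message() produces.
message = ChatMessage(
    role=MessageRole.USER,
    blocks=[
        TextBlock(text="Describe the image."),
        ImageBlock(url="https://example.com/cat.png", detail="low"),
    ],
)
# response = llm.chat([message])  # same llm instance as above; needs a valid key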