From 2e656dc3e1ba2548ac36ff006a90c6bfa161debe Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi <mpippi@gmail.com> Date: Thu, 27 Feb 2025 04:43:49 +0100 Subject: [PATCH] fix: make base64 detection more robust across the board (#17930) --- .../llama_index/core/base/llms/types.py | 10 ++++++---- llama-index-core/llama_index/core/schema.py | 13 +++++-------- llama-index-core/llama_index/core/utils.py | 13 ++++++++++++- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/llama-index-core/llama_index/core/base/llms/types.py b/llama-index-core/llama_index/core/base/llms/types.py index 1fdcd1d3d8..6fa3863210 100644 --- a/llama-index-core/llama_index/core/base/llms/types.py +++ b/llama-index-core/llama_index/core/base/llms/types.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +from binascii import Error as BinasciiError from enum import Enum from io import BytesIO from typing import ( @@ -78,11 +79,12 @@ class ImageBlock(BaseModel): return self try: - # Check if image is already base64 encoded - decoded_img = base64.b64decode(self.image) - except Exception: + # Check if self.image is already base64 encoded. 
+ # b64decode() can succeed on random binary data, so we + # pass validate=True to make sure it's not a false positive + decoded_img = base64.b64decode(self.image, validate=True) + except BinasciiError: decoded_img = self.image - # Not base64 - encode it self.image = base64.b64encode(self.image) self._guess_mimetype(decoded_img) diff --git a/llama-index-core/llama_index/core/schema.py b/llama-index-core/llama_index/core/schema.py index 1aaf8bc490..e4f815ab13 100644 --- a/llama-index-core/llama_index/core/schema.py +++ b/llama-index-core/llama_index/core/schema.py @@ -9,6 +9,7 @@ import pickle import textwrap import uuid from abc import abstractmethod +from binascii import Error as BinasciiError from dataclasses import dataclass from enum import Enum, auto from hashlib import sha256 @@ -531,14 +532,10 @@ class MediaResource(BaseModel): try: # Check if data is already base64 encoded. - # b64decode() can succeed on random binary data, we make - # a full roundtrip to make sure it's not a false positive - decoded = base64.b64decode(v) - encoded = base64.b64encode(decoded) - if encoded != v: - # Roundtrip failed, this is a false positive, return encoded - return base64.b64encode(v) - except Exception: + # b64decode() can succeed on random binary data, so we + # pass validate=True to make sure it's not a false positive + decoded = base64.b64decode(v, validate=True) + except BinasciiError: # b64decode failed, return encoded return base64.b64encode(v) diff --git a/llama-index-core/llama_index/core/utils.py b/llama-index-core/llama_index/core/utils.py index 99331aff3e..1e578f7ade 100644 --- a/llama-index-core/llama_index/core/utils.py +++ b/llama-index-core/llama_index/core/utils.py @@ -4,11 +4,11 @@ import asyncio import base64 import os import random -import requests import sys import time import traceback import uuid +from binascii import Error as BinasciiError from contextlib import contextmanager from dataclasses import dataclass from functools import partial, wraps @@ 
-31,6 +31,8 @@ from typing import ( runtime_checkable, ) +import requests + class GlobalsHelper: """Helper to retrieve globals. @@ -602,6 +604,15 @@ def resolve_binary( except Exception: decoded_bytes = raw_bytes + try: + # Check if raw_bytes is already base64 encoded. + # b64decode() can succeed on random binary data, so we + # pass validate=True to make sure it's not a false positive + decoded_bytes = base64.b64decode(raw_bytes, validate=True) + except BinasciiError: + # b64decode failed, leave as is + decoded_bytes = raw_bytes + if as_base64: return BytesIO(base64.b64encode(decoded_bytes)) return BytesIO(decoded_bytes) -- GitLab