From 2e656dc3e1ba2548ac36ff006a90c6bfa161debe Mon Sep 17 00:00:00 2001
From: Massimiliano Pippi <mpippi@gmail.com>
Date: Thu, 27 Feb 2025 04:43:49 +0100
Subject: [PATCH] fix: make base64 detection more robust across the board
 (#17930)

---
 .../llama_index/core/base/llms/types.py             | 10 ++++++----
 llama-index-core/llama_index/core/schema.py         | 13 +++++--------
 llama-index-core/llama_index/core/utils.py          | 13 ++++++++++++-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/llama-index-core/llama_index/core/base/llms/types.py b/llama-index-core/llama_index/core/base/llms/types.py
index 1fdcd1d3d8..6fa3863210 100644
--- a/llama-index-core/llama_index/core/base/llms/types.py
+++ b/llama-index-core/llama_index/core/base/llms/types.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import base64
+from binascii import Error as BinasciiError
 from enum import Enum
 from io import BytesIO
 from typing import (
@@ -78,11 +79,12 @@ class ImageBlock(BaseModel):
             return self
 
         try:
-            # Check if image is already base64 encoded
-            decoded_img = base64.b64decode(self.image)
-        except Exception:
+            # Check if self.image is already base64 encoded.
+            # b64decode() can succeed on random binary data, so we
+            # pass validate=True to make sure it's not a false positive
+            decoded_img = base64.b64decode(self.image, validate=True)
+        except BinasciiError:
             decoded_img = self.image
-            # Not base64 - encode it
             self.image = base64.b64encode(self.image)
 
         self._guess_mimetype(decoded_img)
diff --git a/llama-index-core/llama_index/core/schema.py b/llama-index-core/llama_index/core/schema.py
index 1aaf8bc490..e4f815ab13 100644
--- a/llama-index-core/llama_index/core/schema.py
+++ b/llama-index-core/llama_index/core/schema.py
@@ -9,6 +9,7 @@ import pickle
 import textwrap
 import uuid
 from abc import abstractmethod
+from binascii import Error as BinasciiError
 from dataclasses import dataclass
 from enum import Enum, auto
 from hashlib import sha256
@@ -531,14 +532,10 @@ class MediaResource(BaseModel):
 
         try:
             # Check if data is already base64 encoded.
-            # b64decode() can succeed on random binary data, we make
-            # a full roundtrip to make sure it's not a false positive
-            decoded = base64.b64decode(v)
-            encoded = base64.b64encode(decoded)
-            if encoded != v:
-                # Roundtrip failed, this is a false positive, return encoded
-                return base64.b64encode(v)
-        except Exception:
+            # b64decode() can succeed on random binary data, so we
+            # pass validate=True to make sure it's not a false positive
+            decoded = base64.b64decode(v, validate=True)
+        except BinasciiError:
             # b64decode failed, return encoded
             return base64.b64encode(v)
 
diff --git a/llama-index-core/llama_index/core/utils.py b/llama-index-core/llama_index/core/utils.py
index 99331aff3e..1e578f7ade 100644
--- a/llama-index-core/llama_index/core/utils.py
+++ b/llama-index-core/llama_index/core/utils.py
@@ -4,11 +4,11 @@ import asyncio
 import base64
 import os
 import random
-import requests
 import sys
 import time
 import traceback
 import uuid
+from binascii import Error as BinasciiError
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial, wraps
@@ -31,6 +31,8 @@ from typing import (
     runtime_checkable,
 )
 
+import requests
+
 
 class GlobalsHelper:
     """Helper to retrieve globals.
@@ -602,6 +604,15 @@ def resolve_binary(
         except Exception:
             decoded_bytes = raw_bytes
 
+        try:
+            # Check if raw_bytes is already base64 encoded.
+            # b64decode() can succeed on random binary data, so we
+            # pass validate=True to make sure it's not a false positive
+            decoded_bytes = base64.b64decode(raw_bytes, validate=True)
+        except BinasciiError:
+            # b64decode failed, leave as is
+            decoded_bytes = raw_bytes
+
         if as_base64:
             return BytesIO(base64.b64encode(decoded_bytes))
         return BytesIO(decoded_bytes)
-- 
GitLab