From aa6726706476e0f957a8d57a5ca89e519e93bad7 Mon Sep 17 00:00:00 2001
From: R Ostrowski <rostrovsky@users.noreply.github.com>
Date: Mon, 11 Dec 2023 01:34:32 +0100
Subject: [PATCH] Remediate RCE vulnerability CVE-2023-39662 - part 2 (#9423)

---
 llama_index/exec_utils.py         | 19 +++++++++++++++++++
 tests/query_engine/test_pandas.py | 23 +++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/llama_index/exec_utils.py b/llama_index/exec_utils.py
index 3cdef1cd52..3e8575a3af 100644
--- a/llama_index/exec_utils.py
+++ b/llama_index/exec_utils.py
@@ -1,4 +1,5 @@
 import copy
+import re
 from types import CodeType, ModuleType
 from typing import Any, Dict, Mapping, Sequence, Union
 
@@ -90,6 +91,22 @@ def _get_restricted_globals(__globals: Union[dict, None]) -> Any:
     return restricted_globals
 
 
+def _verify_source_safety(__source: Union[str, bytes, CodeType]) -> None:
+    """
+    Raise RuntimeError for CodeType input or source naming a private/dunder identifier.
+    """
+    if isinstance(__source, CodeType):
+        raise RuntimeError("Direct execution of CodeType is forbidden!")
+    if isinstance(__source, bytes):
+        __source = __source.decode()
+    # `(?<!\w)` pins the match to the start of a name, so snake_case identifiers
+    # such as `sort_values` pass; this is a textual heuristic, not a sandbox.
+    if re.search(r"(?<!\w)_\w", __source):
+        raise RuntimeError(
+            "Execution of code containing references to private or dunder methods is forbidden!"
+        )
+
+
 def safe_eval(
     __source: Union[str, bytes, CodeType],
     __globals: Union[Dict[str, Any], None] = None,
@@ -98,6 +115,7 @@ def safe_eval(
     """
     eval within safe global context.
     """
+    _verify_source_safety(__source)
     return eval(__source, _get_restricted_globals(__globals), __locals)
 
 
@@ -109,4 +127,5 @@ def safe_exec(
     """
     eval within safe global context.
     """
+    _verify_source_safety(__source)
     return exec(__source, _get_restricted_globals(__globals), __locals)
diff --git a/tests/query_engine/test_pandas.py b/tests/query_engine/test_pandas.py
index 060d837093..0c7acedd55 100644
--- a/tests/query_engine/test_pandas.py
+++ b/tests/query_engine/test_pandas.py
@@ -84,6 +84,29 @@ def test_default_output_processor_rce(tmp_path: Path) -> None:
     assert not tmp_file.is_file(), "file has been created via RCE!"
 
 
+@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires Python 3.9 or higher")
+def test_default_output_processor_rce2() -> None:
+    """
+    Check that a dunder-chain payload is rejected by the output processor.
+
+    Payload taken from
+    https://github.com/run-llama/llama_index/issues/7054#issuecomment-1829141330 .
+    """
+    expected_error = "Execution of code containing references to private or dunder methods is forbidden!"
+    # Walks from a tuple up to `object`, then into a subclass's globals to reach `system`.
+    injected_code = "().__class__.__mro__[-1].__subclasses__()[137].__init__.__globals__['system']('ls')"
+    columns = {
+        "city": ["Toronto", "Tokyo", "Berlin"],
+        "population": [2930000, 13960000, 3645000],
+    }
+    df = pd.DataFrame(columns)
+
+    output = default_output_processor(injected_code, df)
+
+    # The rejection is expected to surface as text in the processor's return value.
+    assert expected_error in output, "Injected code executed successfully!"
+
+
 @pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires Python 3.9 or higher")
 def test_default_output_processor_e2e(tmp_path: Path) -> None:
     """
-- 
GitLab