JSONReader Option to maintain valid json structure (#11871)

* Option to maintain valid JSON
* Bump JSON reader's version
* Replace newline

JSONReader Option to maintain valid json structure (#11871)
ca9634e6 · Scott Rodgers · GitHub · 94cda4d2 · ca9634e6 · ca9634e6
Unverified Commit ca9634e6 authored 1 year ago by Scott Rodgers Committed by GitHub 1 year ago
--- a/llama-index-core/llama_index/core/readers/json.py
+++ b/llama-index-core/llama_index/core/readers/json.py
@@ -68,6 +68,10 @@ class JSONReader(BaseReader):
        is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format.
        Defaults to False.
+        clean_json (Optional[bool]): If True, lines containing only JSON structure are removed.
+        This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure.
+        If levels_back is set the json is not cleaned and this option is ignored.
+        Defaults to True.
    """
    def __init__(
@@ -76,6 +80,7 @@ class JSONReader(BaseReader):
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
+        clean_json: Optional[bool] = True,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
@@ -83,6 +88,7 @@ class JSONReader(BaseReader):
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl
+        self.clean_json = clean_json
    def load_data(
        self, input_file: str, extra_info: Optional[Dict] = {}
@@ -98,9 +104,9 @@ class JSONReader(BaseReader):
            documents = []
            for data in load_data:
-                # print(data)
+                if self.levels_back is None and self.clean_json is True:
-                if self.levels_back is None:
+                    # If levels_back isn't set and clean_json is True,
-                    # If levels_back isn't set, we just format and make each
+                    # remove lines containing only formatting; we just format and make each
                    # line an embedding
                    json_output = json.dumps(
                        data, indent=0, ensure_ascii=self.ensure_ascii
@@ -112,6 +118,12 @@ class JSONReader(BaseReader):
                    documents.append(
                        Document(text="\n".join(useful_lines), metadata=extra_info)
                    )
+                elif self.levels_back is None and self.clean_json is False:
+                    # If levels_back isn't set and clean_json is False, create documents without cleaning
+                    json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
+                    documents.append(Document(text=json_output, metadata=extra_info))
                elif self.levels_back is not None:
                    # If levels_back is set, we make the embeddings contain the labels
                    # from further up the JSON tree

--- a/llama-index-core/tests/readers/test_json.py
+++ b/llama-index-core/tests/readers/test_json.py
@@ -70,3 +70,26 @@ def test_jsonl() -> None:
        assert data[1].get_content().index("test2") is not None
        assert isinstance(data[2].get_content(), str)
        assert data[2].get_content().index("test3") is not None
+def test_clean_json() -> None:
+    """Test JSON reader using the clean_json option."""
+    with TemporaryDirectory() as tmp_dir:
+        file_name = f"{tmp_dir}/test5.json"
+        with open(file_name, "w") as f:
+            f.write('{ "a": { "b": "c" } }')
+        # If levels_back is set, clean_json is ignored
+        reader1 = JSONReader(levels_back=0, clean_json=False)
+        data1 = reader1.load_data(file_name)
+        assert data1[0].get_content() == "a b c"
+        # If clean_json is false the full json should be contained in a document
+        reader1 = JSONReader(clean_json=False)
+        data1 = reader1.load_data(file_name)
+        assert data1[0].get_content() == '{"a": {"b": "c"}}'
+        # If clean_json is True, lines containing only JSON structure are removed
+        reader1 = JSONReader(clean_json=True)
+        data1 = reader1.load_data(file_name)
+        assert data1[0].get_content() == '"a": {\n"b": "c"'
--- a/llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py
@@ -68,6 +68,10 @@ class JSONReader(BaseReader):
        is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format.
        Defaults to False.
+        clean_json (Optional[bool]): If True, lines containing only JSON structure are removed.
+        This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure.
+        If levels_back is set the json is not cleaned and this option is ignored.
+        Defaults to True.
    """
    def __init__(
@@ -76,6 +80,7 @@ class JSONReader(BaseReader):
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
+        clean_json: Optional[bool] = True,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
@@ -83,6 +88,7 @@ class JSONReader(BaseReader):
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl
+        self.clean_json = clean_json
    def load_data(
        self, input_file: str, extra_info: Optional[Dict] = {}
@@ -98,9 +104,9 @@ class JSONReader(BaseReader):
            documents = []
            for data in load_data:
-                # print(data)
+                if self.levels_back is None and self.clean_json is True:
-                if self.levels_back is None:
+                    # If levels_back isn't set and clean_json is True,
-                    # If levels_back isn't set, we just format and make each
+                    # remove lines containing only formatting; we just format and make each
                    # line an embedding
                    json_output = json.dumps(
                        data, indent=0, ensure_ascii=self.ensure_ascii
@@ -112,6 +118,12 @@ class JSONReader(BaseReader):
                    documents.append(
                        Document(text="\n".join(useful_lines), metadata=extra_info)
                    )
+                elif self.levels_back is None and self.clean_json is False:
+                    # If levels_back isn't set and clean_json is False, create documents without cleaning
+                    json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
+                    documents.append(Document(text=json_output, metadata=extra_info))
                elif self.levels_back is not None:
                    # If levels_back is set, we make the embeddings contain the labels
                    # from further up the JSON tree

--- a/llama-index-integrations/readers/llama-index-readers-json/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-json/pyproject.toml
@@ -28,7 +28,7 @@ license = "MIT"
 maintainers = ["yisding"]
 name = "llama-index-readers-json"
 readme = "README.md"
-version = "0.1.4"
+version = "0.1.5"
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"