diff --git a/llama-index-core/llama_index/core/readers/json.py b/llama-index-core/llama_index/core/readers/json.py index 6211b64d50edd9028451009eef985d4f38081915..4ae6a9c21c196fcb02b1370732a9076a5175ead3 100644 --- a/llama-index-core/llama_index/core/readers/json.py +++ b/llama-index-core/llama_index/core/readers/json.py @@ -68,6 +68,10 @@ class JSONReader(BaseReader): is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format. Defaults to False. + clean_json (Optional[bool]): If True, lines containing only JSON structure are removed. + This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure. + If levels_back is set the json is not cleaned and this option is ignored. + Defaults to True. """ def __init__( @@ -76,6 +80,7 @@ class JSONReader(BaseReader): collapse_length: Optional[int] = None, ensure_ascii: bool = False, is_jsonl: Optional[bool] = False, + clean_json: Optional[bool] = True, ) -> None: """Initialize with arguments.""" super().__init__() @@ -83,6 +88,7 @@ class JSONReader(BaseReader): self.collapse_length = collapse_length self.ensure_ascii = ensure_ascii self.is_jsonl = is_jsonl + self.clean_json = clean_json def load_data( self, input_file: str, extra_info: Optional[Dict] = {} @@ -98,9 +104,9 @@ class JSONReader(BaseReader): documents = [] for data in load_data: - # print(data) - if self.levels_back is None: - # If levels_back isn't set, we just format and make each + if self.levels_back is None and self.clean_json is True: + # If levels_back isn't set and clean json is set, + # remove lines containing only formatting, we just format and make each # line an embedding json_output = json.dumps( data, indent=0, ensure_ascii=self.ensure_ascii @@ -112,6 +118,12 @@ class JSONReader(BaseReader): documents.append( Document(text="\n".join(useful_lines), metadata=extra_info) ) + + elif self.levels_back is None and self.clean_json is False: + # If levels_back isn't set 
and clean json is False, create documents without cleaning + json_output = json.dumps(data, ensure_ascii=self.ensure_ascii) + documents.append(Document(text=json_output, metadata=extra_info)) + elif self.levels_back is not None: # If levels_back is set, we make the embeddings contain the labels # from further up the JSON tree diff --git a/llama-index-core/tests/readers/test_json.py b/llama-index-core/tests/readers/test_json.py index 8a7717325a666517cc61c23d3a100a2b277bde55..8d2772b1631f1f42a8ed04aed183003b7817957f 100644 --- a/llama-index-core/tests/readers/test_json.py +++ b/llama-index-core/tests/readers/test_json.py @@ -70,3 +70,26 @@ def test_jsonl() -> None: assert data[1].get_content().index("test2") is not None assert isinstance(data[2].get_content(), str) assert data[2].get_content().index("test3") is not None + + +def test_clean_json() -> None: + """Test JSON reader using the clean_json option.""" + with TemporaryDirectory() as tmp_dir: + file_name = f"{tmp_dir}/test5.json" + with open(file_name, "w") as f: + f.write('{ "a": { "b": "c" } }') + + # If levels back is set clean_json is ignored + reader1 = JSONReader(levels_back=0, clean_json=False) + data1 = reader1.load_data(file_name) + assert data1[0].get_content() == "a b c" + + # If clean_json is false the full json should be contained in a document + reader1 = JSONReader(clean_json=False) + data1 = reader1.load_data(file_name) + assert data1[0].get_content() == '{"a": {"b": "c"}}' + + # If clean_json is True, lines containing only JSON structure are removed + reader1 = JSONReader(clean_json=True) + data1 = reader1.load_data(file_name) + assert data1[0].get_content() == '"a": {\n"b": "c"' diff --git a/llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py b/llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py index 6211b64d50edd9028451009eef985d4f38081915..4ae6a9c21c196fcb02b1370732a9076a5175ead3 100644 ---
a/llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py +++ b/llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py @@ -68,6 +68,10 @@ class JSONReader(BaseReader): is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format. Defaults to False. + clean_json (Optional[bool]): If True, lines containing only JSON structure are removed. + This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure. + If levels_back is set the json is not cleaned and this option is ignored. + Defaults to True. """ def __init__( @@ -76,6 +80,7 @@ class JSONReader(BaseReader): collapse_length: Optional[int] = None, ensure_ascii: bool = False, is_jsonl: Optional[bool] = False, + clean_json: Optional[bool] = True, ) -> None: """Initialize with arguments.""" super().__init__() @@ -83,6 +88,7 @@ class JSONReader(BaseReader): self.collapse_length = collapse_length self.ensure_ascii = ensure_ascii self.is_jsonl = is_jsonl + self.clean_json = clean_json def load_data( self, input_file: str, extra_info: Optional[Dict] = {} @@ -98,9 +104,9 @@ class JSONReader(BaseReader): documents = [] for data in load_data: - # print(data) - if self.levels_back is None: - # If levels_back isn't set, we just format and make each + if self.levels_back is None and self.clean_json is True: + # If levels_back isn't set and clean json is set, + # remove lines containing only formatting, we just format and make each # line an embedding json_output = json.dumps( data, indent=0, ensure_ascii=self.ensure_ascii @@ -112,6 +118,12 @@ class JSONReader(BaseReader): documents.append( Document(text="\n".join(useful_lines), metadata=extra_info) ) + + elif self.levels_back is None and self.clean_json is False: + # If levels_back isn't set and clean json is False, create documents without cleaning + json_output = json.dumps(data, ensure_ascii=self.ensure_ascii) + 
documents.append(Document(text=json_output, metadata=extra_info)) + elif self.levels_back is not None: # If levels_back is set, we make the embeddings contain the labels # from further up the JSON tree diff --git a/llama-index-integrations/readers/llama-index-readers-json/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-json/pyproject.toml index 9723c9a47f47afd6c269fa6c78a29b8a16a3c4b2..db59a177437442637c1fc40e0bdad7e1eaeaaf90 100644 --- a/llama-index-integrations/readers/llama-index-readers-json/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-json/pyproject.toml @@ -28,7 +28,7 @@ license = "MIT" maintainers = ["yisding"] name = "llama-index-readers-json" readme = "README.md" -version = "0.1.4" +version = "0.1.5" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"