Skip to content
Snippets Groups Projects
Unverified Commit ca9634e6 authored by Scott Rodgers's avatar Scott Rodgers Committed by GitHub
Browse files

JSONReader Option to maintain valid json structure (#11871)

* Option to maintain valid json

* bump json readers version

* replace newline
parent 94cda4d2
No related branches found
No related tags found
No related merge requests found
...@@ -68,6 +68,10 @@ class JSONReader(BaseReader): ...@@ -68,6 +68,10 @@ class JSONReader(BaseReader):
is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format. is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format.
Defaults to False. Defaults to False.
clean_json (Optional[bool]): If True, lines containing only JSON structure are removed.
This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure.
If levels_back is set, the JSON is not cleaned and this option is ignored.
Defaults to True.
""" """
def __init__( def __init__(
...@@ -76,6 +80,7 @@ class JSONReader(BaseReader): ...@@ -76,6 +80,7 @@ class JSONReader(BaseReader):
collapse_length: Optional[int] = None, collapse_length: Optional[int] = None,
ensure_ascii: bool = False, ensure_ascii: bool = False,
is_jsonl: Optional[bool] = False, is_jsonl: Optional[bool] = False,
clean_json: Optional[bool] = True,
) -> None: ) -> None:
"""Initialize with arguments.""" """Initialize with arguments."""
super().__init__() super().__init__()
...@@ -83,6 +88,7 @@ class JSONReader(BaseReader): ...@@ -83,6 +88,7 @@ class JSONReader(BaseReader):
self.collapse_length = collapse_length self.collapse_length = collapse_length
self.ensure_ascii = ensure_ascii self.ensure_ascii = ensure_ascii
self.is_jsonl = is_jsonl self.is_jsonl = is_jsonl
self.clean_json = clean_json
def load_data( def load_data(
self, input_file: str, extra_info: Optional[Dict] = {} self, input_file: str, extra_info: Optional[Dict] = {}
...@@ -98,9 +104,9 @@ class JSONReader(BaseReader): ...@@ -98,9 +104,9 @@ class JSONReader(BaseReader):
documents = [] documents = []
for data in load_data: for data in load_data:
# print(data) if self.levels_back is None and self.clean_json is True:
if self.levels_back is None: # If levels_back isn't set and clean json is set,
# If levels_back isn't set, we just format and make each # remove lines containing only formatting, we just format and make each
# line an embedding # line an embedding
json_output = json.dumps( json_output = json.dumps(
data, indent=0, ensure_ascii=self.ensure_ascii data, indent=0, ensure_ascii=self.ensure_ascii
...@@ -112,6 +118,12 @@ class JSONReader(BaseReader): ...@@ -112,6 +118,12 @@ class JSONReader(BaseReader):
documents.append( documents.append(
Document(text="\n".join(useful_lines), metadata=extra_info) Document(text="\n".join(useful_lines), metadata=extra_info)
) )
elif self.levels_back is None and self.clean_json is False:
# If levels_back isn't set and clean json is False, create documents without cleaning
json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
documents.append(Document(text=json_output, metadata=extra_info))
elif self.levels_back is not None: elif self.levels_back is not None:
# If levels_back is set, we make the embeddings contain the labels # If levels_back is set, we make the embeddings contain the labels
# from further up the JSON tree # from further up the JSON tree
......
...@@ -70,3 +70,26 @@ def test_jsonl() -> None: ...@@ -70,3 +70,26 @@ def test_jsonl() -> None:
assert data[1].get_content().index("test2") is not None assert data[1].get_content().index("test2") is not None
assert isinstance(data[2].get_content(), str) assert isinstance(data[2].get_content(), str)
assert data[2].get_content().index("test3") is not None assert data[2].get_content().index("test3") is not None
def test_clean_json() -> None:
    """Test how the JSON reader's ``clean_json`` option shapes document text.

    Writes a small nested JSON object to a temporary file and loads it with
    three reader configurations:

    * ``levels_back`` set: ``clean_json`` is ignored.
    * ``clean_json=False``: the document text is the compact, valid JSON.
    * ``clean_json=True``: lines holding only JSON structure are removed.
    """
    with TemporaryDirectory() as tmp_dir:
        file_name = f"{tmp_dir}/test5.json"
        with open(file_name, "w") as f:
            f.write('{ "a": { "b": "c" } }')

        # If levels_back is set, clean_json is ignored.
        levels_back_reader = JSONReader(levels_back=0, clean_json=False)
        levels_back_docs = levels_back_reader.load_data(file_name)
        assert levels_back_docs[0].get_content() == "a b c"

        # If clean_json is False, the full JSON is kept in the document.
        raw_reader = JSONReader(clean_json=False)
        raw_docs = raw_reader.load_data(file_name)
        assert raw_docs[0].get_content() == '{"a": {"b": "c"}}'

        # If clean_json is True, lines containing only JSON structure (here the
        # opening and closing braces) are removed, so the text is no longer
        # a valid JSON object.
        cleaned_reader = JSONReader(clean_json=True)
        cleaned_docs = cleaned_reader.load_data(file_name)
        assert cleaned_docs[0].get_content() == '"a": {\n"b": "c"'
...@@ -68,6 +68,10 @@ class JSONReader(BaseReader): ...@@ -68,6 +68,10 @@ class JSONReader(BaseReader):
is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format. is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format.
Defaults to False. Defaults to False.
clean_json (Optional[bool]): If True, lines containing only JSON structure are removed.
This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure.
If levels_back is set, the JSON is not cleaned and this option is ignored.
Defaults to True.
""" """
def __init__( def __init__(
...@@ -76,6 +80,7 @@ class JSONReader(BaseReader): ...@@ -76,6 +80,7 @@ class JSONReader(BaseReader):
collapse_length: Optional[int] = None, collapse_length: Optional[int] = None,
ensure_ascii: bool = False, ensure_ascii: bool = False,
is_jsonl: Optional[bool] = False, is_jsonl: Optional[bool] = False,
clean_json: Optional[bool] = True,
) -> None: ) -> None:
"""Initialize with arguments.""" """Initialize with arguments."""
super().__init__() super().__init__()
...@@ -83,6 +88,7 @@ class JSONReader(BaseReader): ...@@ -83,6 +88,7 @@ class JSONReader(BaseReader):
self.collapse_length = collapse_length self.collapse_length = collapse_length
self.ensure_ascii = ensure_ascii self.ensure_ascii = ensure_ascii
self.is_jsonl = is_jsonl self.is_jsonl = is_jsonl
self.clean_json = clean_json
def load_data( def load_data(
self, input_file: str, extra_info: Optional[Dict] = {} self, input_file: str, extra_info: Optional[Dict] = {}
...@@ -98,9 +104,9 @@ class JSONReader(BaseReader): ...@@ -98,9 +104,9 @@ class JSONReader(BaseReader):
documents = [] documents = []
for data in load_data: for data in load_data:
# print(data) if self.levels_back is None and self.clean_json is True:
if self.levels_back is None: # If levels_back isn't set and clean json is set,
# If levels_back isn't set, we just format and make each # remove lines containing only formatting, we just format and make each
# line an embedding # line an embedding
json_output = json.dumps( json_output = json.dumps(
data, indent=0, ensure_ascii=self.ensure_ascii data, indent=0, ensure_ascii=self.ensure_ascii
...@@ -112,6 +118,12 @@ class JSONReader(BaseReader): ...@@ -112,6 +118,12 @@ class JSONReader(BaseReader):
documents.append( documents.append(
Document(text="\n".join(useful_lines), metadata=extra_info) Document(text="\n".join(useful_lines), metadata=extra_info)
) )
elif self.levels_back is None and self.clean_json is False:
# If levels_back isn't set and clean json is False, create documents without cleaning
json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
documents.append(Document(text=json_output, metadata=extra_info))
elif self.levels_back is not None: elif self.levels_back is not None:
# If levels_back is set, we make the embeddings contain the labels # If levels_back is set, we make the embeddings contain the labels
# from further up the JSON tree # from further up the JSON tree
......
...@@ -28,7 +28,7 @@ license = "MIT" ...@@ -28,7 +28,7 @@ license = "MIT"
maintainers = ["yisding"] maintainers = ["yisding"]
name = "llama-index-readers-json" name = "llama-index-readers-json"
readme = "README.md" readme = "README.md"
version = "0.1.4" version = "0.1.5"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.8.1,<4.0" python = ">=3.8.1,<4.0"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment