diff --git a/.changeset/few-pots-beg.md b/.changeset/few-pots-beg.md new file mode 100644 index 0000000000000000000000000000000000000000..def0f2ddcbcde7abcfab007e637bb5583f0c78c5 --- /dev/null +++ b/.changeset/few-pots-beg.md @@ -0,0 +1,7 @@ +--- +"llamaindex": patch +"@llamaindex/llamaindex-test": patch +"docs": patch +--- + +feat: add a reader for JSON data diff --git a/apps/docs/docs/modules/data_loaders/json.md b/apps/docs/docs/modules/data_loaders/json.md new file mode 100644 index 0000000000000000000000000000000000000000..c12767c05d9d18f1adbeffb7e669df16c9d0d4d4 --- /dev/null +++ b/apps/docs/docs/modules/data_loaders/json.md @@ -0,0 +1,144 @@ +# JSONReader + +A simple JSON data loader with various options. +Either parses the entire string, cleaning it and treating each line as an embedding, or performs a recursive depth-first traversal yielding JSON paths. + +## Usage + +```ts +import { JSONReader } from "llamaindex"; + +const file = "../../PATH/TO/FILE"; +const content = new TextEncoder().encode("JSON_CONTENT"); + +const reader = new JSONReader({ levelsBack: 0, collapseLength: 100 }); +const docsFromFile = reader.loadData(file); +const docsFromContent = reader.loadDataAsContent(content); +``` + +### Options + +Basic: + +- `ensureAscii?`: Whether to ensure only ASCII characters are present in the output by converting non-ASCII characters to their unicode escape sequence. Default is `false`. + +- `isJsonLines?`: Whether the JSON is in JSON Lines format. If true, will split into lines, remove empty ones and parse each line as JSON. Default is `false` + +- `cleanJson?`: Whether to clean the JSON by filtering out structural characters (`{}, [], and ,`). If set to false, it will just parse the JSON, not removing structural characters. Default is `true`. + +Depth-First-Traversal: + +- `levelsBack?`: Specifies how many levels up the JSON structure to include in the output. `cleanJson` will be ignored. If set to 0, all levels are included. 
If undefined, parses the entire JSON, treats each line as an embedding and creates a document per top-level array. Default is `undefined` + +- `collapseLength?`: The maximum length of JSON string representation to be collapsed into a single line. Only applicable when `levelsBack` is set. Default is `undefined` + +#### Examples + +<!-- prettier-ignore-start --> +Input: + +```json +{"a": {"1": {"key1": "value1"}, "2": {"key2": "value2"}}, "b": {"3": {"k3": "v3"}, "4": {"k4": "v4"}}} +``` + +Default options: + +`LevelsBack` = `undefined` & `cleanJson` = `true` + +Output: + +```json +"a": { +"1": { +"key1": "value1" +"2": { +"key2": "value2" +"b": { +"3": { +"k3": "v3" +"4": { +"k4": "v4" +``` + +Depth-First Traversal all levels: + +`levelsBack` = `0` + +Output: + +```json +a 1 key1 value1 +a 2 key2 value2 +b 3 k3 v3 +b 4 k4 v4 +``` + +Depth-First Traversal and Collapse: + +`levelsBack` = `0` & `collapseLength` = `35` + +Output: + +```json +a 1 {"key1":"value1"} +a 2 {"key2":"value2"} +b {"3":{"k3":"v3"},"4":{"k4":"v4"}} +``` + +Depth-First Traversal limited levels: + +`levelsBack` = `2` + +Output: + +```json +1 key1 value1 +2 key2 value2 +3 k3 v3 +4 k4 v4 +``` + +Uncleaned JSON: + +`levelsBack` = `undefined` & `cleanJson` = `false` + +Output: + +```json +{"a":{"1":{"key1":"value1"},"2":{"key2":"value2"}},"b":{"3":{"k3":"v3"},"4":{"k4":"v4"}}} +``` + +ASCII-Conversion: + +Input: + +```json +{ "message": "こんにちは世界" } +``` + +Output: + +```json +"message": "\u3053\u3093\u306b\u3061\u306f\u4e16\u754c" +``` + +JSON Lines Format: + +Input: + +```json +{"tweet": "Hello world"}\n{"tweet": "こんにちは世界"} +``` + +Output: + +```json +"tweet": "Hello world" + +"tweet": "こんにちは世界" +``` +<!-- prettier-ignore-end --> + +## API Reference + +- [JSONReader](../../api/classes/JSONReader.md) diff --git a/examples/readers/src/json.ts b/examples/readers/src/json.ts new file mode 100644 index 
0000000000000000000000000000000000000000..02ef507dbc1c2e380264459895054d13a905ac76 --- /dev/null +++ b/examples/readers/src/json.ts @@ -0,0 +1,49 @@ +import { JSONReader } from "llamaindex"; + +async function main() { + // Data + const file = "../data/tinytweets.json"; + const nonAsciiContent = '{"message": "こんにちは世界"}'; + const jsonlContent = '{"tweet": "Hello world"}\n{"tweet": "こんにちは世界"}'; + + // Convert strings to Uint8Array for loadDataAsContent + const nonAsciiBuffer = new TextEncoder().encode(nonAsciiContent); + const jsonlBuffer = new TextEncoder().encode(jsonlContent); + + // Default settings + const reader1 = new JSONReader(); + const docs1 = await reader1.loadData(file); + console.log(docs1[0]); + + // Unclean JSON + const reader2 = new JSONReader({ cleanJson: false }); + const docs2 = await reader2.loadData(file); + console.log(docs2[0]); + + // Depth first yield of JSON structural paths, going back 2 levels + const reader3 = new JSONReader({ levelsBack: 2 }); + const docs3 = await reader3.loadData(file); + console.log(docs3[0]); + + // Depth first yield of all levels + const reader4 = new JSONReader({ levelsBack: 0 }); + const docs4 = await reader4.loadData(file); + console.log(docs4[0]); + + // Depth first yield of all levels, collapse structural paths below length 100 + const reader5 = new JSONReader({ levelsBack: 0, collapseLength: 100 }); + const docs5 = await reader5.loadData(file); + console.log(docs5[0]); + + // Convert non-ASCII characters to unicode escape sequences + const reader6 = new JSONReader({ ensureAscii: true }); + const docs6 = await reader6.loadDataAsContent(nonAsciiBuffer); + console.log(docs6[0]); + + // JSON Lines Format + const reader7 = new JSONReader({ isJsonLines: true }); + const docs7 = await reader7.loadDataAsContent(jsonlBuffer); + console.log(docs7[0]); +} + +main().catch(console.error); diff --git a/packages/llamaindex/src/readers/JSONReader.ts b/packages/llamaindex/src/readers/JSONReader.ts new file mode 100644 
index 0000000000000000000000000000000000000000..fb0333a4a8924e83f8158f9b43afc3bdab28da25 --- /dev/null +++ b/packages/llamaindex/src/readers/JSONReader.ts @@ -0,0 +1,306 @@ +import type { JSONValue } from "@llamaindex/core/global"; +import { Document } from "@llamaindex/core/schema"; +import { FileReader } from "./type.js"; + +export interface JSONReaderOptions { + /** + * Whether to ensure only ASCII characters. + * Converts non-ASCII characters to their unicode escape sequence. + * @default false + */ + ensureAscii?: boolean; + + /** + * Whether the JSON is in JSON Lines format. + * Split into lines, remove empty lines, parse each line as JSON. + * @default false + */ + isJsonLines?: boolean; + + /** + * Whether to clean the JSON by filtering out structural characters (`{}, [], and ,`). + * If set to false, it will just parse the JSON, not removing structural characters. + * @default true + */ + cleanJson?: boolean; + + /** + * Specifies how many levels up the JSON structure to include in the output. cleanJson will be ignored. + * If set to 0, all levels are included. If undefined, parses the entire JSON and treats each line as an embedding. + * @default undefined + */ + levelsBack?: number; + + /** + * The maximum length of JSON string representation to be collapsed into a single line. + * Only applicable when `levelsBack` is set. + * @default undefined + */ + collapseLength?: number; +} + +export class JSONReaderError extends Error {} +export class JSONParseError extends JSONReaderError {} +export class JSONStringifyError extends JSONReaderError {} + +/** + * A reader that reads JSON data and returns an array of Document objects. + * Supports various options to modify the output. 
+ */ +export class JSONReader<T extends JSONValue> extends FileReader { + private options: JSONReaderOptions; + + constructor(options: JSONReaderOptions = {}) { + super(); + this.options = { + ensureAscii: false, + isJsonLines: false, + cleanJson: true, + ...options, + }; + this.validateOptions(); + } + private validateOptions(): void { + const { levelsBack, collapseLength } = this.options; + if (levelsBack !== undefined && levelsBack < 0) { + throw new JSONReaderError("levelsBack must not be negative"); + } + if (collapseLength !== undefined && collapseLength < 0) { + throw new JSONReaderError("collapseLength must not be negative"); + } + } + + /** + * Loads JSON data and returns an array of Document objects. + * + * @param {Uint8Array} content - The JSON data as a Uint8Array. + * @return {Promise<Document[]>} A Promise that resolves to an array of Document objects. + */ + async loadDataAsContent(content: Uint8Array): Promise<Document[]> { + const jsonStr = new TextDecoder("utf-8").decode(content); + const parser = this.parseJsonString(jsonStr); + const documents: Document[] = []; + + for await (const data of parser) { + documents.push(await this.createDocument(data)); + } + return documents; + } + + private async *parseJsonString(jsonStr: string): AsyncGenerator<T> { + if (this.options.isJsonLines) { + yield* this.parseJsonLines(jsonStr); + } else { + yield* this.parseJson(jsonStr); + } + } + + private async *parseJsonLines(jsonStr: string): AsyncGenerator<T> { + // Process each line as a separate JSON object for JSON Lines format + for (const line of jsonStr.split("\n")) { + if (line.trim() !== "") { + try { + yield JSON.parse(line.trim()); + } catch (e) { + throw new JSONParseError( + `Error parsing JSON Line: ${e} in "${line.trim()}"`, + ); + } + } + } + } + + private async *parseJson(jsonStr: string): AsyncGenerator<T> { + try { + // TODO: Add streaming to handle large JSON files + const parsedData = JSON.parse(jsonStr); + + if (!this.options.cleanJson) { + 
// Yield the parsed data directly if cleanJson is false + yield parsedData; + } else if (Array.isArray(parsedData)) { + // Check if it's an Array, if so yield each item separately, i.e. create a document per top-level array of the json + for (const item of parsedData) { + yield item; + } + } else { + // If not an array, just yield the parsed data + yield parsedData; + } + } catch (e) { + throw new JSONParseError(`Error parsing JSON: ${e} in "${jsonStr}"`); + } + } + + private async createDocument(data: T): Promise<Document> { + const docText: string = + this.options.levelsBack === undefined + ? this.formatJsonString(data) + : await this.prepareDepthFirstYield(data); + + return new Document({ + text: this.options.ensureAscii ? this.convertToAscii(docText) : docText, + metadata: { + doc_length: docText.length, + traversal_data: { + levels_back: this.options.levelsBack, + collapse_length: this.options.collapseLength, + }, + }, + }); + } + + private async prepareDepthFirstYield(data: T): Promise<string> { + const levelsBack = this.options.levelsBack ?? 0; + const results: string[] = []; + for await (const value of this.depthFirstYield( + data, + levelsBack === 0 ? Infinity : levelsBack, + [], + this.options.collapseLength, + )) { + results.push(value); + } + return results.join("\n"); + } + + // Note: JSON.stringify does not differentiate between indent "undefined/null"(= no whitespaces) and "0"(= no whitespaces, but linebreaks) + // as Python's json.dumps does. That's why we use indent 1 and remove the leading spaces. + + private formatJsonString(data: T): string { + try { + const jsonStr = JSON.stringify( + data, + null, + this.options.cleanJson ? 
1 : 0, + ); + if (this.options.cleanJson) { + // Clean JSON by removing structural characters and unnecessary whitespace + return jsonStr + .split("\n") + .filter((line) => !/^[{}\[\],]*$/.test(line.trim())) + .map((line) => line.trimStart()) // Removes the indent + .join("\n"); + } + return jsonStr; + } catch (e) { + throw new JSONStringifyError( + `Error stringifying JSON: ${e} in "${data}"`, + ); + } + } + + /** + * A generator function that determines the next step in traversing the JSON data. + * If the serialized JSON string is not null, it yields the string and returns. + * If the JSON data is an object, it delegates the traversal to the depthFirstTraversal method. + * Otherwise, it yields the JSON data as a string. + * + * @param jsonData - The JSON data to traverse. + * @param levelsBack - The number of levels up the JSON structure to include in the output. + * @param path - The current path in the JSON structure. + * @param collapseLength - The maximum length of JSON string representation to be collapsed into a single line. + * @throws {JSONReaderError} - Throws an error if there is an issue during the depth-first traversal. 
+ */ + private async *depthFirstYield( + jsonData: T, + levelsBack: number, + path: string[], + collapseLength?: number, + ): AsyncGenerator<string> { + try { + const jsonStr = this.serializeAndCollapse( + jsonData, + levelsBack, + path, + collapseLength, + ); + if (jsonStr !== null) { + yield jsonStr; + return; + } + + if (jsonData !== null && typeof jsonData === "object") { + yield* this.depthFirstTraversal( + jsonData, + levelsBack, + path, + collapseLength, + ); + } else { + yield `${path.slice(-levelsBack).join(" ")} ${String(jsonData)}`; + } + } catch (e) { + throw new JSONReaderError( + `Error during depth first traversal at path ${path.join(" ")}: ${e}`, + ); + } + } + + private serializeAndCollapse( + jsonData: T, + levelsBack: number, + path: string[], + collapseLength?: number, + ): string | null { + try { + const jsonStr = JSON.stringify(jsonData); + return collapseLength !== undefined && jsonStr.length <= collapseLength + ? `${path.slice(-levelsBack).join(" ")} ${jsonStr}` + : null; + } catch (e) { + throw new JSONStringifyError(`Error stringifying JSON data: ${e}`); + } + } + /** + * A generator function that performs a depth-first traversal of the JSON data. + * If the JSON data is an array, it traverses each item in the array. + * If the JSON data is an object, it traverses each key-value pair in the object. + * For each traversed item or value, it performs a depth-first yield. + * + * @param jsonData - The JSON data to traverse. + * @param levelsBack - The number of levels up the JSON structure to include in the output. + * @param path - The current path in the JSON structure. + * @param collapseLength - The maximum length of JSON string representation to be collapsed into a single line. + * @throws {JSONReaderError} - Throws an error if there is an issue during the depth-first traversal of the object. 
+ */ + private async *depthFirstTraversal( + jsonData: T, + levelsBack: number, + path: string[], + collapseLength?: number, + ): AsyncGenerator<string> { + try { + if (Array.isArray(jsonData)) { + for (const item of jsonData) { + yield* this.depthFirstYield(item, levelsBack, path, collapseLength); + } + } else if (jsonData !== null && typeof jsonData === "object") { + const originalLength = path.length; + for (const [key, value] of Object.entries(jsonData)) { + path.push(key); + if (value !== null) { + yield* this.depthFirstYield( + value as T, + levelsBack, + path, + collapseLength, + ); + } + path.length = originalLength; // Reset path length to original. Avoids cloning the path array every time. + } + } + } catch (e) { + throw new JSONReaderError( + `Error during depth-first traversal of object: ${e}`, + ); + } + } + + private convertToAscii(str: string): string { + return str.replace( + /[\u007F-\uFFFF]/g, + (char) => `\\u${char.charCodeAt(0).toString(16).padStart(4, "0")}`, + ); + } +} diff --git a/packages/llamaindex/src/readers/index.ts b/packages/llamaindex/src/readers/index.ts index c384ba323501fff38528ef1ce96408214331e87a..cc83dfb9fe05fe9b5920cbdef219c600622627f4 100644 --- a/packages/llamaindex/src/readers/index.ts +++ b/packages/llamaindex/src/readers/index.ts @@ -4,6 +4,7 @@ export * from "./DiscordReader.js"; export * from "./DocxReader.js"; export * from "./HTMLReader.js"; export * from "./ImageReader.js"; +export * from "./JSONReader.js"; export * from "./LlamaParseReader.js"; export * from "./MarkdownReader.js"; export * from "./NotionReader.js"; diff --git a/packages/llamaindex/tests/readers/JSONReader.test.ts b/packages/llamaindex/tests/readers/JSONReader.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..7673baef63cef045a6a6015db8a9ff2ebd6eaa47 --- /dev/null +++ b/packages/llamaindex/tests/readers/JSONReader.test.ts @@ -0,0 +1,135 @@ +import { + JSONParseError, + JSONReader, + JSONReaderError, + type JSONValue, +} from 
"llamaindex"; +import { beforeEach, describe, expect, it } from "vitest"; + +const content = new TextEncoder().encode( + '{"a": {"1": {"key1": "value1"}, "2": {"key2": "value2"}}, "b": {"c": "d"}}', +); + +describe("JSONReader", () => { + let reader: JSONReader<JSONValue>; + + beforeEach(() => { + reader = new JSONReader(); + }); + + describe("constructor", () => { + it("should set default options", () => { + expect(reader["options"]).toEqual({ + ensureAscii: false, + isJsonLines: false, + cleanJson: true, + }); + }); + + it("should validate options", () => { + expect(() => new JSONReader({ levelsBack: -1 })).toThrow(JSONReaderError); + expect(() => new JSONReader({ collapseLength: -1 })).toThrow( + JSONReaderError, + ); + }); + }); + + describe("loadDataAsContent", () => { + it("should load and parse valid JSON content", async () => { + const docs = await reader.loadDataAsContent(content); + expect(docs).toHaveLength(1); + expect(docs[0].text).toContain('"key1": "value1"'); + }); + + it("should throw JSONParseError for invalid JSON content", async () => { + const content = new TextEncoder().encode("invalid json"); + await expect(reader.loadDataAsContent(content)).rejects.toThrow( + JSONParseError, + ); + }); + }); + + describe("isJsonLines option", () => { + it("should handle JSON Lines format", async () => { + reader = new JSONReader({ isJsonLines: true }); + const content = new TextEncoder().encode( + '{"key1": "value1"}\n{"key2": "value2"}\n', + ); + const docs = await reader.loadDataAsContent(content); + expect(docs).toHaveLength(2); + expect(docs[0].text).toBe('"key1": "value1"'); + expect(docs[1].text).toBe('"key2": "value2"'); + }); + + it("should skip empty lines in JSON Lines format", async () => { + reader = new JSONReader({ isJsonLines: true }); + const content = new TextEncoder().encode( + '{"key1": "value1"}\n\n{"key2": "value2"}\n', + ); + const docs = await reader.loadDataAsContent(content); + expect(docs).toHaveLength(2); + 
expect(docs[0].text).toBe('"key1": "value1"'); + expect(docs[1].text).toBe('"key2": "value2"'); + }); + }); + + describe("ensureAscii option", () => { + it("should convert non-ASCII characters to unicode escape sequences", async () => { + reader = new JSONReader({ ensureAscii: true }); + const content = new TextEncoder().encode('{"key": "valüe"}'); + const docs = await reader.loadDataAsContent(content); + expect(docs[0].text).toBe('"key": "val\\u00fce"'); + }); + + it("should not alter ASCII characters", async () => { + reader = new JSONReader({ ensureAscii: true }); + const content = new TextEncoder().encode('{"key": "value"}'); + const docs = await reader.loadDataAsContent(content); + expect(docs[0].text).toBe('"key": "value"'); + }); + }); + + describe("levelsBack option", () => { + it("should create document with levelsBack option", async () => { + reader = new JSONReader({ levelsBack: 1 }); + const docs = await reader.loadDataAsContent(content); + expect(docs[0].text).toContain("key1 value1"); + expect(docs[0].text).toContain("c d"); + }); + + it("should traverse all levels with levelsBack 0", async () => { + reader = new JSONReader({ levelsBack: 0 }); + const docs = await reader.loadDataAsContent(content); + expect(docs[0].text).toContain("a 1 key1 value1"); + expect(docs[0].text).toContain("a 2 key2 value2"); + expect(docs[0].text).toContain("b c d"); + }); + }); + describe("collapseLength option", () => { + it("should collapse values based on collapseLength", async () => { + reader = new JSONReader({ collapseLength: 10, levelsBack: 0 }); + const docs = await reader.loadDataAsContent(content); + expect(docs[0].text).toContain('a 1 key1 "value1"'); + expect(docs[0].text).toContain('b {"c":"d"}'); + expect(docs[0].metadata.traversal_data.collapse_length).toBe(10); + expect(docs[0].metadata.traversal_data.levels_back).toBe(0); + }); + }); + + describe("cleanJson option", () => { + it("should remove JSON structural characters", async () => { + reader = new 
JSONReader({ cleanJson: true }); + const docs = await reader.loadDataAsContent(content); + expect(docs[0].text).toContain('"key1": "value1"'); + expect(docs[0].text).toContain('"a": {'); + }); + + it("should not remove JSON structural characters, but white spaces", async () => { + reader = new JSONReader({ cleanJson: false }); + const docs = await reader.loadDataAsContent(content); + expect(docs[0].text).toBe( + '{"a":{"1":{"key1":"value1"},"2":{"key2":"value2"}},"b":{"c":"d"}}', + ); + }); + }); +});