From 9015aea5274296d7b68ef84f3d1f5ff9650002e2 Mon Sep 17 00:00:00 2001
From: Fabian Wimmer <github@insightby.ai>
Date: Tue, 25 Jun 2024 20:16:27 +0200
Subject: [PATCH] docs: LlamaParse JSON + SimpleDirectoryReader (#970)

---
 .../data_loaders/llama_parse/json_mode.mdx | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/json_mode.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/json_mode.mdx
index 6254ae26c..838354e7b 100644
--- a/apps/docs/docs/modules/data_loaders/llama_parse/json_mode.mdx
+++ b/apps/docs/docs/modules/data_loaders/llama_parse/json_mode.mdx
@@ -54,6 +54,42 @@ Within page objects, the following keys may be present depending on your documen
 - `images`: Any images extracted from the page.
 - `items`: An array of heading, text and table objects in the order they appear on the page.
 
+### JSON Mode with SimpleDirectoryReader
+
+All Readers share a `loadData` method with `SimpleDirectoryReader` that promises to return a uniform `Document` with metadata. JSON mode returns raw JSON objects instead, which makes it incompatible with `SimpleDirectoryReader`.
+
+However, a simple workaround is to create a new reader class that extends `LlamaParseReader` and adds a new method or overrides `loadData`, wrapping around JSON mode, extracting the required values, and returning a `Document` object.
+
+```ts
+import { LlamaParseReader, Document } from "llamaindex";
+
+class LlamaParseReaderWithJson extends LlamaParseReader {
+  // Override loadData to return one Document per parsed page, built from the JSON-mode result
+  override async loadData(filePath: string): Promise<Document[]> {
+    // Call the loadJson method inherited from LlamaParseReader
+    const jsonObjs = await super.loadJson(filePath);
+    let documents: Document[] = [];
+
+    jsonObjs.forEach((jsonObj) => {
+      // Only map over pages when the result actually contains a pages array; skip it otherwise
+      if (Array.isArray(jsonObj.pages)) {
+        const docs = jsonObj.pages.map(
+          (page: { text: string; page: number }) =>
+            new Document({ text: page.text, metadata: { page: page.page } }),
+        );
+        documents = documents.concat(docs);
+      }
+    });
+    return documents;
+  }
+}
+```
+
+Now we have documents with page number as metadata. This new reader can be used like any other and be integrated with SimpleDirectoryReader. Since it extends `LlamaParseReader`, you can use the same params.
+
+You can assign any other values of the JSON response to the Document as needed.
+
 ## API Reference
 
 - [LlamaParseReader](../../../api/classes/LlamaParseReader.md)
+- [SimpleDirectoryReader](../../../api/classes/SimpleDirectoryReader.md)
-- 
GitLab