diff --git a/examples/data/uber_10q_march_2022.pdf b/examples/data/uber_10q_march_2022.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b4997b19989e15e9eb8448fef20c8878bccbdd71 Binary files /dev/null and b/examples/data/uber_10q_march_2022.pdf differ diff --git a/examples/readers/package.json b/examples/readers/package.json index 47a7bb4e0fcdf51b468b7c1cc1aeedbcd68fc1f8..6bc38b464d34420ed9eb2ad90b9b2b86a4f105fe 100644 --- a/examples/readers/package.json +++ b/examples/readers/package.json @@ -11,7 +11,8 @@ "start:pdf": "node --import tsx ./src/pdf.ts", "start:llamaparse": "node --import tsx ./src/llamaparse.ts", "start:notion": "node --import tsx ./src/notion.ts", - "start:llamaparse-dir": "node --import tsx ./src/simple-directory-reader-with-llamaparse.ts" + "start:llamaparse-dir": "node --import tsx ./src/simple-directory-reader-with-llamaparse.ts", + "start:llamaparse-json": "node --import tsx ./src/llamaparse-json.ts" }, "dependencies": { "llamaindex": "*" diff --git a/examples/readers/src/llamaparse-json.ts b/examples/readers/src/llamaparse-json.ts new file mode 100644 index 0000000000000000000000000000000000000000..8f8e991f927162a25f1c7b49afdc4035cf775387 --- /dev/null +++ b/examples/readers/src/llamaparse-json.ts @@ -0,0 +1,30 @@ +import fs from "fs/promises"; +import { LlamaParseReader } from "llamaindex"; + +async function main() { + // Load PDF using LlamaParse json mode + const reader = new LlamaParseReader({ resultType: "json" }); + const jsonObjs = await reader.loadJson("../data/uber_10q_march_2022.pdf"); + + // Write the JSON objects to a file + try { + await fs.writeFile("jsonObjs.json", JSON.stringify(jsonObjs, null, 4)); + console.log("Array of JSON objects has been written to jsonObjs.json"); + } catch (e) { + console.error("Error writing jsonObjs.json", e); + } + + const jsonList = jsonObjs[0]["pages"]; + + // Write the list of JSON objects as a single array to a file + try { + await fs.writeFile("jsonList.json", JSON.stringify(jsonList, null, 4)); + console.log( + "List of JSON objects as single array has been written to jsonList.json", + ); + } catch (e) { + console.error("Error writing jsonList.json", e); + } +} + +main().catch(console.error); diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts index 25b4b51d622b6a28b802a5584f3191abc6d1541b..4ccd9e92c928a569c5fd1825c0c123fca6b9d3d0 100644 --- a/packages/core/src/readers/LlamaParseReader.ts +++ b/packages/core/src/readers/LlamaParseReader.ts @@ -162,10 +162,10 @@ export class LlamaParseReader extends FileReader { // Create a job for the LlamaParse API private async createJob(data: Buffer): Promise<string> { // Load data, set the mime type - const mimeType = await this.getMimeType(data); + const { mimeType, extension } = await this.getMimeType(data); if (this.verbose) { - console.log(`Starting load for file with mimeType: ${mimeType}`); + console.log(`Starting load for ${extension} file`); } const body = new FormData(); @@ -290,13 +290,13 @@ export class LlamaParseReader extends FileReader { ]; } /** - * Loads data from a file and returns its contents as a JSON object. + * Loads data from a file and returns an array of JSON objects. * To be used with resultType = "json" * * @param {string} file - The path to the file to be loaded. - * @return {Promise<Record<string, any>>} A Promise that resolves to the JSON object. + * @return {Promise<Record<string, any>[]>} A Promise that resolves to an array of JSON objects. */ - async loadJson(file: string): Promise<Record<string, any>> { + async loadJson(file: string): Promise<Record<string, any>[]> { const data = await fs.readFile(file); // Creates a job for the file const jobId = await this.createJob(data); @@ -304,11 +304,11 @@ export class LlamaParseReader extends FileReader { console.log(`Started parsing the file under job id ${jobId}`); } - // Return results as JSON object + // Return results as an array of JSON objects (same format as Python version of the reader) const resultJson = await this.getJobResult(jobId, "json"); resultJson.job_id = jobId; resultJson.file_path = file; - return resultJson; + return [resultJson]; } /** @@ -370,18 +370,19 @@ export class LlamaParseReader extends FileReader { return images; } - private async getMimeType(data: Buffer): Promise<string> { - const mimes = filetypemime(data); - const validMime = mimes.find((mime) => - Object.values(SupportedFiles).includes(mime), - ); - if (!validMime) { + private async getMimeType( + data: Buffer, + ): Promise<{ mimeType: string; extension: string }> { + const mimes = filetypemime(data); // Get an array of possible MIME types + const extension = Object.keys(SupportedFiles).find( + (ext) => SupportedFiles[ext] === mimes[0], + ); // Find the extension for the first MIME type + if (!extension) { const supportedExtensions = Object.keys(SupportedFiles).join(", "); throw new Error( - `File has type "${mimes}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`, + `File has type "${mimes[0]}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`, ); } - - return validMime; + return { mimeType: mimes[0], extension }; // Return the first MIME type and its corresponding extension } }