diff --git a/.changeset/violet-cows-fix.md b/.changeset/violet-cows-fix.md
new file mode 100644
index 0000000000000000000000000000000000000000..b006514c4741b01c34d1a5eef6de042683698aa6
--- /dev/null
+++ b/.changeset/violet-cows-fix.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Add reader for LlamaParse
diff --git a/examples/data/TOS.pdf b/examples/data/TOS.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..fdc44d375adcb6e012e7cfe00427d91c50deaa97
Binary files /dev/null and b/examples/data/TOS.pdf differ
diff --git a/examples/readers/package.json b/examples/readers/package.json
index a781b81e3f0813abed1ce711449698d485fc84d1..32a9bcb25448e2e90565ed072fd2ff651e387913 100644
--- a/examples/readers/package.json
+++ b/examples/readers/package.json
@@ -8,7 +8,8 @@
     "start:docx": "node --loader ts-node/esm ./src/docx.ts",
     "start:html": "node --loader ts-node/esm ./src/html.ts",
     "start:markdown": "node --loader ts-node/esm ./src/markdown.ts",
-    "start:pdf": "node --loader ts-node/esm ./src/pdf.ts"
+    "start:pdf": "node --loader ts-node/esm ./src/pdf.ts",
+    "start:llamaparse": "node --loader ts-node/esm ./src/llamaparse.ts"
   },
   "dependencies": {
     "llamaindex": "latest"
diff --git a/examples/readers/src/llamaparse.ts b/examples/readers/src/llamaparse.ts
new file mode 100644
index 0000000000000000000000000000000000000000..3a9e6b2875869cd67944a9f4111e2dea20b5b16b
--- /dev/null
+++ b/examples/readers/src/llamaparse.ts
@@ -0,0 +1,21 @@
+import { LlamaParseReader, VectorStoreIndex } from "llamaindex";
+
+async function main() {
+  // Load PDF using LlamaParse
+  const reader = new LlamaParseReader({ resultType: "markdown" });
+  const documents = await reader.loadData("../data/TOS.pdf");
+
+  // Split text and create embeddings. Store them in a VectorStoreIndex
+  const index = await VectorStoreIndex.fromDocuments(documents);
+
+  // Query the index
+  const queryEngine = index.asQueryEngine();
+  const response = await queryEngine.query({
+    query: "What is the license grant in the TOS?",
+  });
+
+  // Output response
+  console.log(response.toString());
+}
+
+main().catch(console.error);
diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts
new file mode 100644
index 0000000000000000000000000000000000000000..ba9c670539afb95a6d8ed8cf53f5bfce669ae1e0
--- /dev/null
+++ b/packages/core/src/readers/LlamaParseReader.ts
@@ -0,0 +1,156 @@
+import { Document } from "../Node";
+import { defaultFS } from "../env";
+import { GenericFileSystem } from "../storage/FileSystem";
+import { FileReader } from "./type";
+
+type ResultType = "text" | "markdown";
+
+/**
+ * Represents a reader for parsing files using the LlamaParse API.
+ * See https://github.com/run-llama/llama_parse
+ *
+ * Only PDF files are accepted: the file is uploaded to start a parsing job,
+ * and the job's result endpoint is then polled until it answers successfully.
+ */
+export class LlamaParseReader implements FileReader {
+  // The API key for the LlamaParse API.
+  apiKey: string;
+  // The base URL of the Llama Parsing API.
+  baseUrl: string = "https://api.cloud.llamaindex.ai/api/parsing";
+  // The maximum timeout in seconds to wait for the parsing to finish.
+  maxTimeout = 2000;
+  // The interval in seconds to check if the parsing is done.
+  checkInterval = 1;
+  // Whether to print the progress of the parsing.
+  verbose = true;
+  // The result format to request; also the key read from the result JSON.
+  resultType: ResultType = "text";
+
+  /**
+   * @param params Overrides for any of the public fields. When `apiKey` is
+   * omitted, the LLAMA_CLOUD_API_KEY environment variable is used instead.
+   * @throws Error if no API key is available from either source.
+   */
+  constructor(params: Partial<LlamaParseReader> = {}) {
+    Object.assign(this, params);
+    // Resolve the key into a local so the caller's object is never mutated.
+    const apiKey = params.apiKey ?? process.env.LLAMA_CLOUD_API_KEY;
+    if (!apiKey) {
+      throw new Error(
+        "API Key is required for LlamaParseReader. Please pass the apiKey parameter or set the LLAMA_CLOUD_API_KEY environment variable.",
+      );
+    }
+    this.apiKey = apiKey;
+  }
+
+  /**
+   * Uploads a PDF to LlamaParse and waits for the parsed result.
+   *
+   * @param file Path of the PDF file to parse.
+   * @param fs File system used to read the file.
+   * @returns A single Document holding the parsed content, with the file path
+   * stored in its metadata under `file_path`.
+   * @throws Error if the file is not a PDF, the upload is rejected, no job id
+   * is returned, or the result is not available before `maxTimeout` elapses.
+   */
+  async loadData(
+    file: string,
+    fs: GenericFileSystem = defaultFS,
+  ): Promise<Document[]> {
+    // Compare case-insensitively so that e.g. "REPORT.PDF" is accepted too.
+    if (!file.toLowerCase().endsWith(".pdf")) {
+      throw new Error("Currently, only PDF files are supported.");
+    }
+
+    const metadata = { file_path: file };
+
+    // Load data, set the mime type
+    const data = await fs.readRawFile(file);
+    const mimeType = await this.getMimeType(data);
+    const body = new FormData();
+    body.set("file", new Blob([data], { type: mimeType }), file);
+
+    const headers = {
+      Authorization: `Bearer ${this.apiKey}`,
+    };
+
+    // Send the request, start job
+    const url = `${this.baseUrl}/upload`;
+    let response = await fetch(url, {
+      signal: AbortSignal.timeout(this.maxTimeout * 1000),
+      method: "POST",
+      body,
+      headers,
+    });
+    if (!response.ok) {
+      throw new Error(`Failed to parse the PDF file: ${await response.text()}`);
+    }
+    const jsonResponse = await response.json();
+
+    // Check the status of the job, return when done
+    const jobId = jsonResponse.id;
+    if (!jobId) {
+      // Without an id the result URL would be invalid and polling could only
+      // end in a timeout, so fail right away instead.
+      throw new Error(
+        `Failed to parse the PDF file: no job id in response ${JSON.stringify(jsonResponse)}`,
+      );
+    }
+    if (this.verbose) {
+      console.log(`Started parsing the file under job id ${jobId}`);
+    }
+
+    const resultUrl = `${this.baseUrl}/job/${jobId}/result/${this.resultType}`;
+
+    const start = Date.now();
+    let tries = 0;
+    while (true) {
+      await new Promise((resolve) =>
+        setTimeout(resolve, this.checkInterval * 1000),
+      );
+      response = await fetch(resultUrl, {
+        headers,
+        signal: AbortSignal.timeout(this.maxTimeout * 1000),
+      });
+
+      if (!response.ok) {
+        // Any non-OK answer is treated as "not ready yet" until the deadline.
+        const end = Date.now();
+        if (end - start > this.maxTimeout * 1000) {
+          throw new Error(
+            `Timeout while parsing the PDF file: ${await response.text()}`,
+          );
+        }
+        if (this.verbose && tries % 10 === 0) {
+          process.stdout.write(".");
+        }
+        tries++;
+        continue;
+      }
+
+      const resultJson = await response.json();
+      return [
+        new Document({
+          text: resultJson[this.resultType],
+          metadata: metadata,
+        }),
+      ];
+    }
+  }
+
+  /**
+   * Determines the MIME type of the data via file-type's `fileTypeFromBuffer`.
+   *
+   * @param data Raw bytes of the file.
+   * @returns "application/pdf", the only accepted type.
+   * @throws Error if the content is not detected as a PDF.
+   */
+  private async getMimeType(data: Buffer): Promise<string> {
+    const { fileTypeFromBuffer } = await import("file-type");
+    const type = await fileTypeFromBuffer(data);
+    if (type?.mime !== "application/pdf") {
+      throw new Error("Currently, only PDF files are supported.");
+    }
+    return type.mime;
+  }
+}
diff --git a/packages/core/src/readers/index.ts b/packages/core/src/readers/index.ts
index 00b83bc2ab25f1e86cf3ef1615023a03c4b52c00..fc3411a17622b83660050591749898a29c416057 100644
--- a/packages/core/src/readers/index.ts
+++ b/packages/core/src/readers/index.ts
@@ -3,6 +3,7 @@ export * from "./CSVReader";
 export * from "./DocxReader";
 export * from "./HTMLReader";
 export * from "./ImageReader";
+export * from "./LlamaParseReader";
 export * from "./MarkdownReader";
 export * from "./NotionReader";
 export * from "./PDFReader";
diff --git a/packages/eslint-config-custom/index.js b/packages/eslint-config-custom/index.js
index db2ba0f50ec343acaaedfeaef71f657606f5485b..6920e9db112e299527574c2b8e3261c7b34fd092 100644
--- a/packages/eslint-config-custom/index.js
+++ b/packages/eslint-config-custom/index.js
@@ -6,6 +6,7 @@ module.exports = {
       "error",
       {
         allowList: [
+          "LLAMA_CLOUD_API_KEY",
           "OPENAI_API_KEY",
           "REPLICATE_API_TOKEN",
           "ANTHROPIC_API_KEY",