From 6b1ded41a9b0c2cdd113df3b7c9dfeb6db380653 Mon Sep 17 00:00:00 2001 From: Fabian Wimmer <github@insightby.ai> Date: Mon, 3 Jun 2024 17:22:18 +0200 Subject: [PATCH] feat: LlamaParse: add gpt4o-mode, invalidate cache, skip diagonal text, update supported file types (#889) Co-authored-by: Marcus Schiesser <marcus.schiesser@googlemail.com> --- .changeset/khaki-ears-develop.md | 5 + packages/core/src/readers/LlamaParseReader.ts | 120 ++++++++++++++---- 2 files changed, 103 insertions(+), 22 deletions(-) create mode 100644 .changeset/khaki-ears-develop.md diff --git a/.changeset/khaki-ears-develop.md b/.changeset/khaki-ears-develop.md new file mode 100644 index 000000000..9c9c10e65 --- /dev/null +++ b/.changeset/khaki-ears-develop.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +add gpt4o-mode, invalidate cache and skip diagonal text to LlamaParseReader diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts index 2996a589e..5bc4934b1 100644 --- a/packages/core/src/readers/LlamaParseReader.ts +++ b/packages/core/src/readers/LlamaParseReader.ts @@ -5,45 +5,100 @@ import type { FileReader, Language, ResultType } from "./type.js"; const SupportedFiles: { [key: string]: string } = { ".pdf": "application/pdf", + // Documents and Presentations + ".602": "application/x-t602", + ".abw": "application/x-abiword", + ".cgm": "image/cgm", + ".cwk": "application/x-cwk", ".doc": "application/msword", ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ".docm": "application/vnd.ms-word.document.macroEnabled.12", ".dot": "application/msword", + ".dotm": "application/vnd.ms-word.template.macroEnabled.12", ".dotx": "application/vnd.openxmlformats-officedocument.wordprocessingml.template", - ".dotm": "application/vnd.ms-word.template.macroEnabled.12", - ".rtf": "application/rtf", - ".wps": "application/vnd.ms-works", - ".wpd": "application/wordperfect", - ".sxw": "application/vnd.sun.xml.writer", - 
".stw": "application/vnd.sun.xml.writer.template", - ".sxg": "application/vnd.sun.xml.writer.global", - ".pages": "application/x-iwork-pages-sffpages", + ".hwp": "application/x-hwp", + ".key": "application/x-iwork-keynote-sffkey", + ".lwp": "application/vnd.lotus-wordpro", ".mw": "application/macwriteii", ".mcw": "application/macwriteii", - ".uot": "application/x-uo", - ".uof": "application/vnd.uoml+xml", - ".uos": "application/vnd.sun.xml.calc", - ".uop": "application/vnd.openofficeorg.presentation", + ".pages": "application/x-iwork-pages-sffpages", + ".pbd": "application/x-pagemaker", ".ppt": "application/vnd.ms-powerpoint", + ".pptm": "application/vnd.ms-powerpoint.presentation.macroEnabled.12", ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", ".pot": "application/vnd.ms-powerpoint", - ".pptm": "application/vnd.ms-powerpoint.presentation.macroEnabled.12", + ".potm": "application/vnd.ms-powerpoint.template.macroEnabled.12", ".potx": "application/vnd.openxmlformats-officedocument.presentationml.template", - ".potm": "application/vnd.ms-powerpoint.template.macroEnabled.12", - ".key": "application/x-iwork-keynote-sffkey", - ".odp": "application/vnd.oasis.opendocument.presentation", - ".odg": "application/vnd.oasis.opendocument.graphics", - ".otp": "application/vnd.oasis.opendocument.presentation-template", - ".fopd": "application/vnd.oasis.opendocument.presentation", - ".sxi": "application/vnd.sun.xml.impress", + ".rtf": "application/rtf", + ".sda": "application/vnd.stardivision.draw", + ".sdd": "application/vnd.stardivision.impress", + ".sdp": "application/sdp", + ".sdw": "application/vnd.stardivision.writer", + ".sgl": "application/vnd.stardivision.writer", ".sti": "application/vnd.sun.xml.impress.template", + ".sxi": "application/vnd.sun.xml.impress", + ".sxw": "application/vnd.sun.xml.writer", + ".stw": "application/vnd.sun.xml.writer.template", + ".sxg": "application/vnd.sun.xml.writer.global", + ".txt": "text/plain", + 
".uof": "application/vnd.uoml+xml", + ".uop": "application/vnd.openofficeorg.presentation", + ".uot": "application/x-uo", + ".vor": "application/vnd.stardivision.writer", + ".wpd": "application/wordperfect", + ".wps": "application/vnd.ms-works", + ".xml": "application/xml", + ".zabw": "application/x-abiword", + // Images ".epub": "application/epub+zip", - ".html": "text/html", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".bmp": "image/bmp", + ".svg": "image/svg+xml", + ".tiff": "image/tiff", + ".webp": "image/webp", + // Web ".htm": "text/html", + ".html": "text/html", + // Spreadsheets + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".xls": "application/vnd.ms-excel", + ".xlsm": "application/vnd.ms-excel.sheet.macroEnabled.12", + ".xlsb": "application/vnd.ms-excel.sheet.binary.macroEnabled.12", + ".xlw": "application/vnd.ms-excel", + ".csv": "text/csv", + ".dif": "application/x-dif", + ".sylk": "text/vnd.sylk", + ".slk": "text/vnd.sylk", + ".prn": "application/x-prn", + ".numbers": "application/x-iwork-numbers-sffnumbers", + ".et": "application/vnd.ms-excel", + ".ods": "application/vnd.oasis.opendocument.spreadsheet", + ".fods": "application/vnd.oasis.opendocument.spreadsheet", + ".uos1": "application/vnd.uoml+xml", + ".uos2": "application/vnd.uoml+xml", + ".dbf": "application/vnd.dbf", + ".wk1": "application/vnd.lotus-1-2-3", + ".wk2": "application/vnd.lotus-1-2-3", + ".wk3": "application/vnd.lotus-1-2-3", + ".wk4": "application/vnd.lotus-1-2-3", + ".wks": "application/vnd.lotus-1-2-3", + ".123": "application/vnd.lotus-1-2-3", + ".wq1": "application/x-lotus", + ".wq2": "application/x-lotus", + ".wb1": "application/x-quattro-pro", + ".wb2": "application/x-quattro-pro", + ".wb3": "application/x-quattro-pro", + ".qpw": "application/x-quattro-pro", + ".xlr": "application/vnd.ms-works", + ".eth": "application/ethos", + ".tsv": "text/tab-separated-values", }; /** @@ -67,6 +122,14 @@ 
export class LlamaParseReader implements FileReader { language: Language = "en"; // The parsing instruction for the parser. parsingInstruction: string = ""; + // If set to true, the parser will ignore diagonal text (when the text rotation in degrees modulo 90 is not 0). + skipDiagonalText: boolean = false; + // If set to true, the cache will be ignored and the document re-processed. All documents are kept in cache for 48 hours after the job was completed to avoid processing the same document twice. + invalidateCache: boolean = false; + // Whether to use gpt-4o to extract text from documents. + gpt4oMode: boolean = false; + // The API key for the GPT-4o API. Lowers the cost of parsing. + gpt4oApiKey?: string; constructor(params: Partial<LlamaParseReader> = {}) { Object.assign(this, params); @@ -77,6 +140,13 @@ export class LlamaParseReader implements FileReader { ); } this.apiKey = params.apiKey; + + if (params.gpt4oMode) { + params.gpt4oApiKey = + params.gpt4oApiKey ?? getEnv("LLAMA_CLOUD_GPT4O_API_KEY"); + + this.gpt4oApiKey = params.gpt4oApiKey; + } } async loadData(file: string): Promise<Document[]> { @@ -89,7 +159,13 @@ export class LlamaParseReader implements FileReader { const body = new FormData(); body.set("file", new Blob([data], { type: mimeType }), file); body.append("language", this.language); - body.append("parsingInstruction", this.parsingInstruction); + body.append("parsing_instruction", this.parsingInstruction); + body.append("skip_diagonal_text", this.skipDiagonalText.toString()); + body.append("invalidate_cache", this.invalidateCache.toString()); + body.append("gpt4o_mode", this.gpt4oMode.toString()); + if (this.gpt4oMode && this.gpt4oApiKey) { + body.append("gpt4o_api_key", this.gpt4oApiKey); + } const headers = { Authorization: `Bearer ${this.apiKey}`, -- GitLab