From a1a72ab22382c3e6f5094d10b3f451d96db1b873 Mon Sep 17 00:00:00 2001
From: Fabian Wimmer <github@insightby.ai>
Date: Thu, 9 May 2024 04:51:01 +0200
Subject: [PATCH] feat: LlamaParseReader: update Supported File Types to match
 python version (#823)

---
 packages/core/src/readers/LlamaParseReader.ts | 65 ++++++++++++++++---
 1 file changed, 56 insertions(+), 9 deletions(-)

diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts
index 817611743..327878c4c 100644
--- a/packages/core/src/readers/LlamaParseReader.ts
+++ b/packages/core/src/readers/LlamaParseReader.ts
@@ -3,6 +3,49 @@ import { filetypemime } from "magic-bytes.js";
 import { Document } from "../Node.js";
 import type { FileReader, Language, ResultType } from "./type.js";
 
+const SupportedFiles: { [key: string]: string } = { // keys: file extensions incl. leading dot; values: MIME types
+  ".pdf": "application/pdf",
+  ".doc": "application/msword",
+  ".docx":
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+  ".docm": "application/vnd.ms-word.document.macroEnabled.12",
+  ".dot": "application/msword", // same MIME type as ".doc"
+  ".dotx":
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+  ".dotm": "application/vnd.ms-word.template.macroEnabled.12",
+  ".rtf": "application/rtf",
+  ".wps": "application/vnd.ms-works",
+  ".wpd": "application/wordperfect",
+  ".sxw": "application/vnd.sun.xml.writer",
+  ".stw": "application/vnd.sun.xml.writer.template",
+  ".sxg": "application/vnd.sun.xml.writer.global",
+  ".pages": "application/x-iwork-pages-sffpages",
+  ".mw": "application/macwriteii",
+  ".mcw": "application/macwriteii", // same MIME type as ".mw"
+  ".uot": "application/x-uo",
+  ".uof": "application/vnd.uoml+xml",
+  ".uos": "application/vnd.sun.xml.calc",
+  ".uop": "application/vnd.openofficeorg.presentation",
+  ".ppt": "application/vnd.ms-powerpoint",
+  ".pptx":
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+  ".pot": "application/vnd.ms-powerpoint", // same MIME type as ".ppt"
+  ".pptm": "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
+  ".potx":
+    "application/vnd.openxmlformats-officedocument.presentationml.template",
+  ".potm": "application/vnd.ms-powerpoint.template.macroEnabled.12",
+  ".key": "application/x-iwork-keynote-sffkey",
+  ".odp": "application/vnd.oasis.opendocument.presentation",
+  ".odg": "application/vnd.oasis.opendocument.graphics",
+  ".otp": "application/vnd.oasis.opendocument.presentation-template",
+  ".fopd": "application/vnd.oasis.opendocument.presentation", // same MIME type as ".odp"
+  ".sxi": "application/vnd.sun.xml.impress",
+  ".sti": "application/vnd.sun.xml.impress.template",
+  ".epub": "application/epub+zip",
+  ".html": "text/html",
+  ".htm": "text/html", // same MIME type as ".html"
+}; // getMimeType validates detected types against the values; the keys are only listed in its error message
+
 /**
  * Represents a reader for parsing files using the LlamaParse API.
  * See https://github.com/run-llama/llama_parse
@@ -40,15 +83,12 @@ export class LlamaParseReader implements FileReader {
     file: string,
     fs: GenericFileSystem = defaultFS,
   ): Promise<Document[]> {
-    if (!file.endsWith(".pdf")) {
-      throw new Error("Currently, only PDF files are supported.");
-    }
-
     const metadata = { file_path: file };
 
     // Load data, set the mime type
     const data = await fs.readRawFile(file);
     const mimeType = await this.getMimeType(data);
+
     const body = new FormData();
     body.set("file", new Blob([data], { type: mimeType }), file);
     body.append("language", this.language);
@@ -67,7 +107,7 @@ export class LlamaParseReader implements FileReader {
       headers,
     });
     if (!response.ok) {
-      throw new Error(`Failed to parse the PDF file: ${await response.text()}`);
+      throw new Error(`Failed to parse the file: ${await response.text()}`);
     }
     const jsonResponse = await response.json();
 
@@ -94,7 +134,7 @@ export class LlamaParseReader implements FileReader {
         const end = Date.now();
         if (end - start > this.maxTimeout * 1000) {
           throw new Error(
-            `Timeout while parsing the PDF file: ${await response.text()}`,
+            `Timeout while parsing the file: ${await response.text()}`,
           );
         }
         if (this.verbose && tries % 10 === 0) {
@@ -116,9 +156,16 @@ export class LlamaParseReader implements FileReader {
 
   private async getMimeType(data: Buffer): Promise<string> {
     const mimes = filetypemime(data);
-    if (!mimes.includes("application/pdf")) {
-      throw new Error("Currently, only PDF files are supported.");
+    const validMime = mimes.find((mime) => // first detected MIME type that is one of the SupportedFiles values
+      Object.values(SupportedFiles).includes(mime),
+    );
+    if (!validMime) { // none of the detected types is supported
+      const supportedExtensions = Object.keys(SupportedFiles).join(", ");
+      throw new Error( // "${mimes}" interpolates the array, i.e. the detected types joined by commas
+        `File has type "${mimes}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`,
+      );
     }
-    return "application/pdf";
+
+    return validMime;
   }
 }
-- 
GitLab