From 4c38c1be0b4d0b907d54e65941f6c4d3e8cf55c7 Mon Sep 17 00:00:00 2001
From: Alex Yang <himself65@outlook.com>
Date: Fri, 18 Oct 2024 09:36:01 -0700
Subject: [PATCH] fix: do not detect file type in sdk (#1340)

---
 .changeset/funny-dancers-listen.md            |   6 +
 packages/cloud/package.json                   |   3 -
 packages/cloud/src/reader.ts                  | 121 +-----------------
 .../tests/readers/llama-parser-reader.test.ts |  15 ---
 pnpm-lock.yaml                                |  39 +-----
 5 files changed, 9 insertions(+), 175 deletions(-)
 create mode 100644 .changeset/funny-dancers-listen.md
 delete mode 100644 packages/llamaindex/tests/readers/llama-parser-reader.test.ts

diff --git a/.changeset/funny-dancers-listen.md b/.changeset/funny-dancers-listen.md
new file mode 100644
index 000000000..b1b6d5530
--- /dev/null
+++ b/.changeset/funny-dancers-listen.md
@@ -0,0 +1,6 @@
+---
+"@llamaindex/cloud": patch
+"llamaindex": patch
+---
+
+fix(cloud): do not detect file type in llama parse
diff --git a/packages/cloud/package.json b/packages/cloud/package.json
index e30419c71..8e2377417 100644
--- a/packages/cloud/package.json
+++ b/packages/cloud/package.json
@@ -58,8 +58,5 @@
   "peerDependencies": {
     "@llamaindex/core": "workspace:*",
     "@llamaindex/env": "workspace:*"
-  },
-  "dependencies": {
-    "magic-bytes.js": "^1.10.0"
   }
 }
diff --git a/packages/cloud/src/reader.ts b/packages/cloud/src/reader.ts
index 88c0fac38..3a2c9a946 100644
--- a/packages/cloud/src/reader.ts
+++ b/packages/cloud/src/reader.ts
@@ -1,7 +1,6 @@
 import { type Client, createClient, createConfig } from "@hey-api/client-fetch";
 import { Document, FileReader } from "@llamaindex/core/schema";
 import { fs, getEnv, path } from "@llamaindex/env";
-import { filetypeinfo } from "magic-bytes.js";
 import {
   type Body_upload_file_api_v1_parsing_upload_post,
   type ParserLanguages,
@@ -13,99 +12,6 @@ export type Language = ParserLanguages;
 
 export type ResultType = "text" | "markdown" | "json";
 
-const SUPPORT_FILE_EXT: string[] = [
-  ".pdf",
-  // document and presentations
-  ".602",
-  ".abw",
-  ".cgm",
-  ".cwk",
-  ".doc",
-  ".docx",
-  ".docm",
-  ".dot",
-  ".dotm",
-  ".hwp",
-  ".key",
-  ".lwp",
-  ".mw",
-  ".mcw",
-  ".pages",
-  ".pbd",
-  ".ppt",
-  ".pptm",
-  ".pptx",
-  ".pot",
-  ".potm",
-  ".potx",
-  ".rtf",
-  ".sda",
-  ".sdd",
-  ".sdp",
-  ".sdw",
-  ".sgl",
-  ".sti",
-  ".sxi",
-  ".sxw",
-  ".stw",
-  ".sxg",
-  ".txt",
-  ".uof",
-  ".uop",
-  ".uot",
-  ".vor",
-  ".wpd",
-  ".wps",
-  ".xml",
-  ".zabw",
-  ".epub",
-  // images
-  ".jpg",
-  ".jpeg",
-  ".png",
-  ".gif",
-  ".bmp",
-  ".svg",
-  ".tiff",
-  ".webp",
-  // web
-  ".htm",
-  ".html",
-  // spreadsheets
-  ".xlsx",
-  ".xls",
-  ".xlsm",
-  ".xlsb",
-  ".xlw",
-  ".csv",
-  ".dif",
-  ".sylk",
-  ".slk",
-  ".prn",
-  ".numbers",
-  ".et",
-  ".ods",
-  ".fods",
-  ".uos1",
-  ".uos2",
-  ".dbf",
-  ".wk1",
-  ".wk2",
-  ".wk3",
-  ".wk4",
-  ".wks",
-  ".123",
-  ".wq1",
-  ".wq2",
-  ".wb1",
-  ".wb2",
-  ".wb3",
-  ".qpw",
-  ".xlr",
-  ".eth",
-  ".tsv",
-];
-
 //todo: should move into @llamaindex/env
 type WriteStream = {
   write: (text: string) => void;
@@ -239,17 +145,12 @@ export class LlamaParseReader extends FileReader {
 
   // Create a job for the LlamaParse API
   private async createJob(data: Uint8Array): Promise<string> {
-    // Load data, set the mime type
-    const { mime } = await LlamaParseReader.getMimeType(data);
-
     if (this.verbose) {
       console.log("Started uploading the file");
     }
 
     const body = {
-      file: new Blob([data], {
-        type: mime,
-      }),
+      file: new Blob([data]),
       language: this.language,
       parsing_instruction: this.parsingInstruction,
       skip_diagonal_text: this.skipDiagonalText,
@@ -564,24 +465,4 @@ export class LlamaParseReader extends FileReader {
         }),
     );
   }
-
-  static async getMimeType(
-    data: Uint8Array,
-  ): Promise<{ mime: string; extension: string }> {
-    const typeinfos = filetypeinfo(data);
-    // find the first type info that matches the supported MIME types
-    // It could be happened that docx file is recognized as zip file, so we need to check the mime type
-    const info = typeinfos.find((info) => {
-      if (info.extension && SUPPORT_FILE_EXT.includes(`.${info.extension}`)) {
-        return info;
-      }
-    });
-    if (!info || !info.mime || !info.extension) {
-      const ext = SUPPORT_FILE_EXT.join(", ");
-      throw new Error(
-        `File has type which does not match supported MIME Types. Supported formats include: ${ext}`,
-      );
-    }
-    return { mime: info.mime, extension: info.extension };
-  }
 }
diff --git a/packages/llamaindex/tests/readers/llama-parser-reader.test.ts b/packages/llamaindex/tests/readers/llama-parser-reader.test.ts
deleted file mode 100644
index b43cf9915..000000000
--- a/packages/llamaindex/tests/readers/llama-parser-reader.test.ts
+++ /dev/null
@@ -1,15 +0,0 @@
-import { LlamaParseReader } from "llamaindex";
-import { readFile } from "node:fs/promises";
-import { join } from "node:path";
-import { fileURLToPath } from "node:url";
-import { expect, test } from "vitest";
-
-const fixturesDir = fileURLToPath(new URL("./fixtures", import.meta.url));
-
-test("file type should be detected correctly", async () => {
-  const xlsx = join(fixturesDir, "test.xlsx");
-  const buffer = await readFile(xlsx);
-  const { mime, extension } = await LlamaParseReader.getMimeType(buffer);
-  expect(mime).toBe("application/vnd.oasis.opendocument.spreadsheet");
-  expect(extension).toBe("ods");
-});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 0067bb768..f3f66ce67 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -348,10 +348,6 @@ importers:
         version: 5.6.2
 
   packages/cloud:
-    dependencies:
-      magic-bytes.js:
-        specifier: ^1.10.0
-        version: 1.10.0
     devDependencies:
       '@hey-api/client-fetch':
         specifier: ^0.2.4
@@ -6568,7 +6564,6 @@ packages:
   eslint@8.57.0:
     resolution: {integrity: sha512-dZ6+mexnaTIbSBZWgou51U6OmzIhYM2VcNdtiTtI7qPNZm35Akpr0f6vtw3w1Kmn5PYo+tZVfh13WrhpS6oLqQ==}
     engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
-    deprecated: This version is no longer supported. Please see https://eslint.org/version-support for other options.
     hasBin: true
 
   eslint@9.10.0:
@@ -19534,7 +19529,7 @@ snapshots:
       '@typescript-eslint/parser': 7.2.0(eslint@8.57.0)(typescript@5.6.2)
       eslint: 8.57.0
       eslint-import-resolver-node: 0.3.9
-      eslint-import-resolver-typescript: 3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0)
+      eslint-import-resolver-typescript: 3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0)
       eslint-plugin-import: 2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-typescript@3.6.3)(eslint@8.57.0)
       eslint-plugin-jsx-a11y: 6.9.0(eslint@8.57.0)
       eslint-plugin-react: 7.35.0(eslint@8.57.0)
@@ -19582,25 +19577,6 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
-  eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0):
-    dependencies:
-      '@nolyfill/is-core-module': 1.0.39
-      debug: 4.3.7
-      enhanced-resolve: 5.17.1
-      eslint: 8.57.0
-      eslint-module-utils: 2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0))(eslint@8.57.0)
-      fast-glob: 3.3.2
-      get-tsconfig: 4.8.0
-      is-bun-module: 1.1.0
-      is-glob: 4.0.3
-    optionalDependencies:
-      eslint-plugin-import: 2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-typescript@3.6.3)(eslint@8.57.0)
-    transitivePeerDependencies:
-      - '@typescript-eslint/parser'
-      - eslint-import-resolver-node
-      - eslint-import-resolver-webpack
-      - supports-color
-
   eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0):
     dependencies:
       '@nolyfill/is-core-module': 1.0.39
@@ -19620,17 +19596,6 @@ snapshots:
       - eslint-import-resolver-webpack
       - supports-color
 
-  eslint-module-utils@2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0))(eslint@8.57.0):
-    dependencies:
-      debug: 3.2.7
-    optionalDependencies:
-      '@typescript-eslint/parser': 7.2.0(eslint@8.57.0)(typescript@5.6.2)
-      eslint: 8.57.0
-      eslint-import-resolver-node: 0.3.9
-      eslint-import-resolver-typescript: 3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0)
-    transitivePeerDependencies:
-      - supports-color
-
   eslint-module-utils@2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0))(eslint@8.57.0):
     dependencies:
       debug: 3.2.7
@@ -19652,7 +19617,7 @@ snapshots:
       doctrine: 2.1.0
       eslint: 8.57.0
       eslint-import-resolver-node: 0.3.9
-      eslint-module-utils: 2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0))(eslint@8.57.0)
+      eslint-module-utils: 2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0))(eslint@8.57.0)
       hasown: 2.0.2
       is-core-module: 2.15.1
       is-glob: 4.0.3
-- 
GitLab