From ec90060d3698c8f471df0c8877c7d425f1cf6036 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Thu, 29 Feb 2024 10:05:03 -0800
Subject: [PATCH] Re-map some file mimes to support text (#842)

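Re-map some file extensions (e.g. `.ts`, which the `mime` package resolves to video/mp2t) to text/plain so those files are accepted as text.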
---
 collector/utils/files/index.js | 24 ++++++----------------
 collector/utils/files/mime.js  | 37 ++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 18 deletions(-)
 create mode 100644 collector/utils/files/mime.js

diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js
index 3e6ce3445..4bca62f9f 100644
--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@@ -1,28 +1,16 @@
 const fs = require("fs");
 const path = require("path");
-const { getType } = require("mime");
+const { MimeDetector } = require("./mime");
 
 function isTextType(filepath) {
-  if (!fs.existsSync(filepath)) return false;
-  // These are types of mime primary classes that for sure
-  // cannot also for forced into a text type.
-  const nonTextTypes = ["multipart", "image", "model", "audio", "video"];
-  // These are full-mimes we for sure cannot parse or interpret as text
-  // documents
-  const BAD_MIMES = [
-    "application/octet-stream",
-    "application/zip",
-    "application/pkcs8",
-    "application/vnd.microsoft.portable-executable",
-    "application/x-msdownload",
-  ];
-
   try {
-    const mime = getType(filepath);
-    if (BAD_MIMES.includes(mime)) return false;
+    if (!fs.existsSync(filepath)) return false;
+    const mimeLib = new MimeDetector();
+    const mime = mimeLib.getType(filepath);
+    if (mimeLib.badMimes.includes(mime)) return false;
 
     const type = mime.split("/")[0];
-    if (nonTextTypes.includes(type)) return false;
+    if (mimeLib.nonTextTypes.includes(type)) return false;
     return true;
   } catch {
     return false;
diff --git a/collector/utils/files/mime.js b/collector/utils/files/mime.js
new file mode 100644
index 000000000..feabd6209
--- /dev/null
+++ b/collector/utils/files/mime.js
@@ -0,0 +1,37 @@
+const MimeLib = require("mime");
+
+class MimeDetector { // Wraps the `mime` package and layers this project's extension overrides on top of it.
+  nonTextTypes = ["multipart", "image", "model", "audio", "video"]; // Top-level MIME classes; isTextType() returns false for any of these.
+  badMimes = [ // Full MIME strings; isTextType() returns false for any of these.
+    "application/octet-stream",
+    "application/zip",
+    "application/pkcs8",
+    "application/vnd.microsoft.portable-executable",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", // XLSX files are binary and need to be handled explicitly.
+    "application/x-msdownload",
+  ];
+  // Stores a reference to the required `mime` module and immediately applies the overrides to it.
+  constructor() {
+    this.lib = MimeLib;
+    this.setOverrides();
+  }
+  // Makes every extension listed below resolve to text/plain.
+  setOverrides() {
+    // The .ts extension maps to video/mp2t because of https://en.wikipedia.org/wiki/MPEG_transport_stream,
+    // which used this extension long before TypeScript existed, so the mapping has to be forcibly overridden.
+    this.lib.define(
+      {
+        "text/plain": ["ts", "py", "opts", "lock", "jsonl"],
+      },
+      true // NOTE(review): presumably the library's `force` flag (replace existing mappings) - confirm against the `mime` docs.
+    );
+  }
+  // Returns whatever the wrapped `mime` module reports for `filepath`.
+  getType(filepath) {
+    return this.lib.getType(filepath);
+  }
+}
+
+module.exports = {
+  MimeDetector,
+};
-- 
GitLab