From ec90060d3698c8f471df0c8877c7d425f1cf6036 Mon Sep 17 00:00:00 2001 From: Timothy Carambat <rambat1010@gmail.com> Date: Thu, 29 Feb 2024 10:05:03 -0800 Subject: [PATCH] Re-map some file mimes to support text (#842) re-map some file mimes to support text --- collector/utils/files/index.js | 24 ++++++---------------- collector/utils/files/mime.js | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 18 deletions(-) create mode 100644 collector/utils/files/mime.js diff --git a/collector/utils/files/index.js b/collector/utils/files/index.js index 3e6ce3445..4bca62f9f 100644 --- a/collector/utils/files/index.js +++ b/collector/utils/files/index.js @@ -1,28 +1,16 @@ const fs = require("fs"); const path = require("path"); -const { getType } = require("mime"); +const { MimeDetector } = require("./mime"); function isTextType(filepath) { - if (!fs.existsSync(filepath)) return false; - // These are types of mime primary classes that for sure - // cannot also for forced into a text type. - const nonTextTypes = ["multipart", "image", "model", "audio", "video"]; - // These are full-mimes we for sure cannot parse or interpret as text - // documents - const BAD_MIMES = [ - "application/octet-stream", - "application/zip", - "application/pkcs8", - "application/vnd.microsoft.portable-executable", - "application/x-msdownload", - ]; - try { - const mime = getType(filepath); - if (BAD_MIMES.includes(mime)) return false; + if (!fs.existsSync(filepath)) return false; + const mimeLib = new MimeDetector(); + const mime = mimeLib.getType(filepath); + if (mimeLib.badMimes.includes(mime)) return false; const type = mime.split("/")[0]; - if (nonTextTypes.includes(type)) return false; + if (mimeLib.nonTextTypes.includes(type)) return false; return true; } catch { return false; diff --git a/collector/utils/files/mime.js b/collector/utils/files/mime.js new file mode 100644 index 000000000..feabd6209 --- /dev/null +++ b/collector/utils/files/mime.js @@ -0,0 +1,37 @@ +const MimeLib = require("mime"); + +class MimeDetector { + nonTextTypes = ["multipart", "image", "model", "audio", "video"]; + badMimes = [ + "application/octet-stream", + "application/zip", + "application/pkcs8", + "application/vnd.microsoft.portable-executable", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", // XLSX are binaries and need to be handled explicitly. + "application/x-msdownload", + ]; + + constructor() { + this.lib = MimeLib; + this.setOverrides(); + } + + setOverrides() { + // the .ts extension maps to video/mp2t because of https://en.wikipedia.org/wiki/MPEG_transport_stream + // which has had this extension far before TS was invented. So need to force re-map this MIME map. + this.lib.define( + { + "text/plain": ["ts", "py", "opts", "lock", "jsonl"], + }, + true + ); + } + + getType(filepath) { + return this.lib.getType(filepath); + } +} + +module.exports = { + MimeDetector, +}; -- GitLab