autodetect parseable text file contents (#3079)

9a4df22c · Timothy Carambat · GitHub · d1ca16f7 · 9a4df22c · 9a4df22c
Unverified Commit 9a4df22c authored 1 month ago by Timothy Carambat Committed by GitHub 1 month ago
--- a/collector/utils/files/index.js
+++ b/collector/utils/files/index.js
@@ -2,16 +2,62 @@ const fs = require("fs");
 const path = require("path");
 const { MimeDetector } = require("./mime");
+/**
+ * Checks if a file is text by checking the mime type and then falling back to buffer inspection.
+ * This way we can capture all the cases where the mime type is not known but still parseable as text
+ * without having to constantly add new mime type overrides.
+ * @param {string} filepath - The path to the file.
+ * @returns {boolean} - Returns true if the file is text, false otherwise.
+ */
 function isTextType(filepath) {
+  if (!fs.existsSync(filepath)) return false;
+  const result = isKnownTextMime(filepath);
+  if (result.valid) return true; // Known text type - return true.
+  if (result.reason !== "generic") return false; // If any other reason than generic - return false.
+  return parseableAsText(filepath); // Fallback to parsing as text via buffer inspection.
+}
+/**
+ * Checks if a file is known to be text by checking the mime type.
+ * @param {string} filepath - The path to the file.
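+ * @returns {{valid: boolean, reason: string}} - `valid` is true when the mime type is a known text type; `reason` is "valid_mime", "bad_mime", "non_text_mime", or "generic" (the mime lookup threw, e.g. unknown mime).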
+ */
+function isKnownTextMime(filepath) {
  try {
-    if (!fs.existsSync(filepath)) return false;
    const mimeLib = new MimeDetector();
    const mime = mimeLib.getType(filepath);
-    if (mimeLib.badMimes.includes(mime)) return false;
+    if (mimeLib.badMimes.includes(mime))
+      return { valid: false, reason: "bad_mime" };
    const type = mime.split("/")[0];
-    if (mimeLib.nonTextTypes.includes(type)) return false;
+    if (mimeLib.nonTextTypes.includes(type))
-    return true;
+      return { valid: false, reason: "non_text_mime" };
+    return { valid: true, reason: "valid_mime" };
+  } catch (e) {
+    return { valid: false, reason: "generic" };
+  }
+}
+/**
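+ * Checks if a file is parseable as text by decoding its first 1KB as utf8 and counting null characters (counted twice) and other control characters, excluding tab, LF and CR.
+ * If that count is 10% or more of the bytes read, or the file is empty or cannot be read, it will return false.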
+ * @param {string} filepath - The path to the file.
+ * @returns {boolean} - Returns true if the file is parseable as text, false otherwise.
+ */
+function parseableAsText(filepath) {
+  try {
+    const fd = fs.openSync(filepath, "r");
+    const buffer = Buffer.alloc(1024); // Read first 1KB of the file synchronously
+    const bytesRead = fs.readSync(fd, buffer, 0, 1024, 0);
+    fs.closeSync(fd);
+    const content = buffer.subarray(0, bytesRead).toString("utf8");
+    const nullCount = (content.match(/\0/g) || []).length;
+    const controlCount = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || [])
+      .length;
+    const threshold = bytesRead * 0.1;
+    return nullCount + controlCount < threshold;
  } catch {
    return false;
  }

--- a/collector/utils/files/mime.js
+++ b/collector/utils/files/mime.js
 const MimeLib = require("mime");
-const path = require("path");
 class MimeDetector {
-  nonTextTypes = ["multipart", "image", "model", "audio", "video"];
+  nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
  badMimes = [
    "application/octet-stream",
    "application/zip",
@@ -48,11 +47,6 @@ class MimeDetector {
    );
  }
-  // These are file types that are not detected by the mime library and need to be processed as text files.
-  // You should only add file types that are not detected by the mime library, are parsable as text, and are files
-  // with no extension. Otherwise, their extension should be added to the overrides array.
-  #specialTextFileTypes = ["dockerfile", "jenkinsfile", "dockerignore"];
  /**
   * Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
   * @param {string} filepath
@@ -61,12 +55,6 @@ class MimeDetector {
  getType(filepath) {
    const parsedMime = this.lib.getType(filepath);
    if (!!parsedMime) return parsedMime;
-    // If the mime could not be parsed, it could be a special file type like Dockerfile or Jenkinsfile
-    // which we can reliably process as text files.
-    const baseName = path.basename(filepath)?.toLowerCase();
-    if (this.#specialTextFileTypes.includes(baseName)) return "text/plain";
    return null;
  }
 }