Skip to content
Snippets Groups Projects
Unverified Commit 9a4df22c authored by Timothy Carambat's avatar Timothy Carambat Committed by GitHub
Browse files

autodetect parseable text file contents (#3079)

parent d1ca16f7
No related branches found
No related tags found
No related merge requests found
...@@ -2,16 +2,62 @@ const fs = require("fs"); ...@@ -2,16 +2,62 @@ const fs = require("fs");
const path = require("path"); const path = require("path");
const { MimeDetector } = require("./mime"); const { MimeDetector } = require("./mime");
/**
 * Decides whether a file should be handled as text.
 *
 * The MIME lookup is consulted first. Only when that lookup is inconclusive
 * (reason "generic") does the function fall back to sniffing the file's bytes,
 * so unknown-but-readable files are accepted without adding MIME overrides.
 * A definite negative from the MIME lookup is never overridden by the fallback.
 *
 * @param {string} filepath - The path to the file.
 * @returns {boolean} true if the file is text, false otherwise (including when
 * the file does not exist).
 */
function isTextType(filepath) {
  if (!fs.existsSync(filepath)) return false;

  const { valid, reason } = isKnownTextMime(filepath);
  if (valid) return true; // Recognised text MIME type.

  // "bad_mime" / "non_text_mime" are definite answers - do not second-guess them.
  if (reason !== "generic") return false;

  // MIME lookup could not decide - inspect the file contents instead.
  return parseableAsText(filepath);
}
/**
 * Classifies a file by the MIME type that MimeDetector derives from its path.
 * This function never throws; every failure is reported as reason "generic".
 *
 * Possible results:
 *  - { valid: true,  reason: "valid_mime" }    - MIME type is acceptable as text.
 *  - { valid: false, reason: "bad_mime" }      - MIME type is listed in `badMimes`.
 *  - { valid: false, reason: "non_text_mime" } - top-level type is listed in `nonTextTypes`.
 *  - { valid: false, reason: "generic" }       - MIME type unknown, or detection failed.
 *
 * @param {string} filepath - The path to the file.
 * @returns {{valid: boolean, reason: string}} The classification result.
 */
function isKnownTextMime(filepath) {
  try {
    const mimeLib = new MimeDetector();
    const mime = mimeLib.getType(filepath);

    // getType() returns null when no MIME type could be determined. Report that
    // explicitly instead of relying on `mime.split` throwing a TypeError.
    if (typeof mime !== "string") return { valid: false, reason: "generic" };

    if (mimeLib.badMimes.includes(mime))
      return { valid: false, reason: "bad_mime" };

    const [type] = mime.split("/");
    if (mimeLib.nonTextTypes.includes(type))
      return { valid: false, reason: "non_text_mime" };

    return { valid: true, reason: "valid_mime" };
  } catch {
    // Any unexpected detection failure is treated as "unknown".
    return { valid: false, reason: "generic" };
  }
}
/**
 * Heuristically checks whether a file looks like text by decoding its first
 * 1KB as utf8 and counting characters that rarely occur in text.
 *
 * The file is rejected when the number of suspicious characters reaches 10% of
 * the bytes read. Tab, line feed and carriage return are not counted. NUL
 * characters match both patterns below, so each one is counted twice.
 * An empty file (0 bytes read) returns false because the threshold is 0.
 *
 * @param {string} filepath - The path to the file.
 * @returns {boolean} true if the sample looks like text; false otherwise, or
 * when the file cannot be opened or read.
 */
function parseableAsText(filepath) {
  const SAMPLE_SIZE = 1024; // Only the first 1KB of the file is inspected.
  try {
    const fd = fs.openSync(filepath, "r");
    const buffer = Buffer.alloc(SAMPLE_SIZE);
    let bytesRead = 0;
    try {
      bytesRead = fs.readSync(fd, buffer, 0, SAMPLE_SIZE, 0);
    } finally {
      // Always release the descriptor, even when the read throws
      // (e.g. the path is a directory).
      fs.closeSync(fd);
    }

    const content = buffer.subarray(0, bytesRead).toString("utf8");
    const nullCount = (content.match(/\0/g) || []).length;
    const controlCount = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || [])
      .length;
    const threshold = bytesRead * 0.1;
    return nullCount + controlCount < threshold;
  } catch {
    return false;
  }
}
......
const MimeLib = require("mime"); const MimeLib = require("mime");
const path = require("path");
class MimeDetector { class MimeDetector {
nonTextTypes = ["multipart", "image", "model", "audio", "video"]; nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
badMimes = [ badMimes = [
"application/octet-stream", "application/octet-stream",
"application/zip", "application/zip",
...@@ -48,11 +47,6 @@ class MimeDetector { ...@@ -48,11 +47,6 @@ class MimeDetector {
); );
} }
// These are file types that are not detected by the mime library and need to be processed as text files.
// You should only add file types that are not detected by the mime library, are parsable as text, and are files
// with no extension. Otherwise, their extension should be added to the overrides array.
#specialTextFileTypes = ["dockerfile", "jenkinsfile", "dockerignore"];
/** /**
* Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file. * Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
* @param {string} filepath * @param {string} filepath
...@@ -61,12 +55,6 @@ class MimeDetector { ...@@ -61,12 +55,6 @@ class MimeDetector {
getType(filepath) { getType(filepath) {
const parsedMime = this.lib.getType(filepath); const parsedMime = this.lib.getType(filepath);
if (!!parsedMime) return parsedMime; if (!!parsedMime) return parsedMime;
// If the mime could not be parsed, it could be a special file type like Dockerfile or Jenkinsfile
// which we can reliably process as text files.
const baseName = path.basename(filepath)?.toLowerCase();
if (this.#specialTextFileTypes.includes(baseName)) return "text/plain";
return null; return null;
} }
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment