Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Anything Llm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
mirrored_repos
MachineLearning
Mintplex Labs
Anything Llm
Commits
9a4df22c
Unverified
Commit
9a4df22c
authored
1 month ago
by
Timothy Carambat
Committed by
GitHub
1 month ago
Browse files
Options
Downloads
Patches
Plain Diff
autodetect parseable text file contents (#3079)
parent
d1ca16f7
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
collector/utils/files/index.js
+50
-4
50 additions, 4 deletions
collector/utils/files/index.js
collector/utils/files/mime.js
+1
-13
1 addition, 13 deletions
collector/utils/files/mime.js
with
51 additions
and
17 deletions
collector/utils/files/index.js
+
50
−
4
View file @
9a4df22c
...
@@ -2,16 +2,62 @@ const fs = require("fs");
...
@@ -2,16 +2,62 @@ const fs = require("fs");
const
path
=
require
(
"
path
"
);
const
path
=
require
(
"
path
"
);
const
{
MimeDetector
}
=
require
(
"
./mime
"
);
const
{
MimeDetector
}
=
require
(
"
./mime
"
);
/**
* Checks if a file is text by checking the mime type and then falling back to buffer inspection.
* This way we can capture all the cases where the mime type is not known but still parseable as text
* without having to constantly add new mime type overrides.
* @param {string} filepath - The path to the file.
* @returns {boolean} - Returns true if the file is text, false otherwise.
*/
function
isTextType
(
filepath
)
{
function
isTextType
(
filepath
)
{
if
(
!
fs
.
existsSync
(
filepath
))
return
false
;
const
result
=
isKnownTextMime
(
filepath
);
if
(
result
.
valid
)
return
true
;
// Known text type - return true.
if
(
result
.
reason
!==
"
generic
"
)
return
false
;
// If any other reason than generic - return false.
return
parseableAsText
(
filepath
);
// Fallback to parsing as text via buffer inspection.
}
/**
* Checks if a file is known to be text by checking the mime type.
* @param {string} filepath - The path to the file.
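* @returns {{valid: boolean, reason: string}} - Returns an object whose `valid` flag is true if the file is known to be text; `reason` is one of "valid_mime", "bad_mime", "non_text_mime", or "generic" (when mime detection throws).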
*/
function
isKnownTextMime
(
filepath
)
{
try
{
try
{
if
(
!
fs
.
existsSync
(
filepath
))
return
false
;
const
mimeLib
=
new
MimeDetector
();
const
mimeLib
=
new
MimeDetector
();
const
mime
=
mimeLib
.
getType
(
filepath
);
const
mime
=
mimeLib
.
getType
(
filepath
);
if
(
mimeLib
.
badMimes
.
includes
(
mime
))
return
false
;
if
(
mimeLib
.
badMimes
.
includes
(
mime
))
return
{
valid
:
false
,
reason
:
"
bad_mime
"
};
const
type
=
mime
.
split
(
"
/
"
)[
0
];
const
type
=
mime
.
split
(
"
/
"
)[
0
];
if
(
mimeLib
.
nonTextTypes
.
includes
(
type
))
return
false
;
if
(
mimeLib
.
nonTextTypes
.
includes
(
type
))
return
true
;
return
{
valid
:
false
,
reason
:
"
non_text_mime
"
};
return
{
valid
:
true
,
reason
:
"
valid_mime
"
};
}
catch
(
e
)
{
return
{
valid
:
false
,
reason
:
"
generic
"
};
}
}
/**
* Checks if a file is parseable as text by forcing it to be read as text in utf8 encoding.
* If the file looks too much like a binary file, it will return false.
* @param {string} filepath - The path to the file.
* @returns {boolean} - Returns true if the file is parseable as text, false otherwise.
*/
function
parseableAsText
(
filepath
)
{
try
{
const
fd
=
fs
.
openSync
(
filepath
,
"
r
"
);
const
buffer
=
Buffer
.
alloc
(
1024
);
// Read first 1KB of the file synchronously
const
bytesRead
=
fs
.
readSync
(
fd
,
buffer
,
0
,
1024
,
0
);
fs
.
closeSync
(
fd
);
const
content
=
buffer
.
subarray
(
0
,
bytesRead
).
toString
(
"
utf8
"
);
const
nullCount
=
(
content
.
match
(
/
\0
/g
)
||
[]).
length
;
const
controlCount
=
(
content
.
match
(
/
[\x
00-
\x
08
\x
0B
\x
0C
\x
0E-
\x
1F
]
/g
)
||
[])
.
length
;
const
threshold
=
bytesRead
*
0.1
;
return
nullCount
+
controlCount
<
threshold
;
}
catch
{
}
catch
{
return
false
;
return
false
;
}
}
...
...
This diff is collapsed.
Click to expand it.
collector/utils/files/mime.js
+
1
−
13
View file @
9a4df22c
const
MimeLib
=
require
(
"
mime
"
);
const
MimeLib
=
require
(
"
mime
"
);
const
path
=
require
(
"
path
"
);
class
MimeDetector
{
class
MimeDetector
{
nonTextTypes
=
[
"
multipart
"
,
"
image
"
,
"
model
"
,
"
audio
"
,
"
video
"
];
nonTextTypes
=
[
"
multipart
"
,
"
image
"
,
"
model
"
,
"
audio
"
,
"
video
"
,
"
font
"
];
badMimes
=
[
badMimes
=
[
"
application/octet-stream
"
,
"
application/octet-stream
"
,
"
application/zip
"
,
"
application/zip
"
,
...
@@ -48,11 +47,6 @@ class MimeDetector {
...
@@ -48,11 +47,6 @@ class MimeDetector {
);
);
}
}
// These are file types that are not detected by the mime library and need to be processed as text files.
// You should only add file types that are not detected by the mime library, are parsable as text, and are files
// with no extension. Otherwise, their extension should be added to the overrides array.
#specialTextFileTypes
=
[
"
dockerfile
"
,
"
jenkinsfile
"
,
"
dockerignore
"
];
/**
/**
* Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
* Returns the MIME type of the file. If the file has no extension found, it will be processed as a text file.
* @param {string} filepath
* @param {string} filepath
...
@@ -61,12 +55,6 @@ class MimeDetector {
...
@@ -61,12 +55,6 @@ class MimeDetector {
getType
(
filepath
)
{
getType
(
filepath
)
{
const
parsedMime
=
this
.
lib
.
getType
(
filepath
);
const
parsedMime
=
this
.
lib
.
getType
(
filepath
);
if
(
!!
parsedMime
)
return
parsedMime
;
if
(
!!
parsedMime
)
return
parsedMime
;
// If the mime could not be parsed, it could be a special file type like Dockerfile or Jenkinsfile
// which we can reliably process as text files.
const
baseName
=
path
.
basename
(
filepath
)?.
toLowerCase
();
if
(
this
.
#specialTextFileTypes
.
includes
(
baseName
))
return
"
text/plain
"
;
return
null
;
return
null
;
}
}
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment