diff --git a/server/utils/files/index.js b/server/utils/files/index.js index e713a318ad0a03a7c6bbdf4c83200a85282bfc15..dff5bef9c2a0719810bb6c453078111ce055472d 100644 --- a/server/utils/files/index.js +++ b/server/utils/files/index.js @@ -1,37 +1,29 @@ const fs = require("fs"); const path = require("path"); const { v5: uuidv5 } = require("uuid"); +const documentsPath = + process.env.NODE_ENV === "development" + ? path.resolve(__dirname, `../../storage/documents`) + : path.resolve(process.env.STORAGE_DIR, `documents`); +const vectorCachePath = + process.env.NODE_ENV === "development" + ? path.resolve(__dirname, `../../storage/vector-cache`) + : path.resolve(process.env.STORAGE_DIR, `vector-cache`); // Should take in a folder that is a subfolder of documents // eg: youtube-subject/video-123.json async function fileData(filePath = null) { if (!filePath) throw new Error("No docPath provided in request"); + const fullFilePath = path.resolve(documentsPath, normalizePath(filePath)); + if (!fs.existsSync(fullFilePath) || !isWithin(documentsPath, fullFilePath)) + return null; - const fullPath = - process.env.NODE_ENV === "development" - ? path.resolve( - __dirname, - `../../storage/documents/${normalizePath(filePath)}` - ) - : path.resolve( - process.env.STORAGE_DIR, - `documents/${normalizePath(filePath)}` - ); - - const fileExists = fs.existsSync(fullPath); - if (!fileExists) return null; - - const data = fs.readFileSync(fullPath, "utf8"); + const data = fs.readFileSync(fullFilePath, "utf8"); return JSON.parse(data); } async function viewLocalFiles() { - const folder = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../storage/documents`) - : path.resolve(process.env.STORAGE_DIR, `documents`); - const dirExists = fs.existsSync(folder); - if (!dirExists) fs.mkdirSync(folder); + if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath); const directory = { name: "documents", @@ -39,14 +31,9 @@ async function viewLocalFiles() { items: [], }; - for (const file of fs.readdirSync(folder)) { + for (const file of fs.readdirSync(documentsPath)) { if (path.extname(file) === ".md") continue; - - const folderPath = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../storage/documents/${file}`) - : path.resolve(process.env.STORAGE_DIR, `documents/${file}`); - + const folderPath = path.resolve(documentsPath, file); const isFolder = fs.lstatSync(folderPath).isDirectory(); if (isFolder) { const subdocs = { @@ -83,10 +70,7 @@ async function cachedVectorInformation(filename = null, checkOnly = false) { if (!filename) return checkOnly ? false : { exists: false, chunks: [] }; const digest = uuidv5(filename, uuidv5.URL); - const file = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../storage/vector-cache/${digest}.json`) - : path.resolve(process.env.STORAGE_DIR, `vector-cache/${digest}.json`); + const file = path.resolve(vectorCachePath, `${digest}.json`); const exists = fs.existsSync(file); if (checkOnly) return exists; @@ -106,15 +90,10 @@ async function storeVectorResult(vectorData = [], filename = null) { console.log( `Caching vectorized results of ${filename} to prevent duplicated embedding.` ); - const folder = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../storage/vector-cache`) - : path.resolve(process.env.STORAGE_DIR, `vector-cache`); - - if (!fs.existsSync(folder)) fs.mkdirSync(folder); + if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath); const digest = uuidv5(filename, uuidv5.URL); - const writeTo = path.resolve(folder, `${digest}.json`); + const writeTo = path.resolve(vectorCachePath, `${digest}.json`); fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8"); return; } @@ -122,21 +101,16 @@ async function storeVectorResult(vectorData = [], filename = null) { // Purges a file from the documents/ folder. async function purgeSourceDocument(filename = null) { if (!filename) return; + const filePath = path.resolve(documentsPath, normalizePath(filename)); + + if ( + !fs.existsSync(filePath) || + !isWithin(documentsPath, filePath) || + !fs.lstatSync(filePath).isFile() + ) + return; + console.log(`Purging source document of ${filename}.`); - const filePath = - process.env.NODE_ENV === "development" - ? path.resolve( - __dirname, - `../../storage/documents`, - normalizePath(filename) - ) - : path.resolve( - process.env.STORAGE_DIR, - `documents`, - normalizePath(filename) - ); - - if (!fs.existsSync(filePath)) return; fs.rmSync(filePath); return; } @@ -144,15 +118,11 @@ async function purgeSourceDocument(filename = null) { // Purges a vector-cache file from the vector-cache/ folder. async function purgeVectorCache(filename = null) { if (!filename) return; - console.log(`Purging vector-cache of ${filename}.`); - const digest = uuidv5(filename, uuidv5.URL); - const filePath = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../storage/vector-cache`, `${digest}.json`) - : path.resolve(process.env.STORAGE_DIR, `vector-cache`, `${digest}.json`); + const filePath = path.resolve(vectorCachePath, `${digest}.json`); - if (!fs.existsSync(filePath)) return; + if (!fs.existsSync(filePath) || !fs.lstatSync(filePath).isFile()) return; + console.log(`Purging vector-cache of ${filename}.`); fs.rmSync(filePath); return; } @@ -161,24 +131,20 @@ async function purgeVectorCache(filename = null) { // folder via iteration of all folders and checking if the expected file exists. async function findDocumentInDocuments(documentName = null) { if (!documentName) return null; - const documentsFolder = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../storage/documents`) - : path.resolve(process.env.STORAGE_DIR, `documents`); - - for (const folder of fs.readdirSync(documentsFolder)) { + for (const folder of fs.readdirSync(documentsPath)) { const isFolder = fs - .lstatSync(path.join(documentsFolder, folder)) + .lstatSync(path.join(documentsPath, folder)) .isDirectory(); if (!isFolder) continue; const targetFilename = normalizePath(documentName); - const targetFileLocation = path.join( - documentsFolder, - folder, - targetFilename - ); - if (!fs.existsSync(targetFileLocation)) continue; + const targetFileLocation = path.join(documentsPath, folder, targetFilename); + + if ( + !fs.existsSync(targetFileLocation) || + !isWithin(documentsPath, targetFileLocation) + ) + continue; const fileData = fs.readFileSync(targetFileLocation, "utf8"); const cachefilename = `${folder}/${targetFilename}`; @@ -194,8 +160,25 @@ async function findDocumentInDocuments(documentName = null) { return null; } +/** + * Checks if a given path is within another path. + * @param {string} outer - The outer path (should be resolved). + * @param {string} inner - The inner path (should be resolved). + * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise. + */ +function isWithin(outer, inner) { + if (outer === inner) return false; + const rel = path.relative(outer, inner); + return !rel.startsWith("../") && rel !== ".."; +} + function normalizePath(filepath = "") { - return path.normalize(filepath).replace(/^(\.\.(\/|\\|$))+/, ""); + const result = path + .normalize(filepath.trim()) + .replace(/^(\.\.(\/|\\|$))+/, "") + .trim(); + if (["..", ".", "/"].includes(result)) throw new Error("Invalid path."); + return result; } module.exports = { @@ -207,4 +190,6 @@ module.exports = { storeVectorResult, fileData, normalizePath, + isWithin, + documentsPath, }; diff --git a/server/utils/files/purgeDocument.js b/server/utils/files/purgeDocument.js index 46e9d37dada882d6a7e8c2506c1defe02b3a6bb7..7f32bd427c3331b2f6a75063fcf3d06c8aa11159 100644 --- a/server/utils/files/purgeDocument.js +++ b/server/utils/files/purgeDocument.js @@ -1,30 +1,53 @@ const fs = require("fs"); const path = require("path"); -const { purgeVectorCache, purgeSourceDocument, normalizePath } = require("."); +const { + purgeVectorCache, + purgeSourceDocument, + normalizePath, + isWithin, + documentsPath, +} = require("."); const { Document } = require("../../models/documents"); const { Workspace } = require("../../models/workspace"); -async function purgeDocument(filename) { +async function purgeDocument(filename = null) { + if (!filename || !normalizePath(filename)) return; + + await purgeVectorCache(filename); + await purgeSourceDocument(filename); const workspaces = await Workspace.where(); for (const workspace of workspaces) { await Document.removeDocuments(workspace, [filename]); } - await purgeVectorCache(filename); - await purgeSourceDocument(filename); return; } -async function purgeFolder(folderName) { - if (folderName === "custom-documents") return; - const documentsFolder = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../storage/documents`) - : path.resolve(process.env.STORAGE_DIR, `documents`); +async function purgeFolder(folderName = null) { + if (!folderName) return; + const subFolder = normalizePath(folderName); + const subFolderPath = path.resolve(documentsPath, subFolder); + const validRemovableSubFolders = fs + .readdirSync(documentsPath) + .map((folder) => { + // Filter out any results which are not folders or + // are the protected custom-documents folder. + if (folder === "custom-documents") return null; + const subfolderPath = path.resolve(documentsPath, folder); + if (!fs.lstatSync(subfolderPath).isDirectory()) return null; + return folder; + }) + .filter((subFolder) => !!subFolder); + + if ( + !validRemovableSubFolders.includes(subFolder) || + !fs.existsSync(subFolderPath) || + !isWithin(documentsPath, subFolderPath) + ) + return; - const folderPath = path.resolve(documentsFolder, normalizePath(folderName)); const filenames = fs - .readdirSync(folderPath) - .map((file) => path.join(folderPath, file)); + .readdirSync(subFolderPath) + .map((file) => path.join(subFolderPath, file)); const workspaces = await Workspace.where(); const purgePromises = []; @@ -47,7 +70,7 @@ async function purgeFolder(folderName) { } await Promise.all(purgePromises.flat().map((f) => f())); - fs.rmSync(folderPath, { recursive: true }); // Delete root document and source files. + fs.rmSync(subFolderPath, { recursive: true }); // Delete target document-folder and source files. return; }