From b44889a843ed49f33d28fabaefaa2ace8d968c10 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Tue, 24 Sep 2024 15:55:54 -0700
Subject: [PATCH] PR#2355 Continued + expanded scope (#2365)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* #2317 Fetch pinned documents once per folder to reduce the number of queries.

* Reorder the lines to keep const declarations together.

* Add some comments to functions
move the per-folder pinned-document fetch into its own function
move the per-folder watched-document fetch into its own function as well
remove an unused function from the documents model

---------

Co-authored-by: Błażej Owczarczyk <blazeyy@gmail.com>
---
 server/models/documentSyncQueue.js |  7 +++
 server/models/documents.js         | 19 +-----
 server/utils/files/index.js        | 95 +++++++++++++++++++++++++-----
 3 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/server/models/documentSyncQueue.js b/server/models/documentSyncQueue.js
index b034643ca..0ebaa0529 100644
--- a/server/models/documentSyncQueue.js
+++ b/server/models/documentSyncQueue.js
@@ -38,6 +38,13 @@ const DocumentSyncQueue = {
     return new Date(Number(new Date()) + queueRecord.staleAfterMs);
   },
 
+  /**
+   * Check if the document can be watched based on the metadata fields
+   * @param {object} metadata - metadata to check
+   * @param {string} metadata.title - title of the document
+   * @param {string} metadata.chunkSource - chunk source of the document
+   * @returns {boolean} - true if the document can be watched, false otherwise
+   */
   canWatch: function ({ title, chunkSource = null } = {}) {
     if (chunkSource.startsWith("link://") && title.endsWith(".html"))
       return true; // If is web-link material (prior to feature most chunkSources were links://)
diff --git a/server/models/documents.js b/server/models/documents.js
index 43ec5f9f4..81c2dd9a7 100644
--- a/server/models/documents.js
+++ b/server/models/documents.js
@@ -57,26 +57,12 @@ const Document = {
     }
   },
 
-  getOnlyWorkspaceIds: async function (clause = {}) {
-    try {
-      const workspaceIds = await prisma.workspace_documents.findMany({
-        where: clause,
-        select: {
-          workspaceId: true,
-        },
-      });
-      return workspaceIds.map((record) => record.workspaceId) || [];
-    } catch (error) {
-      console.error(error.message);
-      return [];
-    }
-  },
-
   where: async function (
     clause = {},
     limit = null,
     orderBy = null,
-    include = null
+    include = null,
+    select = null
   ) {
     try {
       const results = await prisma.workspace_documents.findMany({
@@ -84,6 +70,7 @@ const Document = {
         ...(limit !== null ? { take: limit } : {}),
         ...(orderBy !== null ? { orderBy } : {}),
         ...(include !== null ? { include } : {}),
+        ...(select !== null ? { select: { ...select } } : {}),
       });
       return results;
     } catch (error) {
diff --git a/server/utils/files/index.js b/server/utils/files/index.js
index 58bdf807a..598884f99 100644
--- a/server/utils/files/index.js
+++ b/server/utils/files/index.js
@@ -44,6 +44,7 @@ async function viewLocalFiles() {
         items: [],
       };
       const subfiles = fs.readdirSync(folderPath);
+      const filenames = {};
 
       for (const subfile of subfiles) {
         if (path.extname(subfile) !== ".json") continue;
@@ -51,30 +52,32 @@ async function viewLocalFiles() {
         const rawData = fs.readFileSync(filePath, "utf8");
         const cachefilename = `${file}/${subfile}`;
         const { pageContent, ...metadata } = JSON.parse(rawData);
-        const pinnedInWorkspaces = await Document.getOnlyWorkspaceIds({
-          docpath: cachefilename,
-          pinned: true,
-        });
-        const watchedInWorkspaces = liveSyncAvailable
-          ? await Document.getOnlyWorkspaceIds({
-              docpath: cachefilename,
-              watched: true,
-            })
-          : [];
-
         subdocs.items.push({
           name: subfile,
           type: "file",
           ...metadata,
           cached: await cachedVectorInformation(cachefilename, true),
-          pinnedWorkspaces: pinnedInWorkspaces,
           canWatch: liveSyncAvailable
             ? DocumentSyncQueue.canWatch(metadata)
             : false,
-          // Is file watched in any workspace since sync updates all workspaces where file is referenced
-          watched: watchedInWorkspaces.length !== 0,
+          // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
+          // watched: false, // boolean to indicate if this document is watched in ANY workspace
         });
+        filenames[cachefilename] = subfile;
       }
+
+      // Grab the pinned workspaces and watched documents for this folder's documents
+      // at the time of the query so we don't have to re-query the database for each file
+      const pinnedWorkspacesByDocument =
+        await getPinnedWorkspacesByDocument(filenames);
+      const watchedDocumentsFilenames =
+        await getWatchedDocumentFilenames(filenames);
+      for (const item of subdocs.items) {
+        item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
+        item.watched =
+          watchedDocumentsFilenames.hasOwnProperty(item.name) || false;
+      }
+
       directory.items.push(subdocs);
     }
   }
@@ -88,8 +91,13 @@ async function viewLocalFiles() {
   return directory;
 }
 
-// Searches the vector-cache folder for existing information so we dont have to re-embed a
-// document and can instead push directly to vector db.
+/**
+ * Searches the vector-cache folder for existing information so we don't have to re-embed a
+ * document and can instead push directly to vector db.
+ * @param {string} filename - the filename to check for cached vector information
+ * @param {boolean} checkOnly - if true, only check if the file exists, do not return the cached data
+ * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} - a boolean when checkOnly is true, otherwise an object containing the existence of the file and its cached chunks
+ */
 async function cachedVectorInformation(filename = null, checkOnly = false) {
   if (!filename) return checkOnly ? false : { exists: false, chunks: [] };
 
@@ -218,6 +226,61 @@ function hasVectorCachedFiles() {
   return false;
 }
 
+/** Fetch, with a single Document.where call, the pinned rows for the given documents and group their workspaceIds by filename.
+ * @param {Record<string, string>} filenames - map of docpath -> filename (its keys are used as the docpath filter); defaults to []
+ * @returns {Promise<Record<string, Array<*>>>} - record keyed by filename; each value is the de-duplicated list of workspaceIds with that document pinned
+ */
+async function getPinnedWorkspacesByDocument(filenames = []) {
+  return (
+    await Document.where(
+      {
+        docpath: {
+          in: Object.keys(filenames), // keys of the map are the docpaths to match
+        },
+        pinned: true,
+      },
+      null, // limit
+      null, // orderBy
+      null, // include
+      { // select: only the two columns the reducer below reads
+        workspaceId: true,
+        docpath: true,
+      }
+    )
+  ).reduce((result, { workspaceId, docpath }) => {
+    const filename = filenames[docpath]; // translate the row's docpath back to its filename
+    if (!result[filename]) result[filename] = [];
+    if (!result[filename].includes(workspaceId)) // skip workspaceIds already recorded for this file
+      result[filename].push(workspaceId);
+    return result;
+  }, {});
+}
+
+/**
+ * Build a lookup, via a single Document.where call, of the given documents that have at least one row with watched: true.
+ * Each matching filename becomes a key; its value is a single workspaceId (a later row for the same file overwrites an earlier one).
+ * @param {Record<string, string>} filenames - map of docpath -> filename (its keys are used as the docpath filter); defaults to []
+ * @returns {Promise<Record<string, *>>} - record keyed by filename whose value is one workspaceId, not an array
+ */
+async function getWatchedDocumentFilenames(filenames = []) {
+  return (
+    await Document.where(
+      {
+        docpath: { in: Object.keys(filenames) }, // keys of the map are the docpaths to match
+        watched: true,
+      },
+      null, // limit
+      null, // orderBy
+      null, // include
+      { workspaceId: true, docpath: true } // select: only the columns the reducer reads
+    )
+  ).reduce((result, { workspaceId, docpath }) => {
+    const filename = filenames[docpath]; // translate the row's docpath back to its filename
+    result[filename] = workspaceId; // last row wins; no per-file list is accumulated
+    return result;
+  }, {});
+}
+
 module.exports = {
   findDocumentInDocuments,
   cachedVectorInformation,
-- 
GitLab