From a8ec0d9584899cd855d38188f7a62b6924a5e9ed Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Thu, 26 Oct 2023 10:57:37 -0700
Subject: [PATCH] Compensate for upper OpenAI embedding chunk size limit (#292)

The limit is due to the POST body max size; sufficiently large requests will abort automatically.
We should report that error back on the frontend during embedding.
Update the vector DB providers to throw on failure so the error propagates to the endpoint. A sketch of the batching helper this patch relies on is included below.
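
For reference, the new embedChunks batching relies on the toChunks helper
required from server/utils/helpers. A minimal sketch of the assumed shape
(the real helper may differ slightly):

    // Split `arr` into consecutive slices of at most `size` elements.
    function toChunks(arr, size) {
      return Array.from({ length: Math.ceil(arr.length / size) }, (_, i) =>
        arr.slice(i * size, i * size + size)
      );
    }

With embeddingChunkLimit = 1000, 2500 text chunks become three batches of
1000, 1000, and 500 inputs; each batch is sent as its own createEmbedding
request and the results are flattened back in order.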
---
 .../Modals/MangeWorkspace/Documents/index.jsx | 12 ++---
 server/endpoints/workspaces.js                | 13 ++++-
 server/models/documents.js                    |  8 +++-
 server/utils/AiProviders/openAi/index.js      | 54 +++++++++++++++++--
 .../utils/vectorDbProviders/chroma/index.js   |  4 +-
 server/utils/vectorDbProviders/lance/index.js |  4 +-
 .../utils/vectorDbProviders/pinecone/index.js |  4 +-
 .../utils/vectorDbProviders/qdrant/index.js   |  4 +-
 .../utils/vectorDbProviders/weaviate/index.js |  4 +-
 9 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx
index 5e9942060..7485d0a91 100644
--- a/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/Documents/index.jsx
@@ -91,13 +91,13 @@ export default function DocumentSettings({ workspace, fileTypes }) {
     setHighlightWorkspace(false);
     await Workspace.modifyEmbeddings(workspace.slug, changesToSend)
       .then((res) => {
-        if (res && res.workspace) {
-          showToast("Workspace updated successfully.", "success", {
-            clear: true,
-          });
-        } else {
-          showToast("Workspace update failed.", "error", { clear: true });
+        if (!res || !!res.message) {
+          showToast(`Error: ${res?.message || "unknown error"}`, "error", { clear: true });
+          return;
         }
+        showToast("Workspace updated successfully.", "success", {
+          clear: true,
+        });
       })
       .catch((error) => {
         showToast(`Workspace update failed: ${error}`, "error", {
diff --git a/server/endpoints/workspaces.js b/server/endpoints/workspaces.js
index ea2be211d..57e37e654 100644
--- a/server/endpoints/workspaces.js
+++ b/server/endpoints/workspaces.js
@@ -114,9 +114,18 @@ function workspaceEndpoints(app) {
         }
 
         await Document.removeDocuments(currWorkspace, deletes);
-        await Document.addDocuments(currWorkspace, adds);
+        const { failed = [] } = await Document.addDocuments(
+          currWorkspace,
+          adds
+        );
         const updatedWorkspace = await Workspace.get({ id: currWorkspace.id });
-        response.status(200).json({ workspace: updatedWorkspace });
+        response.status(200).json({
+          workspace: updatedWorkspace,
+          message:
+            failed.length > 0
+              ? `${failed.length} documents could not be embedded.`
+              : null,
+        });
       } catch (e) {
         console.log(e.message, e);
         response.sendStatus(500).end();
diff --git a/server/models/documents.js b/server/models/documents.js
index 58189a66c..ab8d4a098 100644
--- a/server/models/documents.js
+++ b/server/models/documents.js
@@ -37,6 +37,8 @@ const Document = {
   addDocuments: async function (workspace, additions = []) {
     const VectorDb = getVectorDbClass();
-    if (additions.length === 0) return;
+    if (additions.length === 0) return { failed: [], embedded: [] };
+    const embedded = [];
+    const failedToEmbed = [];
 
     for (const path of additions) {
       const data = await fileData(path);
@@ -58,11 +60,13 @@ const Document = {
       );
       if (!vectorized) {
         console.error("Failed to vectorize", path);
+        failedToEmbed.push(path);
         continue;
       }
 
       try {
         await prisma.workspace_documents.create({ data: newDoc });
+        embedded.push(path);
       } catch (error) {
         console.error(error.message);
       }
@@ -72,7 +76,7 @@ const Document = {
       LLMSelection: process.env.LLM_PROVIDER || "openai",
       VectorDbSelection: process.env.VECTOR_DB || "pinecone",
     });
-    return;
+    return { failed: failedToEmbed, embedded };
   },
 
   removeDocuments: async function (workspace, removals = []) {
diff --git a/server/utils/AiProviders/openAi/index.js b/server/utils/AiProviders/openAi/index.js
index bacc56da0..dc7e47c6d 100644
--- a/server/utils/AiProviders/openAi/index.js
+++ b/server/utils/AiProviders/openAi/index.js
@@ -1,3 +1,5 @@
+const { toChunks } = require("../../helpers");
+
 class OpenAi {
   constructor() {
     const { Configuration, OpenAIApi } = require("openai");
@@ -6,6 +8,9 @@ class OpenAi {
     });
     const openai = new OpenAIApi(config);
     this.openai = openai;
+
+    // Arbitrary limit to ensure we stay within reasonable POST request size.
+    this.embeddingChunkLimit = 1_000;
   }
 
   isValidChatModel(modelName = "") {
@@ -99,13 +104,52 @@ class OpenAi {
   }
 
   async embedChunks(textChunks = []) {
-    const {
-      data: { data },
-    } = await this.openai.createEmbedding({
-      model: "text-embedding-ada-002",
-      input: textChunks,
+    // Because there is a hard POST body limit (~8MB) on how many chunks can be sent at once to OpenAI,
+    // we concurrently execute each maximal batch of text chunks possible.
+    // Refer to constructor embeddingChunkLimit for more info.
+    const embeddingRequests = [];
+    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+      embeddingRequests.push(
+        new Promise((resolve) => {
+          this.openai
+            .createEmbedding({
+              model: "text-embedding-ada-002",
+              input: chunk,
+            })
+            .then((res) => {
+              resolve({ data: res.data?.data, error: null });
+            })
+            .catch((e) => {
+              resolve({ data: [], error: e?.error });
+            });
+        })
+      );
+    }
+
+    const { data = [], error = null } = await Promise.all(
+      embeddingRequests
+    ).then((results) => {
+      // If any errors were returned from OpenAI abort the entire sequence because the embeddings
+      // will be incomplete.
+      const errors = results
+        .filter((res) => !!res.error)
+        .map((res) => res.error)
+        .flat();
+      if (errors.length > 0) {
+        return {
+          data: [],
+          error: `(${errors.length}) Embedding Errors! ${errors
+            .map((error) => `[${error.type}]: ${error.message}`)
+            .join(", ")}`,
+        };
+      }
+      return {
+        data: results.map((res) => res?.data || []).flat(),
+        error: null,
+      };
     });
 
+    if (!!error) throw new Error(`OpenAI Failed to embed: ${error}`);
     return data.length > 0 &&
       data.every((embd) => embd.hasOwnProperty("embedding"))
       ? data.map((embd) => embd.embedding)
diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js
index e0f36f380..aeaab47ae 100644
--- a/server/utils/vectorDbProviders/chroma/index.js
+++ b/server/utils/vectorDbProviders/chroma/index.js
@@ -195,8 +195,8 @@ const Chroma = {
           documentVectors.push({ docId, vectorId: vectorRecord.id });
         }
       } else {
-        console.error(
-          "Could not use OpenAI to embed document chunks! This document will not be recorded."
+        throw new Error(
+          "Could not embed document chunks! This document will not be recorded."
         );
       }
 
diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js
index 77bd95f96..22d5730d0 100644
--- a/server/utils/vectorDbProviders/lance/index.js
+++ b/server/utils/vectorDbProviders/lance/index.js
@@ -195,8 +195,8 @@ const LanceDb = {
           documentVectors.push({ docId, vectorId: vectorRecord.id });
         }
       } else {
-        console.error(
-          "Could not use OpenAI to embed document chunks! This document will not be recorded."
+        throw new Error(
+          "Could not embed document chunks! This document will not be recorded."
         );
       }
 
diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js
index 7da4135ba..bcdf07b5f 100644
--- a/server/utils/vectorDbProviders/pinecone/index.js
+++ b/server/utils/vectorDbProviders/pinecone/index.js
@@ -148,8 +148,8 @@ const Pinecone = {
           documentVectors.push({ docId, vectorId: vectorRecord.id });
         }
       } else {
-        console.error(
-          "Could not use OpenAI to embed document chunks! This document will not be recorded."
+        throw new Error(
+          "Could not embed document chunks! This document will not be recorded."
         );
       }
 
diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js
index 81c5a5a4d..376c7d8bc 100644
--- a/server/utils/vectorDbProviders/qdrant/index.js
+++ b/server/utils/vectorDbProviders/qdrant/index.js
@@ -201,8 +201,8 @@ const QDrant = {
           documentVectors.push({ docId, vectorId: vectorRecord.id });
         }
       } else {
-        console.error(
-          "Could not use OpenAI to embed document chunks! This document will not be recorded."
+        throw new Error(
+          "Could not embed document chunks! This document will not be recorded."
         );
       }
 
diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js
index 0381214c4..cdc40acd7 100644
--- a/server/utils/vectorDbProviders/weaviate/index.js
+++ b/server/utils/vectorDbProviders/weaviate/index.js
@@ -267,8 +267,8 @@ const Weaviate = {
           documentVectors.push({ docId, vectorId: vectorRecord.id });
         }
       } else {
-        console.error(
-          "Could not use OpenAI to embed document chunks! This document will not be recorded."
+        throw new Error(
+          "Could not embed document chunks! This document will not be recorded."
         );
       }
 
-- 
GitLab