diff --git a/.vscode/settings.json b/.vscode/settings.json
index 3fcc79cd5a3c4933b257f471c9f172ea26a7b36f..549fd1574243fca22573c0fef669ba6b695fb060 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -43,6 +43,7 @@
     "searxng",
     "Serper",
     "Serply",
+    "streamable",
     "textgenwebui",
     "togetherai",
     "Unembed",
diff --git a/server/endpoints/api/workspace/index.js b/server/endpoints/api/workspace/index.js
index c9a6cb51e5231bead1bcc21a6983eee381b20c43..1fe9ad8dc345ebbe73ce51e95e444e6efa9616bc 100644
--- a/server/endpoints/api/workspace/index.js
+++ b/server/endpoints/api/workspace/index.js
@@ -4,19 +4,16 @@ const { Telemetry } = require("../../../models/telemetry");
 const { DocumentVectors } = require("../../../models/vectors");
 const { Workspace } = require("../../../models/workspace");
 const { WorkspaceChats } = require("../../../models/workspaceChats");
-const { chatWithWorkspace } = require("../../../utils/chats");
 const { getVectorDbClass } = require("../../../utils/helpers");
 const { multiUserMode, reqBody } = require("../../../utils/http");
 const { validApiKey } = require("../../../utils/middleware/validApiKey");
-const {
-  streamChatWithWorkspace,
-  VALID_CHAT_MODE,
-} = require("../../../utils/chats/stream");
+const { VALID_CHAT_MODE } = require("../../../utils/chats/stream");
 const { EventLogs } = require("../../../models/eventLogs");
 const {
   convertToChatHistory,
   writeResponseChunk,
 } = require("../../../utils/helpers/chat/responses");
+const { ApiChatHandler } = require("../../../utils/chats/apiChatHandler");
 
 function apiWorkspaceEndpoints(app) {
   if (!app) return;
@@ -584,7 +581,7 @@ function apiWorkspaceEndpoints(app) {
       try {
         const { slug } = request.params;
         const { message, mode = "query" } = reqBody(request);
-        const workspace = await Workspace.get({ slug });
+        const workspace = await Workspace.get({ slug: String(slug) });
 
         if (!workspace) {
           response.status(400).json({
@@ -612,9 +609,17 @@ function apiWorkspaceEndpoints(app) {
           return;
         }
 
-        const result = await chatWithWorkspace(workspace, message, mode);
+        const result = await ApiChatHandler.chatSync({
+          workspace,
+          message,
+          mode,
+          user: null,
+          thread: null,
+        });
+
         await Telemetry.sendTelemetry("sent_chat", {
-          LLMSelection: process.env.LLM_PROVIDER || "openai",
+          LLMSelection:
+            workspace.chatProvider ?? process.env.LLM_PROVIDER ?? "openai",
           Embedder: process.env.EMBEDDING_ENGINE || "inherit",
           VectorDbSelection: process.env.VECTOR_DB || "lancedb",
           TTSSelection: process.env.TTS_PROVIDER || "native",
@@ -623,7 +628,7 @@ function apiWorkspaceEndpoints(app) {
           workspaceName: workspace?.name,
           chatModel: workspace?.chatModel || "System Default",
         });
-        response.status(200).json({ ...result });
+        return response.status(200).json({ ...result });
       } catch (e) {
         console.error(e.message, e);
         response.status(500).json({
@@ -702,7 +707,7 @@ function apiWorkspaceEndpoints(app) {
       try {
         const { slug } = request.params;
         const { message, mode = "query" } = reqBody(request);
-        const workspace = await Workspace.get({ slug });
+        const workspace = await Workspace.get({ slug: String(slug) });
 
         if (!workspace) {
           response.status(400).json({
@@ -736,9 +741,17 @@ function apiWorkspaceEndpoints(app) {
         response.setHeader("Connection", "keep-alive");
         response.flushHeaders();
 
-        await streamChatWithWorkspace(response, workspace, message, mode);
+        await ApiChatHandler.streamChat({
+          response,
+          workspace,
+          message,
+          mode,
+          user: null,
+          thread: null,
+        });
         await Telemetry.sendTelemetry("sent_chat", {
-          LLMSelection: process.env.LLM_PROVIDER || "openai",
+          LLMSelection:
+            workspace.chatProvider ?? process.env.LLM_PROVIDER ?? "openai",
           Embedder: process.env.EMBEDDING_ENGINE || "inherit",
           VectorDbSelection: process.env.VECTOR_DB || "lancedb",
           TTSSelection: process.env.TTS_PROVIDER || "native",
diff --git a/server/endpoints/api/workspaceThread/index.js b/server/endpoints/api/workspaceThread/index.js
index de30e24d0b53d80218d527ced6022949531d6cb6..cdc4d598cf90506a019c8ea831c4258045784d9f 100644
--- a/server/endpoints/api/workspaceThread/index.js
+++ b/server/endpoints/api/workspaceThread/index.js
@@ -3,7 +3,6 @@ const { WorkspaceThread } = require("../../../models/workspaceThread");
 const { Workspace } = require("../../../models/workspace");
 const { validApiKey } = require("../../../utils/middleware/validApiKey");
 const { reqBody, multiUserMode } = require("../../../utils/http");
-const { chatWithWorkspace } = require("../../../utils/chats");
 const {
   streamChatWithWorkspace,
   VALID_CHAT_MODE,
@@ -16,6 +15,7 @@ const {
 } = require("../../../utils/helpers/chat/responses");
 const { WorkspaceChats } = require("../../../models/workspaceChats");
 const { User } = require("../../../models/user");
+const { ApiChatHandler } = require("../../../utils/chats/apiChatHandler");
 
 function apiWorkspaceThreadEndpoints(app) {
   if (!app) return;
@@ -405,13 +405,13 @@ function apiWorkspaceThreadEndpoints(app) {
         }
 
         const user = userId ? await User.get({ id: Number(userId) }) : null;
-        const result = await chatWithWorkspace(
+        const result = await ApiChatHandler.chatSync({
           workspace,
           message,
           mode,
           user,
-          thread
-        );
+          thread,
+        });
         await Telemetry.sendTelemetry("sent_chat", {
           LLMSelection: process.env.LLM_PROVIDER || "openai",
           Embedder: process.env.EMBEDDING_ENGINE || "inherit",
@@ -556,14 +556,14 @@ function apiWorkspaceThreadEndpoints(app) {
         response.setHeader("Connection", "keep-alive");
         response.flushHeaders();
 
-        await streamChatWithWorkspace(
+        await ApiChatHandler.streamChat({
           response,
           workspace,
           message,
           mode,
           user,
-          thread
-        );
+          thread,
+        });
         await Telemetry.sendTelemetry("sent_chat", {
           LLMSelection: process.env.LLM_PROVIDER || "openai",
           Embedder: process.env.EMBEDDING_ENGINE || "inherit",
diff --git a/server/utils/chats/apiChatHandler.js b/server/utils/chats/apiChatHandler.js
new file mode 100644
index 0000000000000000000000000000000000000000..a52e2da14f814a1709d69e8c67238f3c1402b4a3
--- /dev/null
+++ b/server/utils/chats/apiChatHandler.js
@@ -0,0 +1,488 @@
+const { v4: uuidv4 } = require("uuid");
+const { DocumentManager } = require("../DocumentManager");
+const { WorkspaceChats } = require("../../models/workspaceChats");
+const { getVectorDbClass, getLLMProvider } = require("../helpers");
+const { writeResponseChunk } = require("../helpers/chat/responses");
+const { chatPrompt, sourceIdentifier, recentChatHistory } = require("./index");
+
+/**
+ * @typedef ResponseObject
+ * @property {string} id - uuid of response
+ * @property {string} type - Type of response
+ * @property {string|null} textResponse - full text response
+ * @property {object[]} sources
+ * @property {boolean} close
+ * @property {string|null} error
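+ * @property {number} [chatId] - id of the saved chat record, present only on successful completions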
+ */
+
+/**
+ * Handle synchronous chats with your workspace via the developer API endpoint
+ * @param {{
+ *  workspace: import("@prisma/client").workspaces,
+ *  message: string,
+ *  mode: "chat"|"query",
+ *  user: import("@prisma/client").users|null,
+ *  thread: import("@prisma/client").workspace_threads|null,
+ * }} parameters
+ * @returns {Promise<ResponseObject>}
+ */
+async function chatSync({
+  workspace,
+  message = null,
+  mode = "chat",
+  user = null,
+  thread = null,
+}) {
+  const uuid = uuidv4();
+  const chatMode = mode ?? "chat";
+  const LLMConnector = getLLMProvider({
+    provider: workspace?.chatProvider,
+    model: workspace?.chatModel,
+  });
+  const VectorDb = getVectorDbClass();
+  const messageLimit = workspace?.openAiHistory || 20;
+  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
+  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
+
+  // The user is trying to chat in query mode with a workspace that has no data in it, so
+  // we exit early since no information can be found under these conditions.
+  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
+    const textResponse =
+      workspace?.queryRefusalResponse ??
+      "There is no relevant information in this workspace to answer your query.";
+
+    await WorkspaceChats.new({
+      workspaceId: workspace.id,
+      prompt: String(message),
+      response: {
+        text: textResponse,
+        sources: [],
+        type: chatMode,
+      },
+      include: false,
+    });
+
+    return {
+      id: uuid,
+      type: "textResponse",
+      sources: [],
+      close: true,
+      error: null,
+      textResponse,
+    };
+  }
+
+  // If we are here we know that we are in a workspace that is:
+  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
+  // 2. Chatting in "query" mode and has at least 1 embedding
+  let contextTexts = [];
+  let sources = [];
+  let pinnedDocIdentifiers = [];
+  const { rawHistory, chatHistory } = await recentChatHistory({
+    user,
+    workspace,
+    thread,
+    messageLimit,
+    chatMode,
+  });
+
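+  // Inject pinned documents (if any) into the context ahead of the vector search results. Their
+  // identifiers are passed to the similarity search below so already-pinned chunks are not retrieved twice.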
+  await new DocumentManager({
+    workspace,
+    maxTokens: LLMConnector.promptWindowLimit(),
+  })
+    .pinnedDocs()
+    .then((pinnedDocs) => {
+      pinnedDocs.forEach((doc) => {
+        const { pageContent, ...metadata } = doc;
+        pinnedDocIdentifiers.push(sourceIdentifier(doc));
+        contextTexts.push(doc.pageContent);
+        sources.push({
+          text:
+            pageContent.slice(0, 1_000) +
+            "...continued on in source document...",
+          ...metadata,
+        });
+      });
+    });
+
+  const vectorSearchResults =
+    embeddingsCount !== 0
+      ? await VectorDb.performSimilaritySearch({
+          namespace: workspace.slug,
+          input: message,
+          LLMConnector,
+          similarityThreshold: workspace?.similarityThreshold,
+          topN: workspace?.topN,
+          filterIdentifiers: pinnedDocIdentifiers,
+        })
+      : {
+          contextTexts: [],
+          sources: [],
+          message: null,
+        };
+
+  // If the similarity search was run and it failed, abort early and surface the error.
+  if (!!vectorSearchResults.message) {
+    return {
+      id: uuid,
+      type: "abort",
+      textResponse: null,
+      sources: [],
+      close: true,
+      error: vectorSearchResults.message,
+    };
+  }
+
+  const { fillSourceWindow } = require("../helpers/chat");
+  const filledSources = fillSourceWindow({
+    nDocs: workspace?.topN || 4,
+    searchResults: vectorSearchResults.sources,
+    history: rawHistory,
+    filterIdentifiers: pinnedDocIdentifiers,
+  });
+
+  // Why does contextTexts get all the info while sources only get the current search results?
+  // This lets the LLM "comprehend" a contextual response without populating the Citations under
+  // a response with documents the user "thinks" are irrelevant, which would otherwise happen
+  // because we backfill context from history to keep chats with the LLM more accurate.
+  // If a past citation was used to answer the question, it is visible in the history, so it
+  // never appears to the user that a new response relied on information irrelevant to the prompt.
+  // TL;DR: reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
+  contextTexts = [...contextTexts, ...filledSources.contextTexts];
+  sources = [...sources, ...vectorSearchResults.sources];
+
+  // If we are in query mode and no context chunks were found from search, backfill, or pins,
+  // exit early rather than letting the LLM hallucinate a response or use general knowledge.
+  if (chatMode === "query" && contextTexts.length === 0) {
+    const textResponse =
+      workspace?.queryRefusalResponse ??
+      "There is no relevant information in this workspace to answer your query.";
+
+    await WorkspaceChats.new({
+      workspaceId: workspace.id,
+      prompt: message,
+      response: {
+        text: textResponse,
+        sources: [],
+        type: chatMode,
+      },
+      threadId: thread?.id || null,
+      include: false,
+      user,
+    });
+
+    return {
+      id: uuid,
+      type: "textResponse",
+      sources: [],
+      close: true,
+      error: null,
+      textResponse,
+    };
+  }
+
+  // Compress & Assemble message to ensure prompt passes token limit with room for response
+  // and build system messages based on inputs and history.
+  const messages = await LLMConnector.compressMessages(
+    {
+      systemPrompt: chatPrompt(workspace),
+      userPrompt: message,
+      contextTexts,
+      chatHistory,
+    },
+    rawHistory
+  );
+
+  // Send the text completion.
+  const textResponse = await LLMConnector.getChatCompletion(messages, {
+    temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
+  });
+
+  if (!textResponse) {
+    return {
+      id: uuid,
+      type: "abort",
+      textResponse: null,
+      sources: [],
+      close: true,
+      error: "No text completion could be completed with this input.",
+    };
+  }
+
+  const { chat } = await WorkspaceChats.new({
+    workspaceId: workspace.id,
+    prompt: message,
+    response: { text: textResponse, sources, type: chatMode },
+    threadId: thread?.id || null,
+    user,
+  });
+
+  return {
+    id: uuid,
+    type: "textResponse",
+    close: true,
+    error: null,
+    chatId: chat.id,
+    textResponse,
+    sources,
+  };
+}
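+// Example usage (as in the workspace chat endpoint, server/endpoints/api/workspace/index.js):
+//   const result = await ApiChatHandler.chatSync({ workspace, message, mode, user: null, thread: null });
+//   response.status(200).json({ ...result });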
+
+/**
+ * Handle streamable HTTP chunks for chats with your workspace via the developer API endpoint
+ * @param {{
+ *  response: import("express").Response,
+ *  workspace: import("@prisma/client").workspaces,
+ *  message: string,
+ *  mode: "chat"|"query",
+ *  user: import("@prisma/client").users|null,
+ *  thread: import("@prisma/client").workspace_threads|null,
+ * }} parameters
+ * @returns {Promise<void>}
+ */
+async function streamChat({
+  response,
+  workspace,
+  message = null,
+  mode = "chat",
+  user = null,
+  thread = null,
+}) {
+  const uuid = uuidv4();
+  const chatMode = mode ?? "chat";
+  const LLMConnector = getLLMProvider({
+    provider: workspace?.chatProvider,
+    model: workspace?.chatModel,
+  });
+
+  const VectorDb = getVectorDbClass();
+  const messageLimit = workspace?.openAiHistory || 20;
+  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
+  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
+
+  // The user is trying to chat in query mode with a workspace that has no data in it, so
+  // we exit early since no information can be found under these conditions.
+  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
+    const textResponse =
+      workspace?.queryRefusalResponse ??
+      "There is no relevant information in this workspace to answer your query.";
+    writeResponseChunk(response, {
+      id: uuid,
+      type: "textResponse",
+      textResponse,
+      sources: [],
+      attachments: [],
+      close: true,
+      error: null,
+    });
+    await WorkspaceChats.new({
+      workspaceId: workspace.id,
+      prompt: message,
+      response: {
+        text: textResponse,
+        sources: [],
+        type: chatMode,
+        attachments: [],
+      },
+      threadId: thread?.id || null,
+      include: false,
+      user,
+    });
+    return;
+  }
+
+  // If we are here we know that we are in a workspace that is:
+  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
+  // 2. Chatting in "query" mode and has at least 1 embedding
+  let completeText;
+  let contextTexts = [];
+  let sources = [];
+  let pinnedDocIdentifiers = [];
+  const { rawHistory, chatHistory } = await recentChatHistory({
+    user,
+    workspace,
+    thread,
+    messageLimit,
+  });
+
+  // Look for pinned documents and check whether the user opted into this feature. We still run a vector search,
+  // since pinning is a supplemental tool that should be used with caution - it can easily blow up a context window.
+  // We limit the appended pinned context to 80% of its overall size, mostly because anything beyond that would
+  // undergo prompt compression anyway to make the request fit. If so much is pinned that the context exceeds what
+  // the model can support, it would be compressed anyway, and that is not the point of pinning - the feature is
+  // best suited for high-context models.
+  await new DocumentManager({
+    workspace,
+    maxTokens: LLMConnector.promptWindowLimit(),
+  })
+    .pinnedDocs()
+    .then((pinnedDocs) => {
+      pinnedDocs.forEach((doc) => {
+        const { pageContent, ...metadata } = doc;
+        pinnedDocIdentifiers.push(sourceIdentifier(doc));
+        contextTexts.push(doc.pageContent);
+        sources.push({
+          text:
+            pageContent.slice(0, 1_000) +
+            "...continued on in source document...",
+          ...metadata,
+        });
+      });
+    });
+
+  const vectorSearchResults =
+    embeddingsCount !== 0
+      ? await VectorDb.performSimilaritySearch({
+          namespace: workspace.slug,
+          input: message,
+          LLMConnector,
+          similarityThreshold: workspace?.similarityThreshold,
+          topN: workspace?.topN,
+          filterIdentifiers: pinnedDocIdentifiers,
+        })
+      : {
+          contextTexts: [],
+          sources: [],
+          message: null,
+        };
+
+  // If the similarity search was run and it failed, abort early and surface the error.
+  if (!!vectorSearchResults.message) {
+    writeResponseChunk(response, {
+      id: uuid,
+      type: "abort",
+      textResponse: null,
+      sources: [],
+      close: true,
+      error: vectorSearchResults.message,
+    });
+    return;
+  }
+
+  const { fillSourceWindow } = require("../helpers/chat");
+  const filledSources = fillSourceWindow({
+    nDocs: workspace?.topN || 4,
+    searchResults: vectorSearchResults.sources,
+    history: rawHistory,
+    filterIdentifiers: pinnedDocIdentifiers,
+  });
+
+  // Why does contextTexts get all the info while sources only get the current search results?
+  // This lets the LLM "comprehend" a contextual response without populating the Citations under
+  // a response with documents the user "thinks" are irrelevant, which would otherwise happen
+  // because we backfill context from history to keep chats with the LLM more accurate.
+  // If a past citation was used to answer the question, it is visible in the history, so it
+  // never appears to the user that a new response relied on information irrelevant to the prompt.
+  // TL;DR: reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
+  contextTexts = [...contextTexts, ...filledSources.contextTexts];
+  sources = [...sources, ...vectorSearchResults.sources];
+
+  // If we are in query mode and no context chunks were found from search, backfill, or pins,
+  // exit early rather than letting the LLM hallucinate a response or use general knowledge.
+  if (chatMode === "query" && contextTexts.length === 0) {
+    const textResponse =
+      workspace?.queryRefusalResponse ??
+      "There is no relevant information in this workspace to answer your query.";
+    writeResponseChunk(response, {
+      id: uuid,
+      type: "textResponse",
+      textResponse,
+      sources: [],
+      close: true,
+      error: null,
+    });
+
+    await WorkspaceChats.new({
+      workspaceId: workspace.id,
+      prompt: message,
+      response: {
+        text: textResponse,
+        sources: [],
+        type: chatMode,
+        attachments: [],
+      },
+      threadId: thread?.id || null,
+      include: false,
+      user,
+    });
+    return;
+  }
+
+  // Compress & Assemble message to ensure prompt passes token limit with room for response
+  // and build system messages based on inputs and history.
+  const messages = await LLMConnector.compressMessages(
+    {
+      systemPrompt: chatPrompt(workspace),
+      userPrompt: message,
+      contextTexts,
+      chatHistory,
+    },
+    rawHistory
+  );
+
+  // If streaming is not explicitly enabled for connector
+  // we do regular waiting of a response and send a single chunk.
+  if (LLMConnector.streamingEnabled() !== true) {
+    console.log(
+      `\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
+    );
+    completeText = await LLMConnector.getChatCompletion(messages, {
+      temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
+    });
+    writeResponseChunk(response, {
+      uuid,
+      sources,
+      type: "textResponseChunk",
+      textResponse: completeText,
+      close: true,
+      error: false,
+    });
+  } else {
+    const stream = await LLMConnector.streamGetChatCompletion(messages, {
+      temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
+    });
+    completeText = await LLMConnector.handleStream(response, stream, {
+      uuid,
+      sources,
+    });
+  }
+
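+  // Only persist the chat and include a chatId in the final chunk when the model actually returned text.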
+  if (completeText?.length > 0) {
+    const { chat } = await WorkspaceChats.new({
+      workspaceId: workspace.id,
+      prompt: message,
+      response: { text: completeText, sources, type: chatMode },
+      threadId: thread?.id || null,
+      user,
+    });
+
+    writeResponseChunk(response, {
+      uuid,
+      type: "finalizeResponseStream",
+      close: true,
+      error: false,
+      chatId: chat.id,
+    });
+    return;
+  }
+
+  writeResponseChunk(response, {
+    uuid,
+    type: "finalizeResponseStream",
+    close: true,
+    error: false,
+  });
+  return;
+}
+
+module.exports.ApiChatHandler = {
+  chatSync,
+  streamChat,
+};
diff --git a/server/utils/chats/index.js b/server/utils/chats/index.js
index dd0f6076faaeaa9c99dd6fe05afa04999d6ffe2f..3ec358728ebde0123f2f9427d486bdf26b74fd62 100644
--- a/server/utils/chats/index.js
+++ b/server/utils/chats/index.js
@@ -1,9 +1,7 @@
 const { v4: uuidv4 } = require("uuid");
 const { WorkspaceChats } = require("../../models/workspaceChats");
 const { resetMemory } = require("./commands/reset");
-const { getVectorDbClass, getLLMProvider } = require("../helpers");
 const { convertToPromptHistory } = require("../helpers/chat/responses");
-const { DocumentManager } = require("../DocumentManager");
 const { SlashCommandPresets } = require("../../models/slashCommandsPresets");
 
 const VALID_COMMANDS = {
@@ -34,216 +32,6 @@ async function grepCommand(message, user = null) {
   return updatedMessage;
 }
 
-async function chatWithWorkspace(
-  workspace,
-  message,
-  chatMode = "chat",
-  user = null,
-  thread = null
-) {
-  const uuid = uuidv4();
-  const updatedMessage = await grepCommand(message, user);
-
-  if (Object.keys(VALID_COMMANDS).includes(updatedMessage)) {
-    return await VALID_COMMANDS[updatedMessage](workspace, message, uuid, user);
-  }
-
-  const LLMConnector = getLLMProvider({
-    provider: workspace?.chatProvider,
-    model: workspace?.chatModel,
-  });
-  const VectorDb = getVectorDbClass();
-
-  const messageLimit = workspace?.openAiHistory || 20;
-  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
-  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
-
-  // User is trying to query-mode chat a workspace that has no data in it - so
-  // we should exit early as no information can be found under these conditions.
-  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
-    const textResponse =
-      workspace?.queryRefusalResponse ??
-      "There is no relevant information in this workspace to answer your query.";
-
-    await WorkspaceChats.new({
-      workspaceId: workspace.id,
-      prompt: message,
-      response: {
-        text: textResponse,
-        sources: [],
-        type: chatMode,
-      },
-      threadId: thread?.id || null,
-      include: false,
-      user,
-    });
-
-    return {
-      id: uuid,
-      type: "textResponse",
-      sources: [],
-      close: true,
-      error: null,
-      textResponse,
-    };
-  }
-
-  // If we are here we know that we are in a workspace that is:
-  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
-  // 2. Chatting in "query" mode and has at least 1 embedding
-  let contextTexts = [];
-  let sources = [];
-  let pinnedDocIdentifiers = [];
-  const { rawHistory, chatHistory } = await recentChatHistory({
-    user,
-    workspace,
-    thread,
-    messageLimit,
-    chatMode,
-  });
-
-  // See stream.js comment for more information on this implementation.
-  await new DocumentManager({
-    workspace,
-    maxTokens: LLMConnector.promptWindowLimit(),
-  })
-    .pinnedDocs()
-    .then((pinnedDocs) => {
-      pinnedDocs.forEach((doc) => {
-        const { pageContent, ...metadata } = doc;
-        pinnedDocIdentifiers.push(sourceIdentifier(doc));
-        contextTexts.push(doc.pageContent);
-        sources.push({
-          text:
-            pageContent.slice(0, 1_000) +
-            "...continued on in source document...",
-          ...metadata,
-        });
-      });
-    });
-
-  const vectorSearchResults =
-    embeddingsCount !== 0
-      ? await VectorDb.performSimilaritySearch({
-          namespace: workspace.slug,
-          input: message,
-          LLMConnector,
-          similarityThreshold: workspace?.similarityThreshold,
-          topN: workspace?.topN,
-          filterIdentifiers: pinnedDocIdentifiers,
-        })
-      : {
-          contextTexts: [],
-          sources: [],
-          message: null,
-        };
-
-  // Failed similarity search if it was run at all and failed.
-  if (!!vectorSearchResults.message) {
-    return {
-      id: uuid,
-      type: "abort",
-      textResponse: null,
-      sources: [],
-      close: true,
-      error: vectorSearchResults.message,
-    };
-  }
-
-  const { fillSourceWindow } = require("../helpers/chat");
-  const filledSources = fillSourceWindow({
-    nDocs: workspace?.topN || 4,
-    searchResults: vectorSearchResults.sources,
-    history: rawHistory,
-    filterIdentifiers: pinnedDocIdentifiers,
-  });
-
-  // Why does contextTexts get all the info, but sources only get current search?
-  // This is to give the ability of the LLM to "comprehend" a contextual response without
-  // populating the Citations under a response with documents the user "thinks" are irrelevant
-  // due to how we manage backfilling of the context to keep chats with the LLM more correct in responses.
-  // If a past citation was used to answer the question - that is visible in the history so it logically makes sense
-  // and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt.
-  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keep answers highly accurate.
-  contextTexts = [...contextTexts, ...filledSources.contextTexts];
-  sources = [...sources, ...vectorSearchResults.sources];
-
-  // If in query mode and no context chunks are found from search, backfill, or pins -  do not
-  // let the LLM try to hallucinate a response or use general knowledge and exit early
-  if (chatMode === "query" && contextTexts.length === 0) {
-    const textResponse =
-      workspace?.queryRefusalResponse ??
-      "There is no relevant information in this workspace to answer your query.";
-
-    await WorkspaceChats.new({
-      workspaceId: workspace.id,
-      prompt: message,
-      response: {
-        text: textResponse,
-        sources: [],
-        type: chatMode,
-      },
-      threadId: thread?.id || null,
-      include: false,
-      user,
-    });
-
-    return {
-      id: uuid,
-      type: "textResponse",
-      sources: [],
-      close: true,
-      error: null,
-      textResponse,
-    };
-  }
-
-  // Compress & Assemble message to ensure prompt passes token limit with room for response
-  // and build system messages based on inputs and history.
-  const messages = await LLMConnector.compressMessages(
-    {
-      systemPrompt: chatPrompt(workspace),
-      userPrompt: updatedMessage,
-      contextTexts,
-      chatHistory,
-    },
-    rawHistory
-  );
-
-  // Send the text completion.
-  const textResponse = await LLMConnector.getChatCompletion(messages, {
-    temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
-  });
-
-  if (!textResponse) {
-    return {
-      id: uuid,
-      type: "abort",
-      textResponse: null,
-      sources: [],
-      close: true,
-      error: "No text completion could be completed with this input.",
-    };
-  }
-
-  const { chat } = await WorkspaceChats.new({
-    workspaceId: workspace.id,
-    prompt: message,
-    response: { text: textResponse, sources, type: chatMode },
-    threadId: thread?.id || null,
-    user,
-  });
-  return {
-    id: uuid,
-    type: "textResponse",
-    close: true,
-    error: null,
-    chatId: chat.id,
-    textResponse,
-    sources,
-  };
-}
-
 async function recentChatHistory({
   user = null,
   workspace,