diff --git a/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx b/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx
index c2b29468c8b41b0b8563bbafa385ef0a0c9c14e9..5209410575b8e42d7a3c5de382eff16afdf6325b 100644
--- a/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx
+++ b/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx
@@ -24,7 +24,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) {
       <div className="w-full flex items-center gap-4">
         <div className="flex flex-col w-60">
           <label className="text-white text-sm font-semibold block mb-4">
-            Anthropic Claude-2 API Key
+            Anthropic API Key
           </label>
           <input
             type="password"
@@ -48,7 +48,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) {
             required={true}
             className="bg-zinc-900 border border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
           >
-            {["claude-2"].map((model) => {
+            {["claude-2", "claude-instant-1"].map((model) => {
               return (
                 <option key={model} value={model}>
                   {model}
diff --git a/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx b/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx
index 99b04fa8ece1ee653bd138c1b1a4e228e8dde4b3..c319e9c6f2cfe7b4dfc11d56960f34975a32710e 100644
--- a/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx
+++ b/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx
@@ -49,6 +49,23 @@ export default function AzureAiOptions({ settings }) {
         />
       </div>
 
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Chat Model Token Limit
+        </label>
+        <select
+          name="AzureOpenAiTokenLimit"
+          defaultValue={settings?.AzureOpenAiTokenLimit || 4096}
+          className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          required={true}
+        >
+          <option value={4096}>4,096 (gpt-3.5-turbo)</option>
+          <option value={16384}>16,384 (gpt-3.5-16k)</option>
+          <option value={8192}>8,192 (gpt-4)</option>
+          <option value={32768}>32,768 (gpt-4-32k)</option>
+        </select>
+      </div>
+
       <div className="flex flex-col w-60">
         <label className="text-white text-sm font-semibold block mb-4">
           Embedding Deployment Name
diff --git a/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx
index e12e88d12fe77465aca7aa5a9a89abcb122c1489..27f3892c6750a03ac27e45b3b3b02333c02ef77e 100644
--- a/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx
@@ -224,7 +224,6 @@ export default function WorkspaceSettings({ workspace }) {
                 </div>
                 <textarea
                   name="openAiPrompt"
-                  maxLength={500}
                   rows={5}
                   defaultValue={chatPrompt(workspace)}
                   className="bg-zinc-900 text-white text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5"
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
index 6ff16cdc51b81abaf9f1e21d5ddc5f636c327e5b..5d4b1f573d04d1d56c8a0393c93ce38d629c5c48 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
@@ -55,7 +55,6 @@ export default function PromptInput({
                 onKeyDown={captureEnter}
                 onChange={onChange}
                 required={true}
-                maxLength={240}
                 disabled={inputDisabled}
                 onFocus={() => setFocused(true)}
                 onBlur={(e) => {
diff --git a/server/endpoints/chat.js b/server/endpoints/chat.js
index 5c1276244d84d79ceeaf153911fab687e0af03b0..de2a4b4557787a03f57e65a22fafa4d3f9a57b1f 100644
--- a/server/endpoints/chat.js
+++ b/server/endpoints/chat.js
@@ -71,6 +71,7 @@ function chatEndpoints(app) {
         });
         response.status(200).json({ ...result });
       } catch (e) {
+        console.error(e);
         response.status(500).json({
           id: uuidv4(),
           type: "abort",
diff --git a/server/models/cacheData.js b/server/models/cacheData.js
new file mode 100644
index 0000000000000000000000000000000000000000..43c281d553d8eb76f499c824c3834a2cbf69f19c
--- /dev/null
+++ b/server/models/cacheData.js
@@ -0,0 +1,69 @@
+const prisma = require("../utils/prisma");
+
+const CacheData = {
+  new: async function (inputs = {}) {
+    try {
+      const cache = await prisma.cache_data.create({
+        data: inputs,
+      });
+      return { cache, message: null };
+    } catch (error) {
+      console.error(error.message);
+      return { cache: null, message: error.message };
+    }
+  },
+
+  get: async function (clause = {}, limit = null, orderBy = null) {
+    try {
+      const cache = await prisma.cache_data.findFirst({
+        where: clause,
+        ...(limit !== null ? { take: limit } : {}),
+        ...(orderBy !== null ? { orderBy } : {}),
+      });
+      return cache || null;
+    } catch (error) {
+      console.error(error.message);
+      return null;
+    }
+  },
+
+  delete: async function (clause = {}) {
+    try {
+      await prisma.cache_data.deleteMany({
+        where: clause,
+      });
+      return true;
+    } catch (error) {
+      console.error(error.message);
+      return false;
+    }
+  },
+
+  where: async function (clause = {}, limit = null, orderBy = null) {
+    try {
+      const caches = await prisma.cache_data.findMany({
+        where: clause,
+        ...(limit !== null ? { take: limit } : {}),
+        ...(orderBy !== null ? { orderBy } : {}),
+      });
+      return caches;
+    } catch (error) {
+      console.error(error.message);
+      return [];
+    }
+  },
+
+  count: async function (clause = {}) {
+    try {
+      const count = await prisma.cache_data.count({
+        where: clause,
+      });
+      return count;
+    } catch (error) {
+      console.error(error.message);
+      return 0;
+    }
+  },
+};
+
+module.exports = { CacheData };
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index 4d2f73b30f4332b79997b2dedcd43ec7c871e757..d15f73060b7c7d2732ab5909d0e2c89fa5dc0067 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -65,6 +65,7 @@ const SystemSettings = {
             AzureOpenAiKey: !!process.env.AZURE_OPENAI_KEY,
             AzureOpenAiModelPref: process.env.OPEN_MODEL_PREF,
             AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
+            AzureOpenAiTokenLimit: process.env.AZURE_OPENAI_TOKEN_LIMIT || 4096,
           }
         : {}),
 
diff --git a/server/package.json b/server/package.json
index 62879b83836fa23f5a2b1875365d147d24a08935..6bdb90aa772a258b9440f4b7ffd88711aa2b5f34 100644
--- a/server/package.json
+++ b/server/package.json
@@ -36,6 +36,7 @@
     "express": "^4.18.2",
     "extract-zip": "^2.0.1",
     "graphql": "^16.7.1",
+    "js-tiktoken": "^1.0.7",
     "jsonwebtoken": "^8.5.1",
     "langchain": "^0.0.90",
     "mime": "^3.0.0",
diff --git a/server/prisma/migrations/20231101195421_init/migration.sql b/server/prisma/migrations/20231101195421_init/migration.sql
new file mode 100644
index 0000000000000000000000000000000000000000..705bca3c3c4978f19e93ff545221dd36fbc5ccf7
--- /dev/null
+++ b/server/prisma/migrations/20231101195421_init/migration.sql
@@ -0,0 +1,11 @@
+-- CreateTable
+CREATE TABLE "cache_data" (
+    "id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+    "name" TEXT NOT NULL,
+    "data" TEXT NOT NULL,
+    "belongsTo" TEXT,
+    "byId" INTEGER,
+    "expiresAt" DATETIME,
+    "createdAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "lastUpdatedAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
diff --git a/server/prisma/schema.prisma b/server/prisma/schema.prisma
index 29a3fde299db535ece50bb1cd9e918e888c352ae..0f3190c97be31878edb0131490e4750b281701f6 100644
--- a/server/prisma/schema.prisma
+++ b/server/prisma/schema.prisma
@@ -116,3 +116,14 @@ model workspace_users {
   workspaces    workspaces @relation(fields: [workspace_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
   users         users      @relation(fields: [user_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
 }
+
+model cache_data {
+  id            Int       @id @default(autoincrement())
+  name          String
+  data          String
+  belongsTo     String?
+  byId          Int?
+  expiresAt     DateTime?
+  createdAt     DateTime  @default(now())
+  lastUpdatedAt DateTime  @default(now())
+}
diff --git a/server/utils/AiProviders/anthropic/index.js b/server/utils/AiProviders/anthropic/index.js
index d3dd68f257aac3e84139820c032c1fda4333f2c0..dca21422bd7631ccf0e700b7c1285030d8a0a4b3 100644
--- a/server/utils/AiProviders/anthropic/index.js
+++ b/server/utils/AiProviders/anthropic/index.js
@@ -12,6 +12,12 @@ class AnthropicLLM {
       apiKey: process.env.ANTHROPIC_API_KEY,
     });
     this.anthropic = anthropic;
+    this.model = process.env.ANTHROPIC_MODEL_PREF;
+    this.limits = {
+      history: this.promptWindowLimit() * 0.15,
+      system: this.promptWindowLimit() * 0.15,
+      user: this.promptWindowLimit() * 0.7,
+    };
 
     if (!embedder)
       throw new Error(
@@ -21,8 +27,19 @@ class AnthropicLLM {
     this.answerKey = v4().split("-")[0];
   }
 
-  isValidChatModel(modelName = "") {
-    const validModels = ["claude-2"];
+  promptWindowLimit() {
+    switch (this.model) {
+      case "claude-instant-1":
+        return 72_000;
+      case "claude-2":
+        return 100_000;
+      default:
+        return 72_000; // assume a claude-instant-1 model
+    }
+  }
+
+  isValidChatCompletionModel(modelName = "") {
+    const validModels = ["claude-2", "claude-instant-1"];
     return validModels.includes(modelName);
   }
 
@@ -62,24 +79,25 @@ class AnthropicLLM {
     \n\nAssistant:`;
   }
 
-  // This is the interface used when no embeddings are present in the workspace
-  // This is just having a conversation with the LLM as one would normally.
-  async sendChat(chatHistory = [], prompt, workspace = {}) {
-    const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2";
-    if (!this.isValidChatModel(model))
+  async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
+    if (!this.isValidChatCompletionModel(this.model))
       throw new Error(
-        `Anthropic chat: ${model} is not valid for chat completion!`
+        `Anthropic chat: ${this.model} is not valid for chat completion!`
       );
 
+    const compressedPrompt = await this.compressMessages(
+      {
+        systemPrompt: chatPrompt(workspace),
+        userPrompt: prompt,
+        chatHistory,
+      },
+      rawHistory
+    );
     const { content, error } = await this.anthropic.completions
       .create({
-        model: "claude-2",
+        model: this.model,
         max_tokens_to_sample: 300,
-        prompt: this.constructPrompt({
-          systemPrompt: chatPrompt(workspace),
-          userPrompt: prompt,
-          chatHistory,
-        }),
+        prompt: compressedPrompt,
       })
       .then((res) => {
         const { completion } = res;
@@ -100,15 +118,14 @@ class AnthropicLLM {
   }
 
   async getChatCompletion(prompt = "", _opts = {}) {
-    const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2";
-    if (!this.isValidChatModel(model))
+    if (!this.isValidChatCompletionModel(this.model))
       throw new Error(
-        `Anthropic chat: ${model} is not valid for chat completion!`
+        `Anthropic chat: ${this.model} is not valid for chat completion!`
       );
 
     const { content, error } = await this.anthropic.completions
       .create({
-        model: "claude-2",
+        model: this.model,
         max_tokens_to_sample: 300,
         prompt,
       })
@@ -130,6 +147,16 @@ class AnthropicLLM {
     return content;
   }
 
+  async compressMessages(promptArgs = {}, rawHistory = []) {
+    const { messageStringCompressor } = require("../../helpers/chat");
+    const compressedPrompt = await messageStringCompressor(
+      this,
+      promptArgs,
+      rawHistory
+    );
+    return compressedPrompt;
+  }
+
   // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
   async embedTextInput(textInput) {
     return await this.embedder.embedTextInput(textInput);
diff --git a/server/utils/AiProviders/azureOpenAi/index.js b/server/utils/AiProviders/azureOpenAi/index.js
index 6c450c5d38e0e75a39586ebe95f5d74a7b58e5f2..30059035df1f1e09323739cab9af889a5499017d 100644
--- a/server/utils/AiProviders/azureOpenAi/index.js
+++ b/server/utils/AiProviders/azureOpenAi/index.js
@@ -1,4 +1,5 @@
 const { AzureOpenAiEmbedder } = require("../../EmbeddingEngines/azureOpenAi");
+const { chatPrompt } = require("../../chats");
 
 class AzureOpenAiLLM extends AzureOpenAiEmbedder {
   constructor() {
@@ -13,9 +14,24 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
       process.env.AZURE_OPENAI_ENDPOINT,
       new AzureKeyCredential(process.env.AZURE_OPENAI_KEY)
     );
+    this.model = process.env.OPEN_MODEL_PREF;
+    this.limits = {
+      history: this.promptWindowLimit() * 0.15,
+      system: this.promptWindowLimit() * 0.15,
+      user: this.promptWindowLimit() * 0.7,
+    };
+  }
+
+  // Ensure the user selected a proper value for the token limit
+  // could be any of these https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-models
+  // and if undefined - assume it is the lowest end.
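+  // e.g. AZURE_OPENAI_TOKEN_LIMIT=16384 for a gpt-3.5-16k deployment (illustrative value).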
+  promptWindowLimit() {
+    return !!process.env.AZURE_OPENAI_TOKEN_LIMIT
+      ? Number(process.env.AZURE_OPENAI_TOKEN_LIMIT)
+      : 4096;
   }
 
-  isValidChatModel(_modelName = "") {
+  isValidChatCompletionModel(_modelName = "") {
     // The Azure user names their "models" as deployments and they can be any name
     // so we rely on the user to put in the correct deployment as only they would
     // know it.
@@ -31,7 +47,7 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
     const prompt = {
       role: "system",
       content: `${systemPrompt}
-    Context:
+Context:
     ${contextTexts
       .map((text, i) => {
         return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
@@ -46,26 +62,25 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
     return { safe: true, reasons: [] };
   }
 
-  async sendChat(chatHistory = [], prompt, workspace = {}) {
-    const model = process.env.OPEN_MODEL_PREF;
-    if (!model)
+  async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
+    if (!this.model)
       throw new Error(
         "No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
       );
 
+    const messages = await this.compressMessages(
+      {
+        systemPrompt: chatPrompt(workspace),
+        userPrompt: prompt,
+        chatHistory,
+      },
+      rawHistory
+    );
     const textResponse = await this.openai
-      .getChatCompletions(
-        model,
-        [
-          { role: "system", content: "" },
-          ...chatHistory,
-          { role: "user", content: prompt },
-        ],
-        {
-          temperature: Number(workspace?.openAiTemp ?? 0.7),
-          n: 1,
-        }
-      )
+      .getChatCompletions(this.model, messages, {
+        temperature: Number(workspace?.openAiTemp ?? 0.7),
+        n: 1,
+      })
       .then((res) => {
         if (!res.hasOwnProperty("choices"))
           throw new Error("OpenAI chat: No results!");
@@ -83,18 +98,23 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
   }
 
   async getChatCompletion(messages = [], { temperature = 0.7 }) {
-    const model = process.env.OPEN_MODEL_PREF;
-    if (!model)
+    if (!this.model)
       throw new Error(
         "No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
       );
 
-    const data = await this.openai.getChatCompletions(model, messages, {
+    const data = await this.openai.getChatCompletions(this.model, messages, {
       temperature,
     });
     if (!data.hasOwnProperty("choices")) return null;
     return data.choices[0].message.content;
   }
+
+  async compressMessages(promptArgs = {}, rawHistory = []) {
+    const { messageArrayCompressor } = require("../../helpers/chat");
+    const messageArray = this.constructPrompt(promptArgs);
+    return await messageArrayCompressor(this, messageArray, rawHistory);
+  }
 }
 
 module.exports = {
diff --git a/server/utils/AiProviders/openAi/index.js b/server/utils/AiProviders/openAi/index.js
index 1efaa7466ef61f64355623050eac358ab483c6c4..91c11592f7fb23196a25755a8db9df66a55b6097 100644
--- a/server/utils/AiProviders/openAi/index.js
+++ b/server/utils/AiProviders/openAi/index.js
@@ -1,4 +1,5 @@
 const { OpenAiEmbedder } = require("../../EmbeddingEngines/openAi");
+const { chatPrompt } = require("../../chats");
 
 class OpenAiLLM extends OpenAiEmbedder {
   constructor() {
@@ -10,6 +11,23 @@ class OpenAiLLM extends OpenAiEmbedder {
       apiKey: process.env.OPEN_AI_KEY,
     });
     this.openai = new OpenAIApi(config);
+    this.model = process.env.OPEN_MODEL_PREF;
+    this.limits = {
+      history: this.promptWindowLimit() * 0.15,
+      system: this.promptWindowLimit() * 0.15,
+      user: this.promptWindowLimit() * 0.7,
+    };
+  }
+
+  promptWindowLimit() {
+    switch (this.model) {
+      case "gpt-3.5-turbo":
+        return 4096;
+      case "gpt-4":
+        return 8192;
+      default:
+        return 4096; // assume a fine-tune 3.5
+    }
   }
 
   async isValidChatCompletionModel(modelName = "") {
@@ -33,7 +51,7 @@ class OpenAiLLM extends OpenAiEmbedder {
     const prompt = {
       role: "system",
       content: `${systemPrompt}
-    Context:
+Context:
     ${contextTexts
       .map((text, i) => {
         return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
@@ -75,7 +93,7 @@ class OpenAiLLM extends OpenAiEmbedder {
     return { safe: false, reasons };
   }
 
-  async sendChat(chatHistory = [], prompt, workspace = {}) {
+  async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
     const model = process.env.OPEN_MODEL_PREF;
     if (!(await this.isValidChatCompletionModel(model)))
       throw new Error(
@@ -87,11 +105,14 @@ class OpenAiLLM extends OpenAiEmbedder {
         model,
         temperature: Number(workspace?.openAiTemp ?? 0.7),
         n: 1,
-        messages: [
-          { role: "system", content: "" },
-          ...chatHistory,
-          { role: "user", content: prompt },
-        ],
+        messages: await this.compressMessages(
+          {
+            systemPrompt: chatPrompt(workspace),
+            userPrompt: prompt,
+            chatHistory,
+          },
+          rawHistory
+        ),
       })
       .then((json) => {
         const res = json.data;
@@ -111,14 +132,13 @@ class OpenAiLLM extends OpenAiEmbedder {
   }
 
   async getChatCompletion(messages = null, { temperature = 0.7 }) {
-    const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
-    if (!(await this.isValidChatCompletionModel(model)))
+    if (!(await this.isValidChatCompletionModel(this.model)))
       throw new Error(
-        `OpenAI chat: ${model} is not valid for chat completion!`
+        `OpenAI chat: ${this.model} is not valid for chat completion!`
       );
 
     const { data } = await this.openai.createChatCompletion({
-      model,
+      model: this.model,
       messages,
       temperature,
     });
@@ -126,6 +146,12 @@ class OpenAiLLM extends OpenAiEmbedder {
     if (!data.hasOwnProperty("choices")) return null;
     return data.choices[0].message.content;
   }
+
+  async compressMessages(promptArgs = {}, rawHistory = []) {
+    const { messageArrayCompressor } = require("../../helpers/chat");
+    const messageArray = this.constructPrompt(promptArgs);
+    return await messageArrayCompressor(this, messageArray, rawHistory);
+  }
 }
 
 module.exports = {
diff --git a/server/utils/chats/index.js b/server/utils/chats/index.js
index 77d413323b80bc2ace772f5c87575db32be1f83b..b2c8b8d3e11c02e61b60a2e71e4068980aea9487 100644
--- a/server/utils/chats/index.js
+++ b/server/utils/chats/index.js
@@ -91,91 +91,146 @@ async function chatWithWorkspace(
   const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
   const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
   if (!hasVectorizedSpace || embeddingsCount === 0) {
-    const rawHistory = (
-      user
-        ? await WorkspaceChats.forWorkspaceByUser(
-            workspace.id,
-            user.id,
-            messageLimit,
-            { id: "desc" }
-          )
-        : await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
-            id: "desc",
-          })
-    ).reverse();
-    const chatHistory = convertToPromptHistory(rawHistory);
-    const response = await LLMConnector.sendChat(
-      chatHistory,
-      message,
-      workspace
-    );
-    const data = { text: response, sources: [], type: "chat" };
-
-    await WorkspaceChats.new({
-      workspaceId: workspace.id,
-      prompt: message,
-      response: data,
+    // If there are no embeddings - chat like a normal LLM chat interface.
+    return await emptyEmbeddingChat({
+      uuid,
       user,
+      message,
+      workspace,
+      messageLimit,
+      LLMConnector,
     });
+  }
+
+  const { rawHistory, chatHistory } = await recentChatHistory(
+    user,
+    workspace,
+    messageLimit,
+    chatMode
+  );
+  const {
+    contextTexts = [],
+    sources = [],
+    message: error,
+  } = await VectorDb.performSimilaritySearch({
+    namespace: workspace.slug,
+    input: message,
+    LLMConnector,
+  });
+
+  // Failed similarity search.
+  if (!!error) {
     return {
       id: uuid,
-      type: "textResponse",
-      textResponse: response,
+      type: "abort",
+      textResponse: null,
       sources: [],
       close: true,
-      error: null,
+      error,
     };
-  } else {
-    const rawHistory = (
-      user
-        ? await WorkspaceChats.forWorkspaceByUser(
-            workspace.id,
-            user.id,
-            messageLimit,
-            { id: "desc" }
-          )
-        : await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
-            id: "desc",
-          })
-    ).reverse();
-    const chatHistory = convertToPromptHistory(rawHistory);
-    const {
-      response,
-      sources,
-      message: error,
-    } = await VectorDb[chatMode]({
-      namespace: workspace.slug,
-      input: message,
-      workspace,
+  }
+
+  // Compress messages to ensure the prompt fits within the token limit with room for a response,
+  // and build the system messages based on the inputs and history.
+  const messages = await LLMConnector.compressMessages(
+    {
+      systemPrompt: chatPrompt(workspace),
+      userPrompt: message,
+      contextTexts,
       chatHistory,
-    });
-    if (!response) {
-      return {
-        id: uuid,
-        type: "abort",
-        textResponse: null,
-        sources: [],
-        close: true,
-        error,
-      };
-    }
+    },
+    rawHistory
+  );
 
-    const data = { text: response, sources, type: chatMode };
-    await WorkspaceChats.new({
-      workspaceId: workspace.id,
-      prompt: message,
-      response: data,
-      user,
-    });
+  // Send the text completion.
+  const textResponse = await LLMConnector.getChatCompletion(messages, {
+    temperature: workspace?.openAiTemp ?? 0.7,
+  });
+
+  if (!textResponse) {
     return {
       id: uuid,
-      type: "textResponse",
-      textResponse: response,
-      sources,
+      type: "abort",
+      textResponse: null,
+      sources: [],
       close: true,
-      error,
+      error: "No text completion could be completed with this input.",
     };
   }
+
+  await WorkspaceChats.new({
+    workspaceId: workspace.id,
+    prompt: message,
+    response: { text: textResponse, sources, type: chatMode },
+    user,
+  });
+  return {
+    id: uuid,
+    type: "textResponse",
+    close: true,
+    textResponse,
+    sources,
+    error,
+  };
+}
+
+// On query we don't return message history. For all other chat modes, and when chatting
+// with no embeddings, we return history.
+async function recentChatHistory(
+  user = null,
+  workspace,
+  messageLimit = 20,
+  chatMode = null
+) {
+  if (chatMode === "query") return [];
+  const rawHistory = (
+    user
+      ? await WorkspaceChats.forWorkspaceByUser(
+          workspace.id,
+          user.id,
+          messageLimit,
+          { id: "desc" }
+        )
+      : await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
+          id: "desc",
+        })
+  ).reverse();
+  return { rawHistory, chatHistory: convertToPromptHistory(rawHistory) };
+}
+
+async function emptyEmbeddingChat({
+  uuid,
+  user,
+  message,
+  workspace,
+  messageLimit,
+  LLMConnector,
+}) {
+  const { rawHistory, chatHistory } = await recentChatHistory(
+    user,
+    workspace,
+    messageLimit
+  );
+  const textResponse = await LLMConnector.sendChat(
+    chatHistory,
+    message,
+    workspace,
+    rawHistory
+  );
+  await WorkspaceChats.new({
+    workspaceId: workspace.id,
+    prompt: message,
+    response: { text: textResponse, sources: [], type: "chat" },
+    user,
+  });
+  return {
+    id: uuid,
+    type: "textResponse",
+    sources: [],
+    close: true,
+    error: null,
+    textResponse,
+  };
 }
 
 function chatPrompt(workspace) {
@@ -186,6 +241,7 @@ function chatPrompt(workspace) {
 }
 
 module.exports = {
+  convertToPromptHistory,
   convertToChatHistory,
   chatWithWorkspace,
   chatPrompt,
diff --git a/server/utils/helpers/chat/index.js b/server/utils/helpers/chat/index.js
new file mode 100644
index 0000000000000000000000000000000000000000..ed7eab90fc86be39639e5f3cb62884306c683abd
--- /dev/null
+++ b/server/utils/helpers/chat/index.js
@@ -0,0 +1,325 @@
+const { convertToPromptHistory } = require("../../chats");
+const { TokenManager } = require("../tiktoken");
+
+/*
+What is the message Array compressor?
+TLDR: So anyway, I started blasting (your prompts & stuff)
+
+messageArrayCompressor arose out of the need for users to be able to insert prompts of unlimited token length
+and still maintain coherent history, system instructions, and context, if applicable.
+
+We took an opinionated approach that, after much back-testing, we found retains a highly coherent answer
+under most conditions a user would encounter while using this specific system. While other systems may
+use a more advanced model to compress message history or simplify text recursively - ours is much simpler.
+
+We "cannonball" the input.
+Cannonball (verb): To ensure a prompt fits through a model window we blast a hole in the center of any inputs blocking our path to doing so.
+This starts by dissecting the input as tokens and delete from the middle-out bi-directionally until the prompt window is satisfied.
+You may think: "Doesn't this result in massive data loss?" - yes & no.
+Under the use cases we expect the tool to be used, which is mostly chatting with documents, we are able to use this approach with minimal blowback
+on the quality of responses.
+
+We accomplish this by taking a rate-limit approach that is proportional to the model's capacity. Since we support more than OpenAI models, this needs to
+be generic, and relying on a "better summary" model is not a luxury we can afford. The added latency overhead during prompting is also unacceptable.
+In general:
+  system: at most 15% of token capacity
+  history: at most 15% of token capacity
+  prompt: at most 70% of token capacity.
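+For example (hypothetical): with a 4,096 token window (gpt-3.5-turbo) those caps work out to roughly
+614 tokens for the system prompt, 614 for history, and 2,867 for the user prompt.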
+
+We handle overflows by taking an aggressive path for two main cases.
+
+1. Very large user prompt
+- Likely uninterested in context, history, or even the system prompt. This is a "standalone" prompt that hijacks the whole thread.
+- We run this prompt on its own since a prompt that is over 70% of the context window is certainly standalone.
+
+2. Context window is exceeded in regular use.
+- We do not touch the prompt since it is very likely to be <70% of the window.
+- We check that the system prompt is not outrageous - if it is, we cannonball it and keep the context if present.
+- We check a sliding window of history, only allowing it to occupy up to 15% of the window if it fits, with a
+preference for recent history if we can cannonball it to fit, otherwise it is omitted.
+
+We end up with a rather large prompt that fits through a given window with plenty of room for a response in most use-cases.
+We also take the approach that history is the least important and most flexible item in this array of messages.
+
+There is a supplemental version of this function (messageStringCompressor) that instead returns a formatted prompt string for completion models like Claude-2.
+*/
+
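+// (Illustrative) Typical call from an LLM provider's compressMessages() implementation:
+//   const messages = await messageArrayCompressor(this, this.constructPrompt(promptArgs), rawHistory);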
+async function messageArrayCompressor(llm, messages = [], rawHistory = []) {
+  // Assume the response will need at least 600 tokens. If the prompt + buffer exceeds the window we need to proactively
+  // run the compressor to ensure the prompt leaves enough space for the reply.
+  // Realistically - most users will not be impacted by this.
+  const tokenBuffer = 600;
+  const tokenManager = new TokenManager(llm.model);
+  // If no work needs to be done, just pass through.
+  if (tokenManager.statsFrom(messages) + tokenBuffer < llm.promptWindowLimit())
+    return messages;
+
+  const system = messages.shift();
+  const user = messages.pop();
+  const userPromptSize = tokenManager.countFromString(user.content);
+
+  // User prompt is the main focus here - we we prioritize it and allow
+  // it to highjack the entire conversation thread. We are going to
+  // cannonball the prompt through to ensure the reply has at least 20% of
+  // the token supply to reply with.
+  if (userPromptSize > llm.limits.user) {
+    return [
+      {
+        role: "user",
+        content: cannonball({
+          input: user.content,
+          targetTokenSize: llm.promptWindowLimit() * 0.8,
+          tiktokenInstance: tokenManager,
+        }),
+      },
+    ];
+  }
+
+  const compressedSystem = new Promise(async (resolve) => {
+    const count = tokenManager.countFromString(system.content);
+    if (count < llm.limits.system) {
+      resolve(system);
+      return;
+    }
+
+    // Split context from system prompt - cannonball since it's over the window.
+    // We assume the context + user prompt will still fit within the window.
+    const [prompt, context = ""] = system.content.split("Context:");
+    system.content = `${cannonball({
+      input: prompt,
+      targetTokenSize: llm.limits.system,
+      tiktokenInstance: tokenManager,
+    })}${context ? `\nContext: ${context}` : ""}`;
+    resolve(system);
+  });
+
+  // Prompt is allowed to take up to 70% of the window - we know it's under
+  // that if we are here, so pass it through.
+  const compressedPrompt = new Promise(async (resolve) => resolve(user));
+
+  // We always aggressively compress history because it is the least
+  // important data to retain in full-fidelity.
+  const compressedHistory = new Promise((resolve) => {
+    const eligibleHistoryItems = [];
+    var historyTokenCount = 0;
+
+    for (const [i, history] of rawHistory.reverse().entries()) {
+      const [user, assistant] = convertToPromptHistory([history]);
+      const [userTokens, assistantTokens] = [
+        tokenManager.countFromString(user.content),
+        tokenManager.countFromString(assistant.content),
+      ];
+      const total = userTokens + assistantTokens;
+
+      // If during the loop the token cost of adding this history item
+      // is small, we can add it to history and move on to the next.
+      if (historyTokenCount + total < llm.limits.history) {
+        eligibleHistoryItems.unshift(user, assistant);
+        historyTokenCount += total;
+        continue;
+      }
+
+      // If we reach here, the overhead of adding this history item would
+      // push us over the limit. So now we prioritize
+      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
+      // trying to make history work.
+      if (i > 2) break;
+
+      // We are over the limit and we are within the 3 most recent chats,
+      // so now we cannonball them to make them fit into the window.
+      // max size = llm.limits.history; Each component of the message pair can be at most
+      // 50% of the history budget. We cannonball whichever is the problem.
+      // The math isn't perfect for tokens, so we add a fudge factor for safety.
+      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
+      if (userTokens > maxTargetSize) {
+        user.content = cannonball({
+          input: user.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      if (assistantTokens > maxTargetSize) {
+        assistant.content = cannonball({
+          input: assistant.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      const newTotal = tokenManager.statsFrom([user, assistant]);
+      if (historyTokenCount + newTotal > llm.limits.history) continue;
+      eligibleHistoryItems.unshift(user, assistant);
+      historyTokenCount += newTotal;
+    }
+    resolve(eligibleHistoryItems);
+  });
+
+  const [cSystem, cHistory, cPrompt] = await Promise.all([
+    compressedSystem,
+    compressedHistory,
+    compressedPrompt,
+  ]);
+  return [cSystem, ...cHistory, cPrompt];
+}
+
+// Implementation of messageArrayCompressor, but for string-only completion models.
+async function messageStringCompressor(llm, promptArgs = {}, rawHistory = []) {
+  const tokenBuffer = 600;
+  const tokenManager = new TokenManager(llm.model);
+  const initialPrompt = llm.constructPrompt(promptArgs);
+  if (
+    tokenManager.statsFrom(initialPrompt) + tokenBuffer <
+    llm.promptWindowLimit()
+  )
+    return initialPrompt;
+
+  const system = promptArgs.systemPrompt;
+  const user = promptArgs.userPrompt;
+  const userPromptSize = tokenManager.countFromString(user);
+
+  // User prompt is the main focus here - we prioritize it and allow
+  // it to hijack the entire conversation thread. We are going to
+  // cannonball the prompt through to ensure the reply has at least 20% of
+  // the token supply to reply with.
+  if (userPromptSize > llm.limits.user) {
+    return llm.constructPrompt({
+      userPrompt: cannonball({
+        input: user,
+        targetTokenSize: llm.promptWindowLimit() * 0.8,
+        tiktokenInstance: tokenManager,
+      }),
+    });
+  }
+
+  const compressedSystem = new Promise(async (resolve) => {
+    const count = tokenManager.countFromString(system);
+    if (count < llm.limits.system) {
+      resolve(system);
+      return;
+    }
+    resolve(
+      cannonball({
+        input: system,
+        targetTokenSize: llm.limits.system,
+        tiktokenInstance: tokenManager,
+      })
+    );
+  });
+
+  // Prompt is allowed to take up to 70% of the window - we know it's under
+  // that if we are here, so pass it through.
+  const compressedPrompt = new Promise(async (resolve) => resolve(user));
+
+  // We always aggressively compress history because it is the least
+  // important data to retain in full-fidelity.
+  const compressedHistory = new Promise((resolve) => {
+    const eligibleHistoryItems = [];
+    var historyTokenCount = 0;
+
+    for (const [i, history] of rawHistory.reverse().entries()) {
+      const [user, assistant] = convertToPromptHistory([history]);
+      const [userTokens, assistantTokens] = [
+        tokenManager.countFromString(user.content),
+        tokenManager.countFromString(assistant.content),
+      ];
+      const total = userTokens + assistantTokens;
+
+      // If during the loop the token cost of adding this history item
+      // is small, we can add it to history and move on to the next.
+      if (historyTokenCount + total < llm.limits.history) {
+        eligibleHistoryItems.unshift(user, assistant);
+        historyTokenCount += total;
+        continue;
+      }
+
+      // If we reach here, the overhead of adding this history item would
+      // push us over the limit. So now we prioritize
+      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
+      // trying to make history work.
+      if (i > 2) break;
+
+      // We are over the limit and we are within the 3 most recent chats,
+      // so now we cannonball them to make them fit into the window.
+      // max size = llm.limits.history; Each component of the message pair can be at most
+      // 50% of the history budget. We cannonball whichever is the problem.
+      // The math isn't perfect for tokens, so we add a fudge factor for safety.
+      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
+      if (userTokens > maxTargetSize) {
+        user.content = cannonball({
+          input: user.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      if (assistantTokens > maxTargetSize) {
+        assistant.content = cannonball({
+          input: assistant.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      const newTotal = tokenManager.statsFrom([user, assistant]);
+      if (historyTokenCount + newTotal > llm.limits.history) continue;
+      eligibleHistoryItems.unshift(user, assistant);
+      historyTokenCount += newTotal;
+    }
+    resolve(eligibleHistoryItems);
+  });
+
+  const [cSystem, cHistory, cPrompt] = await Promise.all([
+    compressedSystem,
+    compressedHistory,
+    compressedPrompt,
+  ]);
+
+  return llm.constructPrompt({
+    systemPrompt: cSystem,
+    contextTexts: promptArgs?.contextTexts || [],
+    chatHistory: cHistory,
+    userPrompt: cPrompt,
+  });
+}
+
+// Cannonball prompting: aka where we shoot a proportionally big cannonball through a proportionally large prompt.
+// Nobody should be sending prompts this big, but there is no reason not to allow it if the results remain good even when doing so.
+function cannonball({
+  input = "",
+  targetTokenSize = 0,
+  tiktokenInstance = null,
+  ellipsesStr = null,
+}) {
+  if (!input || !targetTokenSize) return input;
+  const tokenManager = tiktokenInstance || new TokenManager();
+  const truncText = ellipsesStr || "\n\n--prompt truncated for brevity--\n\n";
+  const initialInputSize = tokenManager.countFromString(input);
+  if (initialInputSize < targetTokenSize) return input;
+
+  // The delta is the token difference between where our prompt currently is in size
+  // and where we ideally need to land.
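+  // For example (hypothetical): a 5,000 token input with a 3,000 token target gives a delta of
+  // 2,000, so roughly 1,000 tokens are removed on each side of the midpoint.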
+  const delta = initialInputSize - targetTokenSize;
+  const tokenChunks = tokenManager.tokensFromString(input);
+  const middleIdx = Math.floor(tokenChunks.length / 2);
+
+  // middle truncate the text going left and right of midpoint
+  const leftChunks = tokenChunks.slice(0, middleIdx - Math.round(delta / 2));
+  const rightChunks = tokenChunks.slice(middleIdx + Math.round(delta / 2));
+  const truncatedText =
+    tokenManager.bytesFromTokens(leftChunks) +
+    truncText +
+    tokenManager.bytesFromTokens(rightChunks);
+
+  console.log(
+    `Cannonball results ${initialInputSize} -> ${tokenManager.countFromString(
+      truncatedText
+    )} tokens.`
+  );
+  return truncatedText;
+}
+
+module.exports = {
+  messageArrayCompressor,
+  messageStringCompressor,
+};
diff --git a/server/utils/helpers/tiktoken.js b/server/utils/helpers/tiktoken.js
new file mode 100644
index 0000000000000000000000000000000000000000..ad1cdd444374b4d3cf11bcd981fd866317e9a4cb
--- /dev/null
+++ b/server/utils/helpers/tiktoken.js
@@ -0,0 +1,57 @@
+const { getEncodingNameForModel, getEncoding } = require("js-tiktoken");
+
+class TokenManager {
+  constructor(model = "gpt-3.5-turbo") {
+    this.model = model;
+    this.encoderName = this.getEncodingFromModel(model);
+    this.encoder = getEncoding(this.encoderName);
+    this.buffer = 50;
+  }
+
+  getEncodingFromModel(model) {
+    try {
+      return getEncodingNameForModel(model);
+    } catch {
+      return "cl100k_base";
+    }
+  }
+
+  tokensFromString(input = "") {
+    const tokens = this.encoder.encode(input);
+    return tokens;
+  }
+
+  bytesFromTokens(tokens = []) {
+    const bytes = this.encoder.decode(tokens);
+    return bytes;
+  }
+
+  countFromString(input = "") {
+    const tokens = this.encoder.encode(input);
+    return tokens.length;
+  }
+
+  statsFrom(input) {
+    if (typeof input === "string") return this.countFromString(input);
+
+    // What is going on here?
+    // https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb Item 6.
+    // The only option is to estimate. From repeated testing with the static values in the code we are always off by 2,
+    // which means that as of Nov 1, 2023 the additional factor on ln: 476 changed from 3 to 5.
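+    // For example (hypothetical): two messages whose contents total 100 tokens estimate to
+    // 2 * 3 + 100 + 5 = 111 tokens.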
+    if (Array.isArray(input)) {
+      const perMessageFactorTokens = input.length * 3;
+      const tokensFromContent = input.reduce(
+        (a, b) => a + this.countFromString(b.content),
+        0
+      );
+      const diffCoefficient = 5;
+      return perMessageFactorTokens + tokensFromContent + diffCoefficient;
+    }
+
+    throw new Error("Not a supported tokenized format.");
+  }
+}
+
+module.exports = {
+  TokenManager,
+};
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index 9cfb243fffec70c05200ab7083f3884a78d367fe..976849d923db2df07e09c15243856c51270a356b 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -17,6 +17,10 @@ const KEY_MAPPING = {
     envKey: "AZURE_OPENAI_ENDPOINT",
     checks: [isNotEmpty, validAzureURL],
   },
+  AzureOpenAiTokenLimit: {
+    envKey: "AZURE_OPENAI_TOKEN_LIMIT",
+    checks: [validOpenAiTokenLimit],
+  },
   AzureOpenAiKey: {
     envKey: "AZURE_OPENAI_KEY",
     checks: [isNotEmpty],
@@ -137,7 +141,7 @@ function supportedLLM(input = "") {
 }
 
 function validAnthropicModel(input = "") {
-  const validModels = ["claude-2"];
+  const validModels = ["claude-2", "claude-instant-1"];
   return validModels.includes(input)
     ? null
     : `Invalid Model type. Must be one of ${validModels.join(", ")}.`;
@@ -174,6 +178,14 @@ function validAzureURL(input = "") {
   }
 }
 
+function validOpenAiTokenLimit(input = "") {
+  const tokenLimit = Number(input);
+  if (isNaN(tokenLimit)) return "Token limit is not a number";
+  if (![4_096, 16_384, 8_192, 32_768].includes(tokenLimit))
+    return "Invalid OpenAI token limit.";
+  return null;
+}
+
 function requiresForceMode(_, forceModeEnabled = false) {
   return forceModeEnabled === true ? null : "Cannot set this setting.";
 }
diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js
index fdc4cbe4103fbfaffb67cc0ca737e4281b55660e..2bdb0133d5801ef2d00d94bb967801f8e609cf25 100644
--- a/server/utils/vectorDbProviders/chroma/index.js
+++ b/server/utils/vectorDbProviders/chroma/index.js
@@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const { toChunks, getLLMProvider } = require("../../helpers");
-const { chatPrompt } = require("../../chats");
 
 const Chroma = {
   name: "Chroma",
@@ -253,92 +252,35 @@ const Chroma = {
     await DocumentVectors.deleteIds(indexes);
     return true;
   },
-  query: async function (reqBody = {}) {
-    const { namespace = null, input, workspace = {} } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
+  performSimilaritySearch: async function ({
+    namespace = null,
+    input = "",
+    LLMConnector = null,
+  }) {
+    if (!namespace || !input || !LLMConnector)
+      throw new Error("Invalid request to performSimilaritySearch.");
 
     const { client } = await this.connect();
     if (!(await this.namespaceExists(client, namespace))) {
       return {
-        response: null,
+        contextTexts: [],
         sources: [],
         message: "Invalid query - no documents found for workspace!",
       };
     }
 
-    const LLMConnector = getLLMProvider();
     const queryVector = await LLMConnector.embedTextInput(input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       client,
       namespace,
       queryVector
     );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-    });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
-    // When we roll out own response we have separate metadata and texts,
-    // so for source collection we need to combine them.
-    const sources = sourceDocuments.map((metadata, i) => {
-      return { metadata: { ...metadata, text: contextTexts[i] } };
-    });
-    return {
-      response: responseText,
-      sources: this.curateSources(sources),
-      message: false,
-    };
-  },
-  // This implementation of chat uses the chat history and modifies the system prompt at execution
-  // this is improved over the regular langchain implementation so that chats do not directly modify embeddings
-  // because then multi-user support will have all conversations mutating the base vector collection to which then
-  // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
-  chat: async function (reqBody = {}) {
-    const {
-      namespace = null,
-      input,
-      workspace = {},
-      chatHistory = [],
-    } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
-
-    const { client } = await this.connect();
-    if (!(await this.namespaceExists(client, namespace))) {
-      return {
-        response: null,
-        sources: [],
-        message: "Invalid query - no documents found for workspace!",
-      };
-    }
-
-    const LLMConnector = getLLMProvider();
-    const queryVector = await LLMConnector.embedTextInput(input);
-    const { contextTexts, sourceDocuments } = await this.similarityResponse(
-      client,
-      namespace,
-      queryVector
-    );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-      chatHistory,
-    });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
 
-    // When we roll out own response we have separate metadata and texts,
-    // so for source collection we need to combine them.
     const sources = sourceDocuments.map((metadata, i) => {
       return { metadata: { ...metadata, text: contextTexts[i] } };
     });
     return {
-      response: responseText,
+      contextTexts,
       sources: this.curateSources(sources),
       message: false,
     };
diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js
index bb150958535c1b46d676dc3c0ba346de9bae8b20..c18766a84dd4641417b2f99d2098b604be8686a6 100644
--- a/server/utils/vectorDbProviders/lance/index.js
+++ b/server/utils/vectorDbProviders/lance/index.js
@@ -4,7 +4,6 @@ const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { chatPrompt } = require("../../chats");
 
 const LanceDb = {
   uri: `${
@@ -226,83 +225,36 @@ const LanceDb = {
       return false;
     }
   },
-  query: async function (reqBody = {}) {
-    const { namespace = null, input, workspace = {} } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
+  performSimilaritySearch: async function ({
+    namespace = null,
+    input = "",
+    LLMConnector = null,
+  }) {
+    if (!namespace || !input || !LLMConnector)
+      throw new Error("Invalid request to performSimilaritySearch.");
 
     const { client } = await this.connect();
     if (!(await this.namespaceExists(client, namespace))) {
       return {
-        response: null,
+        contextTexts: [],
         sources: [],
         message: "Invalid query - no documents found for workspace!",
       };
     }
 
-    const LLMConnector = getLLMProvider();
     const queryVector = await LLMConnector.embedTextInput(input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       client,
       namespace,
       queryVector
     );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-    });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
-    return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
-      message: false,
-    };
-  },
-  // This implementation of chat uses the chat history and modifies the system prompt at execution
-  // this is improved over the regular langchain implementation so that chats do not directly modify embeddings
-  // because then multi-user support will have all conversations mutating the base vector collection to which then
-  // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
-  chat: async function (reqBody = {}) {
-    const {
-      namespace = null,
-      input,
-      workspace = {},
-      chatHistory = [],
-    } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
 
-    const { client } = await this.connect();
-    if (!(await this.namespaceExists(client, namespace))) {
-      return {
-        response: null,
-        sources: [],
-        message: "Invalid query - no documents found for workspace!",
-      };
-    }
-
-    const LLMConnector = getLLMProvider();
-    const queryVector = await LLMConnector.embedTextInput(input);
-    const { contextTexts, sourceDocuments } = await this.similarityResponse(
-      client,
-      namespace,
-      queryVector
-    );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-      chatHistory,
+    const sources = sourceDocuments.map((metadata, i) => {
+      return { metadata: { ...metadata, text: contextTexts[i] } };
     });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
     return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
+      contextTexts,
+      sources: this.curateSources(sources),
       message: false,
     };
   },
@@ -337,9 +289,13 @@ const LanceDb = {
   curateSources: function (sources = []) {
     const documents = [];
     for (const source of sources) {
-      const { text, vector: _v, score: _s, ...metadata } = source;
+      const { text, vector: _v, score: _s, ...rest } = source;
+      const metadata = rest.hasOwnProperty("metadata") ? rest.metadata : rest;
       if (Object.keys(metadata).length > 0) {
-        documents.push({ ...metadata, text });
+        documents.push({
+          ...metadata,
+          ...(text ? { text } : {}),
+        });
       }
     }
 
diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js
index fc7f4d3172ef8df85b6565e7cd0c0b5b1d45d920..f9600cf0c856fdae80587cd74ad303136613ec58 100644
--- a/server/utils/vectorDbProviders/pinecone/index.js
+++ b/server/utils/vectorDbProviders/pinecone/index.js
@@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const { toChunks, getLLMProvider } = require("../../helpers");
-const { chatPrompt } = require("../../chats");
 
 const Pinecone = {
   name: "Pinecone",
@@ -222,80 +221,33 @@ const Pinecone = {
       message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
     };
   },
-  query: async function (reqBody = {}) {
-    const { namespace = null, input, workspace = {} } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
-
-    const { pineconeIndex } = await this.connect();
-    if (!(await this.namespaceExists(pineconeIndex, namespace))) {
-      return {
-        response: null,
-        sources: [],
-        message: "Invalid query - no documents found for workspace!",
-      };
-    }
-
-    const LLMConnector = getLLMProvider();
-    const queryVector = await LLMConnector.embedTextInput(input);
-    const { contextTexts, sourceDocuments } = await this.similarityResponse(
-      pineconeIndex,
-      namespace,
-      queryVector
-    );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-    });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
-    return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
-      message: false,
-    };
-  },
-  // This implementation of chat uses the chat history and modifies the system prompt at execution
-  // this is improved over the regular langchain implementation so that chats do not directly modify embeddings
-  // because then multi-user support will have all conversations mutating the base vector collection to which then
-  // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
-  chat: async function (reqBody = {}) {
-    const {
-      namespace = null,
-      input,
-      workspace = {},
-      chatHistory = [],
-    } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
+  performSimilaritySearch: async function ({
+    namespace = null,
+    input = "",
+    LLMConnector = null,
+  }) {
+    if (!namespace || !input || !LLMConnector)
+      throw new Error("Invalid request to performSimilaritySearch.");
 
     const { pineconeIndex } = await this.connect();
     if (!(await this.namespaceExists(pineconeIndex, namespace)))
       throw new Error(
-        "Invalid namespace - has it been collected and seeded yet?"
+        "Invalid namespace - has it been collected and populated yet?"
       );
 
-    const LLMConnector = getLLMProvider();
     const queryVector = await LLMConnector.embedTextInput(input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       pineconeIndex,
       namespace,
       queryVector
     );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-      chatHistory,
-    });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
 
+    const sources = sourceDocuments.map((metadata, i) => {
+      return { ...metadata, text: contextTexts[i] };
+    });
     return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
+      contextTexts,
+      sources: this.curateSources(sources),
       message: false,
     };
   },
diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js
index 9925c6e49be8923d1004aca130e8f854a5452755..c565daa7aac03c84f97ae4913a32a2c0fe120754 100644
--- a/server/utils/vectorDbProviders/qdrant/index.js
+++ b/server/utils/vectorDbProviders/qdrant/index.js
@@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const { toChunks, getLLMProvider } = require("../../helpers");
-const { chatPrompt } = require("../../chats");
 
 const QDrant = {
   name: "QDrant",
@@ -262,83 +261,36 @@ const QDrant = {
     await DocumentVectors.deleteIds(indexes);
     return true;
   },
-  query: async function (reqBody = {}) {
-    const { namespace = null, input, workspace = {} } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
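+  /**
+   * Embed the input with the provided LLMConnector and run a similarity
+   * search against the workspace namespace. Returns the matched context
+   * texts and curated sources, or an empty result with a message when the
+   * namespace has no documents.
+   */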
+  performSimilaritySearch: async function ({
+    namespace = null,
+    input = "",
+    LLMConnector = null,
+  }) {
+    if (!namespace || !input || !LLMConnector)
+      throw new Error("Invalid request to performSimilaritySearch.");
 
     const { client } = await this.connect();
     if (!(await this.namespaceExists(client, namespace))) {
       return {
-        response: null,
+        contextTexts: [],
         sources: [],
         message: "Invalid query - no documents found for workspace!",
       };
     }
 
-    const LLMConnector = getLLMProvider();
     const queryVector = await LLMConnector.embedTextInput(input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       client,
       namespace,
       queryVector
     );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-    });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
-    return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
-      message: false,
-    };
-  },
-  // This implementation of chat uses the chat history and modifies the system prompt at execution
-  // this is improved over the regular langchain implementation so that chats do not directly modify embeddings
-  // because then multi-user support will have all conversations mutating the base vector collection to which then
-  // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
-  chat: async function (reqBody = {}) {
-    const {
-      namespace = null,
-      input,
-      workspace = {},
-      chatHistory = [],
-    } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
 
-    const { client } = await this.connect();
-    if (!(await this.namespaceExists(client, namespace))) {
-      return {
-        response: null,
-        sources: [],
-        message: "Invalid query - no documents found for workspace!",
-      };
-    }
-
-    const LLMConnector = getLLMProvider();
-    const queryVector = await LLMConnector.embedTextInput(input);
-    const { contextTexts, sourceDocuments } = await this.similarityResponse(
-      client,
-      namespace,
-      queryVector
-    );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-      chatHistory,
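+    // Attach the matching context text to each source's metadata so callers
+    // receive both in a single object.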
+    const sources = sourceDocuments.map((metadata, i) => {
+      return { ...metadata, text: contextTexts[i] };
     });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
     return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
+      contextTexts,
+      sources: this.curateSources(sources),
       message: false,
     };
   },
@@ -377,8 +329,11 @@ const QDrant = {
     const documents = [];
     for (const source of sources) {
       if (Object.keys(source).length > 0) {
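+        // Sources may arrive with their payload nested under `metadata` or
+        // as a flat object; normalize to the flat shape before collecting.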
+        const metadata = source.hasOwnProperty("metadata")
+          ? source.metadata
+          : source;
         documents.push({
-          ...source,
+          ...metadata,
         });
       }
     }
diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js
index 1a43e3c5f59027de85e01e373f6ce1876710a485..052ad58617a9e72074720fa839eba3759bba8318 100644
--- a/server/utils/vectorDbProviders/weaviate/index.js
+++ b/server/utils/vectorDbProviders/weaviate/index.js
@@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const { toChunks, getLLMProvider } = require("../../helpers");
-const { chatPrompt } = require("../../chats");
 const { camelCase } = require("../../helpers/camelcase");
 
 const Weaviate = {
@@ -333,83 +332,36 @@ const Weaviate = {
     await DocumentVectors.deleteIds(indexes);
     return true;
   },
-  query: async function (reqBody = {}) {
-    const { namespace = null, input, workspace = {} } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
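+  /**
+   * Embed the input with the provided LLMConnector and run a similarity
+   * search against the workspace namespace. Returns the matched context
+   * texts and curated sources, or an empty result with a message when the
+   * namespace has no documents.
+   */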
+  performSimilaritySearch: async function ({
+    namespace = null,
+    input = "",
+    LLMConnector = null,
+  }) {
+    if (!namespace || !input || !LLMConnector)
+      throw new Error("Invalid request to performSimilaritySearch.");
 
     const { client } = await this.connect();
     if (!(await this.namespaceExists(client, namespace))) {
       return {
-        response: null,
+        contextTexts: [],
         sources: [],
         message: "Invalid query - no documents found for workspace!",
       };
     }
 
-    const LLMConnector = getLLMProvider();
     const queryVector = await LLMConnector.embedTextInput(input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       client,
       namespace,
       queryVector
     );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-    });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
-    return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
-      message: false,
-    };
-  },
-  // This implementation of chat uses the chat history and modifies the system prompt at execution
-  // this is improved over the regular langchain implementation so that chats do not directly modify embeddings
-  // because then multi-user support will have all conversations mutating the base vector collection to which then
-  // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
-  chat: async function (reqBody = {}) {
-    const {
-      namespace = null,
-      input,
-      workspace = {},
-      chatHistory = [],
-    } = reqBody;
-    if (!namespace || !input) throw new Error("Invalid request body");
 
-    const { client } = await this.connect();
-    if (!(await this.namespaceExists(client, namespace))) {
-      return {
-        response: null,
-        sources: [],
-        message: "Invalid query - no documents found for workspace!",
-      };
-    }
-
-    const LLMConnector = getLLMProvider();
-    const queryVector = await LLMConnector.embedTextInput(input);
-    const { contextTexts, sourceDocuments } = await this.similarityResponse(
-      client,
-      namespace,
-      queryVector
-    );
-    const memory = LLMConnector.constructPrompt({
-      systemPrompt: chatPrompt(workspace),
-      contextTexts: contextTexts,
-      userPrompt: input,
-      chatHistory,
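+    // Attach the matching context text to each source's metadata so callers
+    // receive both in a single object.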
+    const sources = sourceDocuments.map((metadata, i) => {
+      return { ...metadata, text: contextTexts[i] };
     });
-    const responseText = await LLMConnector.getChatCompletion(memory, {
-      temperature: workspace?.openAiTemp ?? 0.7,
-    });
-
     return {
-      response: responseText,
-      sources: this.curateSources(sourceDocuments),
+      contextTexts,
+      sources: this.curateSources(sources),
       message: false,
     };
   },
@@ -445,7 +397,10 @@ const Weaviate = {
     const documents = [];
     for (const source of sources) {
       if (Object.keys(source).length > 0) {
-        documents.push(source);
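+        // Sources may arrive with their payload nested under `metadata` or
+        // as a flat object; normalize to the flat shape before collecting.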
+        const metadata = source.hasOwnProperty("metadata")
+          ? source.metadata
+          : source;
+        documents.push({ ...metadata });
       }
     }
 
diff --git a/server/yarn.lock b/server/yarn.lock
index 01479024e8e4790470ce83845151f60db83e312e..3226f9f54062de406f0d48d115326a474683101e 100644
--- a/server/yarn.lock
+++ b/server/yarn.lock
@@ -1556,7 +1556,7 @@ isomorphic-fetch@^3.0.0:
     node-fetch "^2.6.1"
     whatwg-fetch "^3.4.1"
 
-js-tiktoken@^1.0.6:
+js-tiktoken@^1.0.6, js-tiktoken@^1.0.7:
   version "1.0.7"
   resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5"
   integrity sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==