diff --git a/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx b/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx index c2b29468c8b41b0b8563bbafa385ef0a0c9c14e9..5209410575b8e42d7a3c5de382eff16afdf6325b 100644 --- a/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx +++ b/frontend/src/components/LLMSelection/AnthropicAiOptions/index.jsx @@ -24,7 +24,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) { <div className="w-full flex items-center gap-4"> <div className="flex flex-col w-60"> <label className="text-white text-sm font-semibold block mb-4"> - Anthropic Claude-2 API Key + Anthropic API Key </label> <input type="password" @@ -48,7 +48,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) { required={true} className="bg-zinc-900 border border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" > - {["claude-2"].map((model) => { + {["claude-2", "claude-instant-1"].map((model) => { return ( <option key={model} value={model}> {model} diff --git a/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx b/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx index 99b04fa8ece1ee653bd138c1b1a4e228e8dde4b3..c319e9c6f2cfe7b4dfc11d56960f34975a32710e 100644 --- a/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx +++ b/frontend/src/components/LLMSelection/AzureAiOptions/index.jsx @@ -49,6 +49,23 @@ export default function AzureAiOptions({ settings }) { /> </div> + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + Chat Model Token Limit + </label> + <select + name="AzureOpenAiTokenLimit" + defaultValue={settings?.AzureOpenAiTokenLimit || 4096} + className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5" + required={true} + > + <option value={4096}>4,096 (gpt-3.5-turbo)</option> + <option value={16384}>16,384 (gpt-3.5-16k)</option> + <option value={8192}>8,192 (gpt-4)</option> + <option value={32768}>32,768 (gpt-4-32k)</option> + </select> + </div> + <div className="flex flex-col w-60"> <label className="text-white text-sm font-semibold block mb-4"> Embedding Deployment Name diff --git a/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx index e12e88d12fe77465aca7aa5a9a89abcb122c1489..27f3892c6750a03ac27e45b3b3b02333c02ef77e 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Settings/index.jsx @@ -224,7 +224,6 @@ export default function WorkspaceSettings({ workspace }) { </div> <textarea name="openAiPrompt" - maxLength={500} rows={5} defaultValue={chatPrompt(workspace)} className="bg-zinc-900 text-white text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5" diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx index 6ff16cdc51b81abaf9f1e21d5ddc5f636c327e5b..5d4b1f573d04d1d56c8a0393c93ce38d629c5c48 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx @@ -55,7 +55,6 @@ export default function PromptInput({ onKeyDown={captureEnter} onChange={onChange} required={true} - maxLength={240} disabled={inputDisabled} onFocus={() => 
setFocused(true)} onBlur={(e) => { diff --git a/server/endpoints/chat.js b/server/endpoints/chat.js index 5c1276244d84d79ceeaf153911fab687e0af03b0..de2a4b4557787a03f57e65a22fafa4d3f9a57b1f 100644 --- a/server/endpoints/chat.js +++ b/server/endpoints/chat.js @@ -71,6 +71,7 @@ function chatEndpoints(app) { }); response.status(200).json({ ...result }); } catch (e) { + console.error(e); response.status(500).json({ id: uuidv4(), type: "abort", diff --git a/server/models/cacheData.js b/server/models/cacheData.js new file mode 100644 index 0000000000000000000000000000000000000000..43c281d553d8eb76f499c824c3834a2cbf69f19c --- /dev/null +++ b/server/models/cacheData.js @@ -0,0 +1,69 @@ +const prisma = require("../utils/prisma"); + +const CacheData = { + new: async function (inputs = {}) { + try { + const cache = await prisma.cache_data.create({ + data: inputs, + }); + return { cache, message: null }; + } catch (error) { + console.error(error.message); + return { cache: null, message: error.message }; + } + }, + + get: async function (clause = {}, limit = null, orderBy = null) { + try { + const cache = await prisma.cache_data.findFirst({ + where: clause, + ...(limit !== null ? { take: limit } : {}), + ...(orderBy !== null ? { orderBy } : {}), + }); + return cache || null; + } catch (error) { + console.error(error.message); + return null; + } + }, + + delete: async function (clause = {}) { + try { + await prisma.cache_data.deleteMany({ + where: clause, + }); + return true; + } catch (error) { + console.error(error.message); + return false; + } + }, + + where: async function (clause = {}, limit = null, orderBy = null) { + try { + const caches = await prisma.cache_data.findMany({ + where: clause, + ...(limit !== null ? { take: limit } : {}), + ...(orderBy !== null ? 
{ orderBy } : {}), + }); + return caches; + } catch (error) { + console.error(error.message); + return []; + } + }, + + count: async function (clause = {}) { + try { + const count = await prisma.cache_data.count({ + where: clause, + }); + return count; + } catch (error) { + console.error(error.message); + return 0; + } + }, +}; + +module.exports = { CacheData }; diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js index 4d2f73b30f4332b79997b2dedcd43ec7c871e757..d15f73060b7c7d2732ab5909d0e2c89fa5dc0067 100644 --- a/server/models/systemSettings.js +++ b/server/models/systemSettings.js @@ -65,6 +65,7 @@ const SystemSettings = { AzureOpenAiKey: !!process.env.AZURE_OPENAI_KEY, AzureOpenAiModelPref: process.env.OPEN_MODEL_PREF, AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF, + AzureOpenAiTokenLimit: process.env.AZURE_OPENAI_TOKEN_LIMIT || 4096, } : {}), diff --git a/server/package.json b/server/package.json index 62879b83836fa23f5a2b1875365d147d24a08935..6bdb90aa772a258b9440f4b7ffd88711aa2b5f34 100644 --- a/server/package.json +++ b/server/package.json @@ -36,6 +36,7 @@ "express": "^4.18.2", "extract-zip": "^2.0.1", "graphql": "^16.7.1", + "js-tiktoken": "^1.0.7", "jsonwebtoken": "^8.5.1", "langchain": "^0.0.90", "mime": "^3.0.0", diff --git a/server/prisma/migrations/20231101195421_init/migration.sql b/server/prisma/migrations/20231101195421_init/migration.sql new file mode 100644 index 0000000000000000000000000000000000000000..705bca3c3c4978f19e93ff545221dd36fbc5ccf7 --- /dev/null +++ b/server/prisma/migrations/20231101195421_init/migration.sql @@ -0,0 +1,11 @@ +-- CreateTable +CREATE TABLE "cache_data" ( + "id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + "name" TEXT NOT NULL, + "data" TEXT NOT NULL, + "belongsTo" TEXT, + "byId" INTEGER, + "expiresAt" DATETIME, + "createdAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + "lastUpdatedAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP +); diff --git a/server/prisma/schema.prisma b/server/prisma/schema.prisma index 29a3fde299db535ece50bb1cd9e918e888c352ae..0f3190c97be31878edb0131490e4750b281701f6 100644 --- a/server/prisma/schema.prisma +++ b/server/prisma/schema.prisma @@ -116,3 +116,14 @@ model workspace_users { workspaces workspaces @relation(fields: [workspace_id], references: [id], onDelete: Cascade, onUpdate: Cascade) users users @relation(fields: [user_id], references: [id], onDelete: Cascade, onUpdate: Cascade) } + +model cache_data { + id Int @id @default(autoincrement()) + name String + data String + belongsTo String? + byId Int? + expiresAt DateTime? 
+ createdAt DateTime @default(now()) + lastUpdatedAt DateTime @default(now()) +} diff --git a/server/utils/AiProviders/anthropic/index.js b/server/utils/AiProviders/anthropic/index.js index d3dd68f257aac3e84139820c032c1fda4333f2c0..dca21422bd7631ccf0e700b7c1285030d8a0a4b3 100644 --- a/server/utils/AiProviders/anthropic/index.js +++ b/server/utils/AiProviders/anthropic/index.js @@ -12,6 +12,12 @@ class AnthropicLLM { apiKey: process.env.ANTHROPIC_API_KEY, }); this.anthropic = anthropic; + this.model = process.env.ANTHROPIC_MODEL_PREF; + this.limits = { + history: this.promptWindowLimit() * 0.15, + system: this.promptWindowLimit() * 0.15, + user: this.promptWindowLimit() * 0.7, + }; if (!embedder) throw new Error( @@ -21,8 +27,19 @@ class AnthropicLLM { this.answerKey = v4().split("-")[0]; } - isValidChatModel(modelName = "") { - const validModels = ["claude-2"]; + promptWindowLimit() { + switch (this.model) { + case "claude-instant-1": + return 72_000; + case "claude-2": + return 100_000; + default: + return 72_000; // assume a claude-instant-1 model + } + } + + isValidChatCompletionModel(modelName = "") { + const validModels = ["claude-2", "claude-instant-1"]; return validModels.includes(modelName); } @@ -62,24 +79,25 @@ class AnthropicLLM { \n\nAssistant:`; } - // This is the interface used when no embeddings are present in the workspace - // This is just having a conversation with the LLM as one would normally. - async sendChat(chatHistory = [], prompt, workspace = {}) { - const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2"; - if (!this.isValidChatModel(model)) + async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) { + if (!this.isValidChatCompletionModel(this.model)) throw new Error( - `Anthropic chat: ${model} is not valid for chat completion!` + `Anthropic chat: ${this.model} is not valid for chat completion!` ); + const compressedPrompt = await this.compressMessages( + { + systemPrompt: chatPrompt(workspace), + userPrompt: prompt, + chatHistory, + }, + rawHistory + ); const { content, error } = await this.anthropic.completions .create({ - model: "claude-2", + model: this.model, max_tokens_to_sample: 300, - prompt: this.constructPrompt({ - systemPrompt: chatPrompt(workspace), - userPrompt: prompt, - chatHistory, - }), + prompt: compressedPrompt, }) .then((res) => { const { completion } = res; @@ -100,15 +118,14 @@ class AnthropicLLM { } async getChatCompletion(prompt = "", _opts = {}) { - const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2"; - if (!this.isValidChatModel(model)) + if (!this.isValidChatCompletionModel(this.model)) throw new Error( - `Anthropic chat: ${model} is not valid for chat completion!` + `Anthropic chat: ${this.model} is not valid for chat completion!` ); const { content, error } = await this.anthropic.completions .create({ - model: "claude-2", + model: this.model, max_tokens_to_sample: 300, prompt, }) @@ -130,6 +147,16 @@ class AnthropicLLM { return content; } + async compressMessages(promptArgs = {}, rawHistory = []) { + const { messageStringCompressor } = require("../../helpers/chat"); + const compressedPrompt = await messageStringCompressor( + this, + promptArgs, + rawHistory + ); + return compressedPrompt; + } + // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations async embedTextInput(textInput) { return await this.embedder.embedTextInput(textInput); diff --git a/server/utils/AiProviders/azureOpenAi/index.js b/server/utils/AiProviders/azureOpenAi/index.js index 
6c450c5d38e0e75a39586ebe95f5d74a7b58e5f2..30059035df1f1e09323739cab9af889a5499017d 100644 --- a/server/utils/AiProviders/azureOpenAi/index.js +++ b/server/utils/AiProviders/azureOpenAi/index.js @@ -1,4 +1,5 @@ const { AzureOpenAiEmbedder } = require("../../EmbeddingEngines/azureOpenAi"); +const { chatPrompt } = require("../../chats"); class AzureOpenAiLLM extends AzureOpenAiEmbedder { constructor() { @@ -13,9 +14,24 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder { process.env.AZURE_OPENAI_ENDPOINT, new AzureKeyCredential(process.env.AZURE_OPENAI_KEY) ); + this.model = process.env.OPEN_MODEL_PREF; + this.limits = { + history: this.promptWindowLimit() * 0.15, + system: this.promptWindowLimit() * 0.15, + user: this.promptWindowLimit() * 0.7, + }; + } + + // Sure the user selected a proper value for the token limit + // could be any of these https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-models + // and if undefined - assume it is the lowest end. + promptWindowLimit() { + return !!process.env.AZURE_OPENAI_TOKEN_LIMIT + ? Number(process.env.AZURE_OPENAI_TOKEN_LIMIT) + : 4096; } - isValidChatModel(_modelName = "") { + isValidChatCompletionModel(_modelName = "") { // The Azure user names their "models" as deployments and they can be any name // so we rely on the user to put in the correct deployment as only they would // know it. @@ -31,7 +47,7 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder { const prompt = { role: "system", content: `${systemPrompt} - Context: +Context: ${contextTexts .map((text, i) => { return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`; @@ -46,26 +62,25 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder { return { safe: true, reasons: [] }; } - async sendChat(chatHistory = [], prompt, workspace = {}) { - const model = process.env.OPEN_MODEL_PREF; - if (!model) + async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) { + if (!this.model) throw new Error( "No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5." ); + const messages = await this.compressMessages( + { + systemPrompt: chatPrompt(workspace), + userPrompt: prompt, + chatHistory, + }, + rawHistory + ); const textResponse = await this.openai - .getChatCompletions( - model, - [ - { role: "system", content: "" }, - ...chatHistory, - { role: "user", content: prompt }, - ], - { - temperature: Number(workspace?.openAiTemp ?? 0.7), - n: 1, - } - ) + .getChatCompletions(this.model, messages, { + temperature: Number(workspace?.openAiTemp ?? 0.7), + n: 1, + }) .then((res) => { if (!res.hasOwnProperty("choices")) throw new Error("OpenAI chat: No results!"); @@ -83,18 +98,23 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder { } async getChatCompletion(messages = [], { temperature = 0.7 }) { - const model = process.env.OPEN_MODEL_PREF; - if (!model) + if (!this.model) throw new Error( "No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5." 
); - const data = await this.openai.getChatCompletions(model, messages, { + const data = await this.openai.getChatCompletions(this.model, messages, { temperature, }); if (!data.hasOwnProperty("choices")) return null; return data.choices[0].message.content; } + + async compressMessages(promptArgs = {}, rawHistory = []) { + const { messageArrayCompressor } = require("../../helpers/chat"); + const messageArray = this.constructPrompt(promptArgs); + return await messageArrayCompressor(this, messageArray, rawHistory); + } } module.exports = { diff --git a/server/utils/AiProviders/openAi/index.js b/server/utils/AiProviders/openAi/index.js index 1efaa7466ef61f64355623050eac358ab483c6c4..91c11592f7fb23196a25755a8db9df66a55b6097 100644 --- a/server/utils/AiProviders/openAi/index.js +++ b/server/utils/AiProviders/openAi/index.js @@ -1,4 +1,5 @@ const { OpenAiEmbedder } = require("../../EmbeddingEngines/openAi"); +const { chatPrompt } = require("../../chats"); class OpenAiLLM extends OpenAiEmbedder { constructor() { @@ -10,6 +11,23 @@ class OpenAiLLM extends OpenAiEmbedder { apiKey: process.env.OPEN_AI_KEY, }); this.openai = new OpenAIApi(config); + this.model = process.env.OPEN_MODEL_PREF; + this.limits = { + history: this.promptWindowLimit() * 0.15, + system: this.promptWindowLimit() * 0.15, + user: this.promptWindowLimit() * 0.7, + }; + } + + promptWindowLimit() { + switch (this.model) { + case "gpt-3.5-turbo": + return 4096; + case "gpt-4": + return 8192; + default: + return 4096; // assume a fine-tune 3.5 + } } async isValidChatCompletionModel(modelName = "") { @@ -33,7 +51,7 @@ class OpenAiLLM extends OpenAiEmbedder { const prompt = { role: "system", content: `${systemPrompt} - Context: +Context: ${contextTexts .map((text, i) => { return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`; @@ -75,7 +93,7 @@ class OpenAiLLM extends OpenAiEmbedder { return { safe: false, reasons }; } - async sendChat(chatHistory = [], prompt, workspace = {}) { + async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) { const model = process.env.OPEN_MODEL_PREF; if (!(await this.isValidChatCompletionModel(model))) throw new Error( @@ -87,11 +105,14 @@ class OpenAiLLM extends OpenAiEmbedder { model, temperature: Number(workspace?.openAiTemp ?? 
0.7), n: 1, - messages: [ - { role: "system", content: "" }, - ...chatHistory, - { role: "user", content: prompt }, - ], + messages: await this.compressMessages( + { + systemPrompt: chatPrompt(workspace), + userPrompt: prompt, + chatHistory, + }, + rawHistory + ), }) .then((json) => { const res = json.data; @@ -111,14 +132,13 @@ class OpenAiLLM extends OpenAiEmbedder { } async getChatCompletion(messages = null, { temperature = 0.7 }) { - const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo"; - if (!(await this.isValidChatCompletionModel(model))) + if (!(await this.isValidChatCompletionModel(this.model))) throw new Error( - `OpenAI chat: ${model} is not valid for chat completion!` + `OpenAI chat: ${this.model} is not valid for chat completion!` ); const { data } = await this.openai.createChatCompletion({ - model, + model: this.model, messages, temperature, }); @@ -126,6 +146,12 @@ class OpenAiLLM extends OpenAiEmbedder { if (!data.hasOwnProperty("choices")) return null; return data.choices[0].message.content; } + + async compressMessages(promptArgs = {}, rawHistory = []) { + const { messageArrayCompressor } = require("../../helpers/chat"); + const messageArray = this.constructPrompt(promptArgs); + return await messageArrayCompressor(this, messageArray, rawHistory); + } } module.exports = { diff --git a/server/utils/chats/index.js b/server/utils/chats/index.js index 77d413323b80bc2ace772f5c87575db32be1f83b..b2c8b8d3e11c02e61b60a2e71e4068980aea9487 100644 --- a/server/utils/chats/index.js +++ b/server/utils/chats/index.js @@ -91,91 +91,146 @@ async function chatWithWorkspace( const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug); const embeddingsCount = await VectorDb.namespaceCount(workspace.slug); if (!hasVectorizedSpace || embeddingsCount === 0) { - const rawHistory = ( - user - ? await WorkspaceChats.forWorkspaceByUser( - workspace.id, - user.id, - messageLimit, - { id: "desc" } - ) - : await WorkspaceChats.forWorkspace(workspace.id, messageLimit, { - id: "desc", - }) - ).reverse(); - const chatHistory = convertToPromptHistory(rawHistory); - const response = await LLMConnector.sendChat( - chatHistory, - message, - workspace - ); - const data = { text: response, sources: [], type: "chat" }; - - await WorkspaceChats.new({ - workspaceId: workspace.id, - prompt: message, - response: data, + // If there are no embeddings - chat like a normal LLM chat interface. + return await emptyEmbeddingChat({ + uuid, user, + message, + workspace, + messageLimit, + LLMConnector, }); + } + + const { rawHistory, chatHistory } = await recentChatHistory( + user, + workspace, + messageLimit, + chatMode + ); + const { + contextTexts = [], + sources = [], + message: error, + } = await VectorDb.performSimilaritySearch({ + namespace: workspace.slug, + input: message, + LLMConnector, + }); + + // Failed similarity search. + if (!!error) { return { id: uuid, - type: "textResponse", - textResponse: response, + type: "abort", + textResponse: null, sources: [], close: true, - error: null, + error, }; - } else { - const rawHistory = ( - user - ? 
await WorkspaceChats.forWorkspaceByUser( - workspace.id, - user.id, - messageLimit, - { id: "desc" } - ) - : await WorkspaceChats.forWorkspace(workspace.id, messageLimit, { - id: "desc", - }) - ).reverse(); - const chatHistory = convertToPromptHistory(rawHistory); - const { - response, - sources, - message: error, - } = await VectorDb[chatMode]({ - namespace: workspace.slug, - input: message, - workspace, + } + + // Compress message to ensure prompt passes token limit with room for response + // and build system messages based on inputs and history. + const messages = await LLMConnector.compressMessages( + { + systemPrompt: chatPrompt(workspace), + userPrompt: message, + contextTexts, chatHistory, - }); - if (!response) { - return { - id: uuid, - type: "abort", - textResponse: null, - sources: [], - close: true, - error, - }; - } + }, + rawHistory + ); - const data = { text: response, sources, type: chatMode }; - await WorkspaceChats.new({ - workspaceId: workspace.id, - prompt: message, - response: data, - user, - }); + // Send the text completion. + const textResponse = await LLMConnector.getChatCompletion(messages, { + temperature: workspace?.openAiTemp ?? 0.7, + }); + + if (!textResponse) { return { id: uuid, - type: "textResponse", - textResponse: response, - sources, + type: "abort", + textResponse: null, + sources: [], close: true, - error, + error: "No text completion could be completed with this input.", }; } + + await WorkspaceChats.new({ + workspaceId: workspace.id, + prompt: message, + response: { text: textResponse, sources, type: chatMode }, + user, + }); + return { + id: uuid, + type: "textResponse", + close: true, + textResponse, + sources, + error, + }; +} + +// On query we dont return message history. All other chat modes and when chatting +// with no embeddings we return history. +async function recentChatHistory( + user = null, + workspace, + messageLimit = 20, + chatMode = null +) { + if (chatMode === "query") return []; + const rawHistory = ( + user + ? await WorkspaceChats.forWorkspaceByUser( + workspace.id, + user.id, + messageLimit, + { id: "desc" } + ) + : await WorkspaceChats.forWorkspace(workspace.id, messageLimit, { + id: "desc", + }) + ).reverse(); + return { rawHistory, chatHistory: convertToPromptHistory(rawHistory) }; +} + +async function emptyEmbeddingChat({ + uuid, + user, + message, + workspace, + messageLimit, + LLMConnector, +}) { + const { rawHistory, chatHistory } = await recentChatHistory( + user, + workspace, + messageLimit + ); + const textResponse = await LLMConnector.sendChat( + chatHistory, + message, + workspace, + rawHistory + ); + await WorkspaceChats.new({ + workspaceId: workspace.id, + prompt: message, + response: { text: textResponse, sources: [], type: "chat" }, + user, + }); + return { + id: uuid, + type: "textResponse", + sources: [], + close: true, + error: null, + textResponse, + }; } function chatPrompt(workspace) { @@ -186,6 +241,7 @@ function chatPrompt(workspace) { } module.exports = { + convertToPromptHistory, convertToChatHistory, chatWithWorkspace, chatPrompt, diff --git a/server/utils/helpers/chat/index.js b/server/utils/helpers/chat/index.js new file mode 100644 index 0000000000000000000000000000000000000000..ed7eab90fc86be39639e5f3cb62884306c683abd --- /dev/null +++ b/server/utils/helpers/chat/index.js @@ -0,0 +1,325 @@ +const { convertToPromptHistory } = require("../../chats"); +const { TokenManager } = require("../tiktoken"); + +/* +What is the message Array compressor? 
+TLDR: So anyway, I started blasting (your prompts & stuff)
+
+messageArrayCompressor arose out of a need for users to be able to insert unlimited-token prompts
+and also maintain coherent history, system instructions, and context, if applicable.
+
+We took an opinionated approach that, after much back-testing, we found retains a highly coherent answer
+under most conditions a user would put this specific system through. While other systems may
+use a more advanced model to compress message history or simplify text through a recursive approach, ours is much simpler.
+
+We "cannonball" the input.
+Cannonball (verb): To ensure a prompt fits through a model window, we blast a hole in the center of any inputs blocking our path to doing so.
+This starts by dissecting the input into tokens and deleting from the middle outward, bi-directionally, until the prompt window is satisfied.
+You may think: "Doesn't this result in massive data loss?" - yes & no.
+Under the use cases we expect the tool to be used for, which is mostly chatting with documents, this approach has minimal blowback
+on the quality of responses.
+
+We accomplish this by taking a rate-limit approach that is proportional to the model capacity. Since we support more than OpenAI models, this needs to
+be generic, and relying on a "better summary" model is not a luxury we can afford. The added latency overhead during prompting is also unacceptable.
+In general:
+  system: at best 15% of token capacity
+  history: at best 15% of token capacity
+  prompt: at best 70% of token capacity.
+
+We handle overflows by taking an aggressive path for two main cases.
+
+1. Very large user prompt
+- Likely uninterested in context, history, or even the system prompt. This is a "standalone" prompt that hijacks the whole thread.
+- We run this prompt on its own since a prompt that is over 70% of the context window is certainly standalone.
+
+2. Context window is exceeded in regular use.
+- We do not touch the prompt since it is very likely to be <70% of the window.
+- We check that the system prompt is not outrageous - if it is, we cannonball it and keep the context if present.
+- We check a sliding window of history, only allowing up to 15% of the history to pass through if it fits, with a
+preference for recent history if we can cannonball it to fit; otherwise it is omitted.
+
+We end up with a rather large prompt that fits through a given window with a lot of room for response in most use-cases.
+We also take the approach that history is the least important and most flexible item in this array of messages.
+
+There is a supplemental version of this function that also returns a formatted string for models like Claude-2.
+*/
+
+async function messageArrayCompressor(llm, messages = [], rawHistory = []) {
+  // Assume the response will be at least 600 tokens. If the total prompt + reply is over the limit we need to proactively
+  // run the compressor to ensure the prompt has enough space to reply.
+  // Realistically - most users will not be impacted by this.
+  const tokenBuffer = 600;
+  const tokenManager = new TokenManager(llm.model);
+  // If no work needs to be done, just pass through.
+  if (tokenManager.statsFrom(messages) + tokenBuffer < llm.promptWindowLimit())
+    return messages;
+
+  const system = messages.shift();
+  const user = messages.pop();
+  const userPromptSize = tokenManager.countFromString(user.content);
+
+  // User prompt is the main focus here - we prioritize it and allow
+  // it to hijack the entire conversation thread. We are going to
+  // cannonball the prompt through to ensure the reply has at least 20% of
+  // the token supply to reply with.
+  if (userPromptSize > llm.limits.user) {
+    return [
+      {
+        role: "user",
+        content: cannonball({
+          input: user.content,
+          targetTokenSize: llm.promptWindowLimit() * 0.8,
+          tiktokenInstance: tokenManager,
+        }),
+      },
+    ];
+  }
+
+  const compressedSystem = new Promise(async (resolve) => {
+    const count = tokenManager.countFromString(system.content);
+    if (count < llm.limits.system) {
+      resolve(system);
+      return;
+    }
+
+    // Split context from system prompt - cannonball since it's over the window.
+    // We assume the context + user prompt is small enough to fit.
+    const [prompt, context = ""] = system.content.split("Context:");
+    system.content = `${cannonball({
+      input: prompt,
+      targetTokenSize: llm.limits.system,
+      tiktokenInstance: tokenManager,
+    })}${context ? `\nContext: ${context}` : ""}`;
+    resolve(system);
+  });
+
+  // Prompt is allowed to take up to 70% of the window - we know it's under
+  // if we are here, so pass it through.
+  const compressedPrompt = new Promise(async (resolve) => resolve(user));
+
+  // We always aggressively compress history because it is the least
+  // important data to retain in full fidelity.
+  const compressedHistory = new Promise((resolve) => {
+    const eligibleHistoryItems = [];
+    var historyTokenCount = 0;
+
+    for (const [i, history] of rawHistory.reverse().entries()) {
+      const [user, assistant] = convertToPromptHistory([history]);
+      const [userTokens, assistantTokens] = [
+        tokenManager.countFromString(user.content),
+        tokenManager.countFromString(assistant.content),
+      ];
+      const total = userTokens + assistantTokens;
+
+      // If during the loop the token cost of adding this history item
+      // is small, we can add it to history and move on to the next.
+      if (historyTokenCount + total < llm.limits.history) {
+        eligibleHistoryItems.unshift(user, assistant);
+        historyTokenCount += total;
+        continue;
+      }
+
+      // If we reach here, the overhead of adding this history item would
+      // push us over the limit. So now we prioritize only
+      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
+      // trying to make history work.
+      if (i > 2) break;
+
+      // We are over the limit and we are within the first 3 most recent chats,
+      // so now we cannonball them to make them fit into the window.
+      // Max size = llm.limits.history; each component of the message can at most
+      // be 50% of the history. We cannonball whichever is the problem.
+      // The math isn't perfect for tokens, so we have to add a fudge factor for safety.
+      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
+      if (userTokens > maxTargetSize) {
+        user.content = cannonball({
+          input: user.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      if (assistantTokens > maxTargetSize) {
+        assistant.content = cannonball({
+          input: assistant.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      const newTotal = tokenManager.statsFrom([user, assistant]);
+      if (historyTokenCount + newTotal > llm.limits.history) continue;
+      eligibleHistoryItems.unshift(user, assistant);
+      historyTokenCount += newTotal;
+    }
+    resolve(eligibleHistoryItems);
+  });
+
+  const [cSystem, cHistory, cPrompt] = await Promise.all([
+    compressedSystem,
+    compressedHistory,
+    compressedPrompt,
+  ]);
+  return [cSystem, ...cHistory, cPrompt];
+}
+
+// Implementation of messageArrayCompressor, but for string-only completion models.
+async function messageStringCompressor(llm, promptArgs = {}, rawHistory = []) {
+  const tokenBuffer = 600;
+  const tokenManager = new TokenManager(llm.model);
+  const initialPrompt = llm.constructPrompt(promptArgs);
+  if (
+    tokenManager.statsFrom(initialPrompt) + tokenBuffer <
+    llm.promptWindowLimit()
+  )
+    return initialPrompt;
+
+  const system = promptArgs.systemPrompt;
+  const user = promptArgs.userPrompt;
+  const userPromptSize = tokenManager.countFromString(user);
+
+  // User prompt is the main focus here - we prioritize it and allow
+  // it to hijack the entire conversation thread. We are going to
+  // cannonball the prompt through to ensure the reply has at least 20% of
+  // the token supply to reply with.
+  if (userPromptSize > llm.limits.user) {
+    return llm.constructPrompt({
+      userPrompt: cannonball({
+        input: user,
+        targetTokenSize: llm.promptWindowLimit() * 0.8,
+        tiktokenInstance: tokenManager,
+      }),
+    });
+  }
+
+  const compressedSystem = new Promise(async (resolve) => {
+    const count = tokenManager.countFromString(system);
+    if (count < llm.limits.system) {
+      resolve(system);
+      return;
+    }
+    resolve(
+      cannonball({
+        input: system,
+        targetTokenSize: llm.limits.system,
+        tiktokenInstance: tokenManager,
+      })
+    );
+  });
+
+  // Prompt is allowed to take up to 70% of the window - we know it's under
+  // if we are here, so pass it through.
+  const compressedPrompt = new Promise(async (resolve) => resolve(user));
+
+  // We always aggressively compress history because it is the least
+  // important data to retain in full fidelity.
+  const compressedHistory = new Promise((resolve) => {
+    const eligibleHistoryItems = [];
+    var historyTokenCount = 0;
+
+    for (const [i, history] of rawHistory.reverse().entries()) {
+      const [user, assistant] = convertToPromptHistory([history]);
+      const [userTokens, assistantTokens] = [
+        tokenManager.countFromString(user.content),
+        tokenManager.countFromString(assistant.content),
+      ];
+      const total = userTokens + assistantTokens;
+
+      // If during the loop the token cost of adding this history item
+      // is small, we can add it to history and move on to the next.
+      if (historyTokenCount + total < llm.limits.history) {
+        eligibleHistoryItems.unshift(user, assistant);
+        historyTokenCount += total;
+        continue;
+      }
+
+      // If we reach here, the overhead of adding this history item would
+      // push us over the limit. So now we prioritize only
+      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
+      // trying to make history work.
+      if (i > 2) break;
+
+      // We are over the limit and we are within the first 3 most recent chats,
+      // so now we cannonball them to make them fit into the window.
+      // Max size = llm.limits.history; each component of the message can at most
+      // be 50% of the history. We cannonball whichever is the problem.
+      // The math isn't perfect for tokens, so we have to add a fudge factor for safety.
+      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
+      if (userTokens > maxTargetSize) {
+        user.content = cannonball({
+          input: user.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      if (assistantTokens > maxTargetSize) {
+        assistant.content = cannonball({
+          input: assistant.content,
+          targetTokenSize: maxTargetSize,
+          tiktokenInstance: tokenManager,
+        });
+      }
+
+      const newTotal = tokenManager.statsFrom([user, assistant]);
+      if (historyTokenCount + newTotal > llm.limits.history) continue;
+      eligibleHistoryItems.unshift(user, assistant);
+      historyTokenCount += newTotal;
+    }
+    resolve(eligibleHistoryItems);
+  });
+
+  const [cSystem, cHistory, cPrompt] = await Promise.all([
+    compressedSystem,
+    compressedHistory,
+    compressedPrompt,
+  ]);
+
+  return llm.constructPrompt({
+    systemPrompt: cSystem,
+    contextTexts: promptArgs?.contextTexts || [],
+    chatHistory: cHistory,
+    userPrompt: cPrompt,
+  });
+}
+
+// Cannonball prompting: aka where we shoot a proportionally big cannonball through a proportionally large prompt.
+// Nobody should be sending prompts this big, but there is no reason not to allow it if the results are still good.
+function cannonball({
+  input = "",
+  targetTokenSize = 0,
+  tiktokenInstance = null,
+  ellipsesStr = null,
+}) {
+  if (!input || !targetTokenSize) return input;
+  const tokenManager = tiktokenInstance || new TokenManager();
+  const truncText = ellipsesStr || "\n\n--prompt truncated for brevity--\n\n";
+  const initialInputSize = tokenManager.countFromString(input);
+  if (initialInputSize < targetTokenSize) return input;
+
+  // The delta is the token difference between where our prompt is in size
+  // and where we ideally need to land.
+ const delta = initialInputSize - targetTokenSize; + const tokenChunks = tokenManager.tokensFromString(input); + const middleIdx = Math.floor(tokenChunks.length / 2); + + // middle truncate the text going left and right of midpoint + const leftChunks = tokenChunks.slice(0, middleIdx - Math.round(delta / 2)); + const rightChunks = tokenChunks.slice(middleIdx + Math.round(delta / 2)); + const truncatedText = + tokenManager.bytesFromTokens(leftChunks) + + truncText + + tokenManager.bytesFromTokens(rightChunks); + + console.log( + `Cannonball results ${initialInputSize} -> ${tokenManager.countFromString( + truncatedText + )} tokens.` + ); + return truncatedText; +} + +module.exports = { + messageArrayCompressor, + messageStringCompressor, +}; diff --git a/server/utils/helpers/tiktoken.js b/server/utils/helpers/tiktoken.js new file mode 100644 index 0000000000000000000000000000000000000000..ad1cdd444374b4d3cf11bcd981fd866317e9a4cb --- /dev/null +++ b/server/utils/helpers/tiktoken.js @@ -0,0 +1,57 @@ +const { getEncodingNameForModel, getEncoding } = require("js-tiktoken"); + +class TokenManager { + constructor(model = "gpt-3.5-turbo") { + this.model = model; + this.encoderName = this.getEncodingFromModel(model); + this.encoder = getEncoding(this.encoderName); + this.buffer = 50; + } + + getEncodingFromModel(model) { + try { + return getEncodingNameForModel(model); + } catch { + return "cl100k_base"; + } + } + + tokensFromString(input = "") { + const tokens = this.encoder.encode(input); + return tokens; + } + + bytesFromTokens(tokens = []) { + const bytes = this.encoder.decode(tokens); + return bytes; + } + + countFromString(input = "") { + const tokens = this.encoder.encode(input); + return tokens.length; + } + + statsFrom(input) { + if (typeof input === "string") return this.countFromString(input); + + // What is going on here? + // https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb Item 6. + // The only option is to estimate. From repeated testing using the static values in the code we are always 2 off, + // which means as of Nov 1, 2023 the additional factor on ln: 476 changed from 3 to 5. + if (Array.isArray(input)) { + const perMessageFactorTokens = input.length * 3; + const tokensFromContent = input.reduce( + (a, b) => a + this.countFromString(b.content), + 0 + ); + const diffCoefficient = 5; + return perMessageFactorTokens + tokensFromContent + diffCoefficient; + } + + throw new Error("Not a supported tokenized format."); + } +} + +module.exports = { + TokenManager, +}; diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index 9cfb243fffec70c05200ab7083f3884a78d367fe..976849d923db2df07e09c15243856c51270a356b 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -17,6 +17,10 @@ const KEY_MAPPING = { envKey: "AZURE_OPENAI_ENDPOINT", checks: [isNotEmpty, validAzureURL], }, + AzureOpenAiTokenLimit: { + envKey: "AZURE_OPENAI_TOKEN_LIMIT", + checks: [validOpenAiTokenLimit], + }, AzureOpenAiKey: { envKey: "AZURE_OPENAI_KEY", checks: [isNotEmpty], @@ -137,7 +141,7 @@ function supportedLLM(input = "") { } function validAnthropicModel(input = "") { - const validModels = ["claude-2"]; + const validModels = ["claude-2", "claude-instant-1"]; return validModels.includes(input) ? null : `Invalid Model type. 
Must be one of ${validModels.join(", ")}.`; @@ -174,6 +178,14 @@ function validAzureURL(input = "") { } } +function validOpenAiTokenLimit(input = "") { + const tokenLimit = Number(input); + if (isNaN(tokenLimit)) return "Token limit is not a number"; + if (![4_096, 16_384, 8_192, 32_768].includes(tokenLimit)) + return "Invalid OpenAI token limit."; + return null; +} + function requiresForceMode(_, forceModeEnabled = false) { return forceModeEnabled === true ? null : "Cannot set this setting."; } diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js index fdc4cbe4103fbfaffb67cc0ca737e4281b55660e..2bdb0133d5801ef2d00d94bb967801f8e609cf25 100644 --- a/server/utils/vectorDbProviders/chroma/index.js +++ b/server/utils/vectorDbProviders/chroma/index.js @@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { toChunks, getLLMProvider } = require("../../helpers"); -const { chatPrompt } = require("../../chats"); const Chroma = { name: "Chroma", @@ -253,92 +252,35 @@ const Chroma = { await DocumentVectors.deleteIds(indexes); return true; }, - query: async function (reqBody = {}) { - const { namespace = null, input, workspace = {} } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); + performSimilaritySearch: async function ({ + namespace = null, + input = "", + LLMConnector = null, + }) { + if (!namespace || !input || !LLMConnector) + throw new Error("Invalid request to performSimilaritySearch."); const { client } = await this.connect(); if (!(await this.namespaceExists(client, namespace))) { return { - response: null, + contextTexts: [], sources: [], message: "Invalid query - no documents found for workspace!", }; } - const LLMConnector = getLLMProvider(); const queryVector = await LLMConnector.embedTextInput(input); const { contextTexts, sourceDocuments } = await this.similarityResponse( client, namespace, queryVector ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 0.7, - }); - - // When we roll out own response we have separate metadata and texts, - // so for source collection we need to combine them. 
- const sources = sourceDocuments.map((metadata, i) => { - return { metadata: { ...metadata, text: contextTexts[i] } }; - }); - return { - response: responseText, - sources: this.curateSources(sources), - message: false, - }; - }, - // This implementation of chat uses the chat history and modifies the system prompt at execution - // this is improved over the regular langchain implementation so that chats do not directly modify embeddings - // because then multi-user support will have all conversations mutating the base vector collection to which then - // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs - chat: async function (reqBody = {}) { - const { - namespace = null, - input, - workspace = {}, - chatHistory = [], - } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); - - const { client } = await this.connect(); - if (!(await this.namespaceExists(client, namespace))) { - return { - response: null, - sources: [], - message: "Invalid query - no documents found for workspace!", - }; - } - - const LLMConnector = getLLMProvider(); - const queryVector = await LLMConnector.embedTextInput(input); - const { contextTexts, sourceDocuments } = await this.similarityResponse( - client, - namespace, - queryVector - ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - chatHistory, - }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 0.7, - }); - // When we roll out own response we have separate metadata and texts, - // so for source collection we need to combine them. const sources = sourceDocuments.map((metadata, i) => { return { metadata: { ...metadata, text: contextTexts[i] } }; }); return { - response: responseText, + contextTexts, sources: this.curateSources(sources), message: false, }; diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js index bb150958535c1b46d676dc3c0ba346de9bae8b20..c18766a84dd4641417b2f99d2098b604be8686a6 100644 --- a/server/utils/vectorDbProviders/lance/index.js +++ b/server/utils/vectorDbProviders/lance/index.js @@ -4,7 +4,6 @@ const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); -const { chatPrompt } = require("../../chats"); const LanceDb = { uri: `${ @@ -226,83 +225,36 @@ const LanceDb = { return false; } }, - query: async function (reqBody = {}) { - const { namespace = null, input, workspace = {} } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); + performSimilaritySearch: async function ({ + namespace = null, + input = "", + LLMConnector = null, + }) { + if (!namespace || !input || !LLMConnector) + throw new Error("Invalid request to performSimilaritySearch."); const { client } = await this.connect(); if (!(await this.namespaceExists(client, namespace))) { return { - response: null, + contextTexts: [], sources: [], message: "Invalid query - no documents found for workspace!", }; } - const LLMConnector = getLLMProvider(); const queryVector = await LLMConnector.embedTextInput(input); const { contextTexts, sourceDocuments } = await this.similarityResponse( client, namespace, queryVector ); - const memory = 
LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 0.7, - }); - - return { - response: responseText, - sources: this.curateSources(sourceDocuments), - message: false, - }; - }, - // This implementation of chat uses the chat history and modifies the system prompt at execution - // this is improved over the regular langchain implementation so that chats do not directly modify embeddings - // because then multi-user support will have all conversations mutating the base vector collection to which then - // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs - chat: async function (reqBody = {}) { - const { - namespace = null, - input, - workspace = {}, - chatHistory = [], - } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); - const { client } = await this.connect(); - if (!(await this.namespaceExists(client, namespace))) { - return { - response: null, - sources: [], - message: "Invalid query - no documents found for workspace!", - }; - } - - const LLMConnector = getLLMProvider(); - const queryVector = await LLMConnector.embedTextInput(input); - const { contextTexts, sourceDocuments } = await this.similarityResponse( - client, - namespace, - queryVector - ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - chatHistory, + const sources = sourceDocuments.map((metadata, i) => { + return { metadata: { ...metadata, text: contextTexts[i] } }; }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 0.7, - }); - return { - response: responseText, - sources: this.curateSources(sourceDocuments), + contextTexts, + sources: this.curateSources(sources), message: false, }; }, @@ -337,9 +289,13 @@ const LanceDb = { curateSources: function (sources = []) { const documents = []; for (const source of sources) { - const { text, vector: _v, score: _s, ...metadata } = source; + const { text, vector: _v, score: _s, ...rest } = source; + const metadata = rest.hasOwnProperty("metadata") ? rest.metadata : rest; if (Object.keys(metadata).length > 0) { - documents.push({ ...metadata, text }); + documents.push({ + ...metadata, + ...(text ? 
{ text } : {}), + }); } } diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js index fc7f4d3172ef8df85b6565e7cd0c0b5b1d45d920..f9600cf0c856fdae80587cd74ad303136613ec58 100644 --- a/server/utils/vectorDbProviders/pinecone/index.js +++ b/server/utils/vectorDbProviders/pinecone/index.js @@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { toChunks, getLLMProvider } = require("../../helpers"); -const { chatPrompt } = require("../../chats"); const Pinecone = { name: "Pinecone", @@ -222,80 +221,33 @@ const Pinecone = { message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`, }; }, - query: async function (reqBody = {}) { - const { namespace = null, input, workspace = {} } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); - - const { pineconeIndex } = await this.connect(); - if (!(await this.namespaceExists(pineconeIndex, namespace))) { - return { - response: null, - sources: [], - message: "Invalid query - no documents found for workspace!", - }; - } - - const LLMConnector = getLLMProvider(); - const queryVector = await LLMConnector.embedTextInput(input); - const { contextTexts, sourceDocuments } = await this.similarityResponse( - pineconeIndex, - namespace, - queryVector - ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 0.7, - }); - - return { - response: responseText, - sources: this.curateSources(sourceDocuments), - message: false, - }; - }, - // This implementation of chat uses the chat history and modifies the system prompt at execution - // this is improved over the regular langchain implementation so that chats do not directly modify embeddings - // because then multi-user support will have all conversations mutating the base vector collection to which then - // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs - chat: async function (reqBody = {}) { - const { - namespace = null, - input, - workspace = {}, - chatHistory = [], - } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); + performSimilaritySearch: async function ({ + namespace = null, + input = "", + LLMConnector = null, + }) { + if (!namespace || !input || !LLMConnector) + throw new Error("Invalid request to performSimilaritySearch."); const { pineconeIndex } = await this.connect(); if (!(await this.namespaceExists(pineconeIndex, namespace))) throw new Error( - "Invalid namespace - has it been collected and seeded yet?" + "Invalid namespace - has it been collected and populated yet?" ); - const LLMConnector = getLLMProvider(); const queryVector = await LLMConnector.embedTextInput(input); const { contextTexts, sourceDocuments } = await this.similarityResponse( pineconeIndex, namespace, queryVector ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - chatHistory, - }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 
0.7, - }); + const sources = sourceDocuments.map((metadata, i) => { + return { ...metadata, text: contextTexts[i] }; + }); return { - response: responseText, - sources: this.curateSources(sourceDocuments), + contextTexts, + sources: this.curateSources(sources), message: false, }; }, diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js index 9925c6e49be8923d1004aca130e8f854a5452755..c565daa7aac03c84f97ae4913a32a2c0fe120754 100644 --- a/server/utils/vectorDbProviders/qdrant/index.js +++ b/server/utils/vectorDbProviders/qdrant/index.js @@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { toChunks, getLLMProvider } = require("../../helpers"); -const { chatPrompt } = require("../../chats"); const QDrant = { name: "QDrant", @@ -262,83 +261,36 @@ const QDrant = { await DocumentVectors.deleteIds(indexes); return true; }, - query: async function (reqBody = {}) { - const { namespace = null, input, workspace = {} } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); + performSimilaritySearch: async function ({ + namespace = null, + input = "", + LLMConnector = null, + }) { + if (!namespace || !input || !LLMConnector) + throw new Error("Invalid request to performSimilaritySearch."); const { client } = await this.connect(); if (!(await this.namespaceExists(client, namespace))) { return { - response: null, + contextTexts: [], sources: [], message: "Invalid query - no documents found for workspace!", }; } - const LLMConnector = getLLMProvider(); const queryVector = await LLMConnector.embedTextInput(input); const { contextTexts, sourceDocuments } = await this.similarityResponse( client, namespace, queryVector ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 
0.7, - }); - - return { - response: responseText, - sources: this.curateSources(sourceDocuments), - message: false, - }; - }, - // This implementation of chat uses the chat history and modifies the system prompt at execution - // this is improved over the regular langchain implementation so that chats do not directly modify embeddings - // because then multi-user support will have all conversations mutating the base vector collection to which then - // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs - chat: async function (reqBody = {}) { - const { - namespace = null, - input, - workspace = {}, - chatHistory = [], - } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); - const { client } = await this.connect(); - if (!(await this.namespaceExists(client, namespace))) { - return { - response: null, - sources: [], - message: "Invalid query - no documents found for workspace!", - }; - } - - const LLMConnector = getLLMProvider(); - const queryVector = await LLMConnector.embedTextInput(input); - const { contextTexts, sourceDocuments } = await this.similarityResponse( - client, - namespace, - queryVector - ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - chatHistory, + const sources = sourceDocuments.map((metadata, i) => { + return { ...metadata, text: contextTexts[i] }; }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 0.7, - }); - return { - response: responseText, - sources: this.curateSources(sourceDocuments), + contextTexts, + sources: this.curateSources(sources), message: false, }; }, @@ -377,8 +329,11 @@ const QDrant = { const documents = []; for (const source of sources) { if (Object.keys(source).length > 0) { + const metadata = source.hasOwnProperty("metadata") + ? 
source.metadata + : source; documents.push({ - ...source, + ...metadata, }); } } diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js index 1a43e3c5f59027de85e01e373f6ce1876710a485..052ad58617a9e72074720fa839eba3759bba8318 100644 --- a/server/utils/vectorDbProviders/weaviate/index.js +++ b/server/utils/vectorDbProviders/weaviate/index.js @@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { toChunks, getLLMProvider } = require("../../helpers"); -const { chatPrompt } = require("../../chats"); const { camelCase } = require("../../helpers/camelcase"); const Weaviate = { @@ -333,83 +332,36 @@ const Weaviate = { await DocumentVectors.deleteIds(indexes); return true; }, - query: async function (reqBody = {}) { - const { namespace = null, input, workspace = {} } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); + performSimilaritySearch: async function ({ + namespace = null, + input = "", + LLMConnector = null, + }) { + if (!namespace || !input || !LLMConnector) + throw new Error("Invalid request to performSimilaritySearch."); const { client } = await this.connect(); if (!(await this.namespaceExists(client, namespace))) { return { - response: null, + contextTexts: [], sources: [], message: "Invalid query - no documents found for workspace!", }; } - const LLMConnector = getLLMProvider(); const queryVector = await LLMConnector.embedTextInput(input); const { contextTexts, sourceDocuments } = await this.similarityResponse( client, namespace, queryVector ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 
0.7, - }); - - return { - response: responseText, - sources: this.curateSources(sourceDocuments), - message: false, - }; - }, - // This implementation of chat uses the chat history and modifies the system prompt at execution - // this is improved over the regular langchain implementation so that chats do not directly modify embeddings - // because then multi-user support will have all conversations mutating the base vector collection to which then - // the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs - chat: async function (reqBody = {}) { - const { - namespace = null, - input, - workspace = {}, - chatHistory = [], - } = reqBody; - if (!namespace || !input) throw new Error("Invalid request body"); - const { client } = await this.connect(); - if (!(await this.namespaceExists(client, namespace))) { - return { - response: null, - sources: [], - message: "Invalid query - no documents found for workspace!", - }; - } - - const LLMConnector = getLLMProvider(); - const queryVector = await LLMConnector.embedTextInput(input); - const { contextTexts, sourceDocuments } = await this.similarityResponse( - client, - namespace, - queryVector - ); - const memory = LLMConnector.constructPrompt({ - systemPrompt: chatPrompt(workspace), - contextTexts: contextTexts, - userPrompt: input, - chatHistory, + const sources = sourceDocuments.map((metadata, i) => { + return { ...metadata, text: contextTexts[i] }; }); - const responseText = await LLMConnector.getChatCompletion(memory, { - temperature: workspace?.openAiTemp ?? 0.7, - }); - return { - response: responseText, - sources: this.curateSources(sourceDocuments), + contextTexts, + sources: this.curateSources(sources), message: false, }; }, @@ -445,7 +397,10 @@ const Weaviate = { const documents = []; for (const source of sources) { if (Object.keys(source).length > 0) { - documents.push(source); + const metadata = source.hasOwnProperty("metadata") + ? source.metadata + : source; + documents.push({ ...metadata }); } } diff --git a/server/yarn.lock b/server/yarn.lock index 01479024e8e4790470ce83845151f60db83e312e..3226f9f54062de406f0d48d115326a474683101e 100644 --- a/server/yarn.lock +++ b/server/yarn.lock @@ -1556,7 +1556,7 @@ isomorphic-fetch@^3.0.0: node-fetch "^2.6.1" whatwg-fetch "^3.4.1" -js-tiktoken@^1.0.6: +js-tiktoken@^1.0.6, js-tiktoken@^1.0.7: version "1.0.7" resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5" integrity sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==
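
For reference, here is a minimal, standalone sketch of the middle-out ("cannonball") truncation idea applied in server/utils/helpers/chat/index.js, built on the same js-tiktoken encoder this patch adds. The helper name, sample text, and the 4,096-token window are assumptions for the example only and are not part of the patch.

const { getEncoding } = require("js-tiktoken");

const encoder = getEncoding("cl100k_base");

// Middle-out truncation: keep the head and tail of the input and blast the
// overflow out of the center until the result is roughly within the target
// token budget (the truncation marker adds a handful of tokens on top).
function middleTruncate(input, targetTokenSize) {
  const tokens = encoder.encode(input);
  if (tokens.length <= targetTokenSize) return input;

  const delta = tokens.length - targetTokenSize;
  const middle = Math.floor(tokens.length / 2);
  const left = tokens.slice(0, middle - Math.ceil(delta / 2));
  const right = tokens.slice(middle + Math.ceil(delta / 2));
  return (
    encoder.decode(left) +
    "\n\n--prompt truncated for brevity--\n\n" +
    encoder.decode(right)
  );
}

// Example: squeeze an oversized prompt into ~70% of a 4,096-token window,
// mirroring the user-prompt budget used by the compressors above.
const windowLimit = 4096;
const longPrompt = "lorem ipsum dolor sit amet ".repeat(2000);
const compressed = middleTruncate(longPrompt, Math.floor(windowLimit * 0.7));
console.log(
  `${encoder.encode(longPrompt).length} -> ${encoder.encode(compressed).length} tokens`
);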
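Similarly, a small sketch of the chat-array estimate that TokenManager.statsFrom performs (per the OpenAI token-counting cookbook referenced in server/utils/helpers/tiktoken.js): roughly 3 tokens of per-message framing plus the encoded content, plus a small constant correction. The example messages below are illustrative only.

const { getEncoding } = require("js-tiktoken");

const enc = getEncoding("cl100k_base");

// Rough token estimate for an OpenAI-style chat message array:
// ~3 tokens of framing per message + content tokens + a small constant.
function estimateChatTokens(messages) {
  const perMessageOverhead = messages.length * 3;
  const contentTokens = messages.reduce(
    (sum, msg) => sum + enc.encode(msg.content).length,
    0
  );
  return perMessageOverhead + contentTokens + 5;
}

console.log(
  estimateChatTokens([
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Summarize the attached workspace documents." },
  ])
);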