diff --git a/docker/.env.example b/docker/.env.example index e61b157c05f92861c989310e1481740648038c3a..650fad8d6d44d034d71eca5c5a5198de01b8b056 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -47,6 +47,7 @@ GID='1000' # EMBEDDING_ENGINE='localai' # EMBEDDING_BASE_PATH='https://localhost:8080/v1' # EMBEDDING_MODEL_PREF='text-embedding-ada-002' +# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max chunk size in chars a string to embed can be ########################################### ######## Vector Database Selection ######## diff --git a/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx b/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx index 73b8a5df892e5db72ab57d15bcdd8c06f81d137a..232c33205fb07f706d9b58082cf3618cb7994b00 100644 --- a/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx +++ b/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx @@ -30,6 +30,22 @@ export default function LocalAiOptions({ settings }) { /> </div> <LocalAIModelSelection settings={settings} basePath={basePath} /> + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + Max embedding chunk length + </label> + <input + type="number" + name="EmbeddingModelMaxChunkLength" + className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5" + placeholder="1000" + min={1} + onScroll={(e) => e.target.blur()} + defaultValue={settings?.EmbeddingModelMaxChunkLength} + required={false} + autoComplete="off" + /> + </div> </> ); } diff --git a/server/.env.example b/server/.env.example index 4127f9357d99b3140d9076ed543e13e3960b3102..242eec265ef4595671a66340e99492da126f13a0 100644 --- a/server/.env.example +++ b/server/.env.example @@ -44,6 +44,7 @@ JWT_SECRET="my-random-string-for-seeding" # Please generate random string at lea # EMBEDDING_ENGINE='localai' # EMBEDDING_BASE_PATH='https://localhost:8080/v1' # 
EMBEDDING_MODEL_PREF='text-embedding-ada-002' +# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max chunk size in chars a string to embed can be ########################################### ######## Vector Database Selection ######## diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js index f1ab5fb9c6d2a4247cfc266d1cc85c3da3b1371c..66f7108bd838497d187446c368f74b54cf15b79e 100644 --- a/server/models/systemSettings.js +++ b/server/models/systemSettings.js @@ -27,6 +27,8 @@ const SystemSettings = { EmbeddingEngine: process.env.EMBEDDING_ENGINE, EmbeddingBasePath: process.env.EMBEDDING_BASE_PATH, EmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF, + EmbeddingModelMaxChunkLength: + process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH, ...(vectorDB === "pinecone" ? { PineConeEnvironment: process.env.PINECONE_ENVIRONMENT, diff --git a/server/utils/EmbeddingEngines/azureOpenAi/index.js b/server/utils/EmbeddingEngines/azureOpenAi/index.js index 554538fad39ca65459d2cacd93bc1389a116b88f..8959b00070dfdf451d7f1b83e43ff95db9ba181e 100644 --- a/server/utils/EmbeddingEngines/azureOpenAi/index.js +++ b/server/utils/EmbeddingEngines/azureOpenAi/index.js @@ -16,7 +16,7 @@ class AzureOpenAiEmbedder { // The maximum amount of "inputs" that OpenAI API can process in a single call. // https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request - this.embeddingChunkLimit = 16; + this.embeddingMaxChunkLength = 16; } async embedTextInput(textInput) { @@ -34,9 +34,9 @@ class AzureOpenAiEmbedder { // Because there is a limit on how many chunks can be sent at once to Azure OpenAI // we concurrently execute each max batch of text chunks possible. - // Refer to constructor embeddingChunkLimit for more info. 
+ // Refer to constructor embeddingMaxChunkLength for more info. const embeddingRequests = []; - for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) { + for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) { embeddingRequests.push( new Promise((resolve) => { this.openai diff --git a/server/utils/EmbeddingEngines/localAi/index.js b/server/utils/EmbeddingEngines/localAi/index.js index d4b37d3997b9d122d147546317f448155026ec2e..aa36b5d13309c8226dac69c20e59fa9f7db753dd 100644 --- a/server/utils/EmbeddingEngines/localAi/index.js +++ b/server/utils/EmbeddingEngines/localAi/index.js @@ -1,4 +1,4 @@ -const { toChunks } = require("../../helpers"); +const { toChunks, maximumChunkLength } = require("../../helpers"); class LocalAiEmbedder { constructor() { @@ -12,8 +12,8 @@ class LocalAiEmbedder { }); this.openai = new OpenAIApi(config); - // Arbitrary limit to ensure we stay within reasonable POST request size. - this.embeddingChunkLimit = 1_000; + // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size. 
+ this.embeddingMaxChunkLength = maximumChunkLength(); } async embedTextInput(textInput) { @@ -23,7 +23,7 @@ class LocalAiEmbedder { async embedChunks(textChunks = []) { const embeddingRequests = []; - for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) { + for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) { embeddingRequests.push( new Promise((resolve) => { this.openai diff --git a/server/utils/EmbeddingEngines/native/index.js b/server/utils/EmbeddingEngines/native/index.js index 5f5e174305c10ec539a313e55b2f070a090c8f91..2081e3fdee4fe66b51ca4db0e054038f59db4a89 100644 --- a/server/utils/EmbeddingEngines/native/index.js +++ b/server/utils/EmbeddingEngines/native/index.js @@ -4,6 +4,7 @@ const { toChunks } = require("../../helpers"); class NativeEmbedder { constructor() { + // Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 this.model = "Xenova/all-MiniLM-L6-v2"; this.cacheDir = path.resolve( process.env.STORAGE_DIR @@ -12,8 +13,8 @@ class NativeEmbedder { ); this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2"); - // Limit the number of chunks to send per loop to not overload compute. - this.embeddingChunkLimit = 16; + // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size. 
+ this.embeddingMaxChunkLength = 1_000; // Make directory when it does not exist in existing installations if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir); @@ -62,7 +63,7 @@ class NativeEmbedder { async embedChunks(textChunks = []) { const Embedder = await this.embedderClient(); const embeddingResults = []; - for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) { + for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) { const output = await Embedder(chunk, { pooling: "mean", normalize: true, diff --git a/server/utils/EmbeddingEngines/openAi/index.js b/server/utils/EmbeddingEngines/openAi/index.js index 2c4c8c2494896cb18a6b9d8f6a8549f339c8d41d..8cfa235169c01080c23f53a28f46daf81c6b22cd 100644 --- a/server/utils/EmbeddingEngines/openAi/index.js +++ b/server/utils/EmbeddingEngines/openAi/index.js @@ -10,8 +10,8 @@ class OpenAiEmbedder { const openai = new OpenAIApi(config); this.openai = openai; - // Arbitrary limit to ensure we stay within reasonable POST request size. - this.embeddingChunkLimit = 1_000; + // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size. + this.embeddingMaxChunkLength = 1_000; } async embedTextInput(textInput) { @@ -22,9 +22,9 @@ class OpenAiEmbedder { async embedChunks(textChunks = []) { // Because there is a hard POST limit on how many chunks can be sent at once to OpenAI (~8mb) // we concurrently execute each max batch of text chunks possible. - // Refer to constructor embeddingChunkLimit for more info. + // Refer to constructor embeddingMaxChunkLength for more info. 
const embeddingRequests = []; - for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) { + for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) { embeddingRequests.push( new Promise((resolve) => { this.openai diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js index 41f7ad7694bf295c0feaef1650dfcd682dc9c699..3b7f4ccc241114fe130f6a724a3d6b8ab9faf72e 100644 --- a/server/utils/helpers/index.js +++ b/server/utils/helpers/index.js @@ -70,6 +70,20 @@ function getEmbeddingEngineSelection() { } } +// Some models have lower limits on the number of chars that can be encoded in a single pass. +// By default we assume a model can handle 1,000 chars, but some models only work with fewer +// chars, so here we can override that value when embedding information. +function maximumChunkLength() { + if ( + !!process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH && + !isNaN(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) && + Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) > 1 + ) + return Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH); + + return 1_000; +} + function toChunks(arr, size) { return Array.from({ length: Math.ceil(arr.length / size) }, (_v, i) => arr.slice(i * size, i * size + size) @@ -78,6 +92,7 @@ function toChunks(arr, size) { module.exports = { getEmbeddingEngineSelection, + maximumChunkLength, getVectorDbClass, getLLMProvider, toChunks, diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index 63be0c6aa0f2f8a00254ceb4d6d07256752a583e..e995fb63ff7e72701c3b5a65b99ff3e5319a329f 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -90,6 +90,10 @@ const KEY_MAPPING = { envKey: "EMBEDDING_MODEL_PREF", checks: [isNotEmpty], }, + EmbeddingModelMaxChunkLength: { + envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH", + checks: [nonZero], + }, // Vector Database Selection Settings VectorDB: { diff --git a/server/utils/vectorDbProviders/chroma/index.js 
b/server/utils/vectorDbProviders/chroma/index.js index 0e75fa07fb056a9e7acd1122a89ee4afeb04fef0..878cf05f8526cf3a5b9b0bb89810b4dcf635b287 100644 --- a/server/utils/vectorDbProviders/chroma/index.js +++ b/server/utils/vectorDbProviders/chroma/index.js @@ -2,7 +2,11 @@ const { ChromaClient } = require("chromadb"); const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); -const { toChunks, getLLMProvider } = require("../../helpers"); +const { + toChunks, + getLLMProvider, + getEmbeddingEngineSelection, +} = require("../../helpers"); const Chroma = { name: "Chroma", @@ -175,7 +179,8 @@ const Chroma = { // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: 1000, + chunkSize: + getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, chunkOverlap: 20, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js index 69adec662d369bf8a98f0bed8b6de957ac4ea49d..5e58ef1c8618b4f52190332c5c1daf98669358e0 100644 --- a/server/utils/vectorDbProviders/lance/index.js +++ b/server/utils/vectorDbProviders/lance/index.js @@ -1,5 +1,9 @@ const lancedb = require("vectordb"); -const { toChunks, getLLMProvider } = require("../../helpers"); +const { + toChunks, + getLLMProvider, + getEmbeddingEngineSelection, +} = require("../../helpers"); const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); @@ -176,7 +180,8 @@ const LanceDb = { // because we then cannot atomically control our namespace to granularly find/remove documents // from 
vectordb. const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: 1000, + chunkSize: + getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, chunkOverlap: 20, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js index 3b0e09e96622835816b32c9b706b5de4033da939..7a7f862c2319c335dcc9befb08930055d47a0e85 100644 --- a/server/utils/vectorDbProviders/pinecone/index.js +++ b/server/utils/vectorDbProviders/pinecone/index.js @@ -2,7 +2,11 @@ const { PineconeClient } = require("@pinecone-database/pinecone"); const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); -const { toChunks, getLLMProvider } = require("../../helpers"); +const { + toChunks, + getLLMProvider, + getEmbeddingEngineSelection, +} = require("../../helpers"); const Pinecone = { name: "Pinecone", @@ -130,7 +134,8 @@ const Pinecone = { // from vectordb. 
// https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167 const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: 1000, + chunkSize: + getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, chunkOverlap: 20, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js index 86d9415488b5c7df7ccee1f6bb4ebec0da081492..54f53927ab5bc50150ef4363a074e09c9c43a578 100644 --- a/server/utils/vectorDbProviders/qdrant/index.js +++ b/server/utils/vectorDbProviders/qdrant/index.js @@ -2,7 +2,11 @@ const { QdrantClient } = require("@qdrant/js-client-rest"); const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); -const { toChunks, getLLMProvider } = require("../../helpers"); +const { + toChunks, + getLLMProvider, + getEmbeddingEngineSelection, +} = require("../../helpers"); const QDrant = { name: "QDrant", @@ -174,7 +178,8 @@ const QDrant = { // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. 
const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: 1000, + chunkSize: + getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, chunkOverlap: 20, }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js index 93c63e8b95e3280d9e705edf82bb20d7640d6d23..91faff64ef54ce2f4a55839cb022d30b634bbea4 100644 --- a/server/utils/vectorDbProviders/weaviate/index.js +++ b/server/utils/vectorDbProviders/weaviate/index.js @@ -2,7 +2,11 @@ const { default: weaviate } = require("weaviate-ts-client"); const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); -const { toChunks, getLLMProvider } = require("../../helpers"); +const { + toChunks, + getLLMProvider, + getEmbeddingEngineSelection, +} = require("../../helpers"); const { camelCase } = require("../../helpers/camelcase"); const Weaviate = { @@ -237,7 +241,8 @@ const Weaviate = { // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: 1000, + chunkSize: + getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, chunkOverlap: 20, }); const textChunks = await textSplitter.splitText(pageContent);