diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index dbd61623db17eaa92e60dcf6bb79426b6b8f5342..0a5ed65fc85b5345a7429dbd025523a9f09c5d6b 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -35,6 +35,9 @@ const GeneralTranscriptionPreference = lazy( const GeneralEmbeddingPreference = lazy( () => import("@/pages/GeneralSettings/EmbeddingPreference") ); +const EmbeddingTextSplitterPreference = lazy( + () => import("@/pages/GeneralSettings/EmbeddingTextSplitterPreference") +); const GeneralVectorDatabase = lazy( () => import("@/pages/GeneralSettings/VectorDatabase") ); @@ -86,6 +89,12 @@ export default function App() { path="/settings/embedding-preference" element={<AdminRoute Component={GeneralEmbeddingPreference} />} /> + <Route + path="/settings/text-splitter-preference" + element={ + <AdminRoute Component={EmbeddingTextSplitterPreference} /> + } + /> <Route path="/settings/vector-database" element={<AdminRoute Component={GeneralVectorDatabase} />} diff --git a/frontend/src/components/LLMSelection/LMStudioOptions/index.jsx b/frontend/src/components/LLMSelection/LMStudioOptions/index.jsx index c94a99d7871f9e61ee4f8efa686df4b6140f6217..9a1c59bc73f1c5ff4880a1de1a9779d4cdc89095 100644 --- a/frontend/src/components/LLMSelection/LMStudioOptions/index.jsx +++ b/frontend/src/components/LLMSelection/LMStudioOptions/index.jsx @@ -21,7 +21,7 @@ export default function LMStudioOptions({ settings, showAlert = false }) { </p> </div> <a - href={paths.settings.embeddingPreference()} + href={paths.settings.embedder.modelPreference()} className="text-sm md:text-base my-2 underline" > Manage embedding → diff --git a/frontend/src/components/LLMSelection/LocalAiOptions/index.jsx b/frontend/src/components/LLMSelection/LocalAiOptions/index.jsx index 36b2f2588ecc41a861bee1f0c58163007445b6d4..1304c9e1ba573ec8f7cd4d4a21435010d404f9fd 100644 --- a/frontend/src/components/LLMSelection/LocalAiOptions/index.jsx +++ b/frontend/src/components/LLMSelection/LocalAiOptions/index.jsx @@ -21,7 +21,7 @@ export default function LocalAiOptions({ settings, showAlert = false }) { </p> </div> <a - href={paths.settings.embeddingPreference()} + href={paths.settings.embedder.modelPreference()} className="text-sm md:text-base my-2 underline" > Manage embedding → diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx index 40450d4e19a47b195f184211ab7b9a82563d6c37..67797d266190570339b0a6e64f14213828085eb5 100644 --- a/frontend/src/components/SettingsSidebar/index.jsx +++ b/frontend/src/components/SettingsSidebar/index.jsx @@ -20,6 +20,7 @@ import { Barcode, ClosedCaptioning, EyeSlash, + SplitVertical, } from "@phosphor-icons/react"; import useUser from "@/hooks/useUser"; import { USER_BACKGROUND_COLOR } from "@/utils/constants"; @@ -288,12 +289,25 @@ const SidebarOptions = ({ user = null }) => ( allowedRole={["admin"]} /> <Option - href={paths.settings.embeddingPreference()} - btnText="Embedding Model" + href={paths.settings.embedder.modelPreference()} + childLinks={[paths.settings.embedder.chunkingPreference()]} + btnText="Embedder Preferences" icon={<FileCode className="h-5 w-5 flex-shrink-0" />} user={user} flex={true} allowedRole={["admin"]} + subOptions={ + <> + <Option + href={paths.settings.embedder.chunkingPreference()} + btnText="Text Splitter & Chunking" + icon={<SplitVertical className="h-5 w-5 flex-shrink-0" />} + user={user} + flex={true} + allowedRole={["admin"]} + /> + </> + } /> <Option href={paths.settings.vectorDatabase()} diff --git 
a/frontend/src/pages/GeneralSettings/EmbeddingTextSplitterPreference/index.jsx b/frontend/src/pages/GeneralSettings/EmbeddingTextSplitterPreference/index.jsx new file mode 100644 index 0000000000000000000000000000000000000000..5ee1197f117ce3cc70ca2ec3a2083c19a56f9247 --- /dev/null +++ b/frontend/src/pages/GeneralSettings/EmbeddingTextSplitterPreference/index.jsx @@ -0,0 +1,180 @@ +import React, { useEffect, useState } from "react"; +import Sidebar from "@/components/SettingsSidebar"; +import { isMobile } from "react-device-detect"; +import PreLoader from "@/components/Preloader"; +import CTAButton from "@/components/lib/CTAButton"; +import Admin from "@/models/admin"; +import showToast from "@/utils/toast"; +import { nFormatter, numberWithCommas } from "@/utils/numbers"; + +function isNullOrNaN(value) { + if (value === null) return true; + return isNaN(value); +} + +export default function EmbeddingTextSplitterPreference() { + const [settings, setSettings] = useState({}); + const [loading, setLoading] = useState(true); + const [saving, setSaving] = useState(false); + const [hasChanges, setHasChanges] = useState(false); + + const handleSubmit = async (e) => { + e.preventDefault(); + const form = new FormData(e.target); + + if ( + Number(form.get("text_splitter_chunk_overlap")) >= + Number(form.get("text_splitter_chunk_size")) + ) { + showToast( + "Chunk overlap cannot be larger than or equal to chunk size.", + "error" + ); + return; + } + + setSaving(true); + await Admin.updateSystemPreferences({ + text_splitter_chunk_size: isNullOrNaN( + form.get("text_splitter_chunk_size") + ) + ? 1000 + : Number(form.get("text_splitter_chunk_size")), + text_splitter_chunk_overlap: isNullOrNaN( + form.get("text_splitter_chunk_overlap") + ) + ? 20 + : Number(form.get("text_splitter_chunk_overlap")), + }); + setSaving(false); + setHasChanges(false); + showToast("Text chunking strategy settings saved.", "success"); + }; + + useEffect(() => { + async function fetchSettings() { + const _settings = (await Admin.systemPreferences())?.settings; + setSettings(_settings ?? {}); + setLoading(false); + } + fetchSettings(); + }, []); + + return ( + <div className="w-screen h-screen overflow-hidden bg-sidebar flex"> + <Sidebar /> + {loading ? ( + <div + style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }} + className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll" + > + <div className="w-full h-full flex justify-center items-center"> + <PreLoader /> + </div> + </div> + ) : ( + <div + style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }} + className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll" + > + <form + onSubmit={handleSubmit} + onChange={() => setHasChanges(true)} + className="flex w-full" + > + <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16"> + <div className="w-full flex flex-col gap-y-1 pb-4 border-white border-b-2 border-opacity-10"> + <div className="flex gap-x-4 items-center"> + <p className="text-lg leading-6 font-bold text-white"> + Text splitting & Chunking Preferences + </p> + </div> + <p className="text-xs leading-[18px] font-base text-white text-opacity-60"> + Sometimes, you may want to change the default way that new + documents are split and chunked before being inserted into + your vector database. <br /> + You should only modify this setting if you understand how text + splitting works and its side effects. 
+ </p> + <p className="text-xs leading-[18px] font-semibold text-white/80"> + Changes here will only apply to{" "} + <i>newly embedded documents</i>, not existing documents. + </p> + </div> + <div className="w-full justify-end flex"> + {hasChanges && ( + <CTAButton className="mt-3 mr-0 -mb-14 z-10"> + {saving ? "Saving..." : "Save changes"} + </CTAButton> + )} + </div> + + <div className="flex flex-col gap-y-4 mt-8"> + <div className="flex flex-col max-w-[300px]"> + <div className="flex flex-col gap-y-2 mb-4"> + <label className="text-white text-sm font-semibold block"> + Text Chunk Size + </label> + <p className="text-xs text-white/60"> + This is the maximum length of characters that can be + present in a single vector. + </p> + </div> + <input + type="number" + name="text_splitter_chunk_size" + min={1} + max={settings?.max_embed_chunk_size || 1000} + onWheel={(e) => e?.currentTarget?.blur()} + className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5" + placeholder="maximum length of vectorized text" + defaultValue={ + isNullOrNaN(settings?.text_splitter_chunk_size) + ? 1000 + : Number(settings?.text_splitter_chunk_size) + } + required={true} + autoComplete="off" + /> + <p className="text-xs text-white/40"> + Embed model maximum length is{" "} + {numberWithCommas(settings?.max_embed_chunk_size || 1000)}. + </p> + </div> + </div> + + <div className="flex flex-col gap-y-4 mt-8"> + <div className="flex flex-col max-w-[300px]"> + <div className="flex flex-col gap-y-2 mb-4"> + <label className="text-white text-sm font-semibold block"> + Text Chunk Overlap + </label> + <p className="text-xs text-white/60"> + This is the maximum overlap of characters that occurs + during chunking between two adjacent text chunks. + </p> + </div> + <input + type="number" + name="text_splitter_chunk_overlap" + min={0} + onWheel={(e) => e?.currentTarget?.blur()} + className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5" + placeholder="maximum length of vectorized text" + defaultValue={ + isNullOrNaN(settings?.text_splitter_chunk_overlap) + ? 
20 + : Number(settings?.text_splitter_chunk_overlap) + } + required={true} + autoComplete="off" + /> + </div> + </div> + </div> + </form> + </div> + )} + </div> + ); +} diff --git a/frontend/src/utils/paths.js b/frontend/src/utils/paths.js index af0331a9200c07298b9591e1a4794b262a072fe0..ffbf04c0c73bfec4b4d151bfe565aa8fa138ff39 100644 --- a/frontend/src/utils/paths.js +++ b/frontend/src/utils/paths.js @@ -98,6 +98,10 @@ export default { transcriptionPreference: () => { return "/settings/transcription-preference"; }, + embedder: { + modelPreference: () => "/settings/embedding-preference", + chunkingPreference: () => "/settings/text-splitter-preference", + }, embeddingPreference: () => { return "/settings/embedding-preference"; }, diff --git a/server/endpoints/admin.js b/server/endpoints/admin.js index 34bd66c3fecfa51232f85c991b91ce33e5a81888..4bf816a04731d30d8e711f883e9290e258593669 100644 --- a/server/endpoints/admin.js +++ b/server/endpoints/admin.js @@ -8,7 +8,10 @@ const { User } = require("../models/user"); const { DocumentVectors } = require("../models/vectors"); const { Workspace } = require("../models/workspace"); const { WorkspaceChats } = require("../models/workspaceChats"); -const { getVectorDbClass } = require("../utils/helpers"); +const { + getVectorDbClass, + getEmbeddingEngineSelection, +} = require("../utils/helpers"); const { validRoleSelection, canModifyAdmin, @@ -311,6 +314,7 @@ function adminEndpoints(app) { } ); + // TODO: Allow specification of which props to get instead of returning all of them all the time. app.get( "/admin/system-preferences", [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])], @@ -333,6 +337,16 @@ function adminEndpoints(app) { support_email: (await SystemSettings.get({ label: "support_email" }))?.value || null, + text_splitter_chunk_size: + (await SystemSettings.get({ label: "text_splitter_chunk_size" })) + ?.value || + getEmbeddingEngineSelection()?.embeddingMaxChunkLength || + null, + text_splitter_chunk_overlap: + (await SystemSettings.get({ label: "text_splitter_chunk_overlap" })) + ?.value || null, + max_embed_chunk_size: + getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1000, }; response.status(200).json({ settings }); } catch (e) { diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js index 080a01f08727982edc698badfff06f01d0650a23..604e43073ffbce1d17d4964b6da5bf17ad620011 100644 --- a/server/models/systemSettings.js +++ b/server/models/systemSettings.js @@ -5,6 +5,11 @@ process.env.NODE_ENV === "development" const { isValidUrl } = require("../utils/http"); const prisma = require("../utils/prisma"); +function isNullOrNaN(value) { + if (value === null) return true; + return isNaN(value); +} + const SystemSettings = { protectedFields: ["multi_user_mode"], supportedFields: [ @@ -15,6 +20,8 @@ const SystemSettings = { "telemetry_id", "footer_data", "support_email", + "text_splitter_chunk_size", + "text_splitter_chunk_overlap", ], validations: { footer_data: (updates) => { @@ -28,6 +35,32 @@ const SystemSettings = { return JSON.stringify([]); } }, + text_splitter_chunk_size: (update) => { + try { + if (isNullOrNaN(update)) throw new Error("Value is not a number."); + if (Number(update) <= 0) throw new Error("Value must be non-zero."); + return Number(update); + } catch (e) { + console.error( + `Failed to run validation function on text_splitter_chunk_size`, + e.message + ); + return 1000; + } + }, + text_splitter_chunk_overlap: (update) => { + try { + if (isNullOrNaN(update)) throw new 
Error("Value is not a number"); + if (Number(update) < 0) throw new Error("Value cannot be less than 0."); + return Number(update); + } catch (e) { + console.error( + `Failed to run validation function on text_splitter_chunk_overlap`, + e.message + ); + return 20; + } + }, }, currentSettings: async function () { const llmProvider = process.env.LLM_PROVIDER; @@ -84,6 +117,15 @@ const SystemSettings = { } }, + getValueOrFallback: async function (clause = {}, fallback = null) { + try { + return (await this.get(clause))?.value ?? fallback; + } catch (error) { + console.error(error.message); + return fallback; + } + }, + where: async function (clause = {}, limit) { try { const settings = await prisma.system_settings.findMany({ diff --git a/server/utils/EmbeddingEngines/azureOpenAi/index.js b/server/utils/EmbeddingEngines/azureOpenAi/index.js index 4193e860d1e05ee601c6361c827162a2cfc4503f..1f9362c95d97222c231b107fd8e186af7994b018 100644 --- a/server/utils/EmbeddingEngines/azureOpenAi/index.js +++ b/server/utils/EmbeddingEngines/azureOpenAi/index.js @@ -17,7 +17,9 @@ class AzureOpenAiEmbedder { // Limit of how many strings we can process in a single pass to stay with resource or network limits // https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request this.maxConcurrentChunks = 16; - this.embeddingMaxChunkLength = 1_000; + + // https://learn.microsoft.com/en-us/answers/questions/1188074/text-embedding-ada-002-token-context-length + this.embeddingMaxChunkLength = 2048; } async embedTextInput(textInput) { diff --git a/server/utils/EmbeddingEngines/openAi/index.js b/server/utils/EmbeddingEngines/openAi/index.js index b52e78c6f433654bab8857fe056c6ea4deb75210..49841343a61058e7747062ebda55290fa1e74561 100644 --- a/server/utils/EmbeddingEngines/openAi/index.js +++ b/server/utils/EmbeddingEngines/openAi/index.js @@ -13,7 +13,9 @@ class OpenAiEmbedder { // Limit of how many strings we can process in a single pass to stay with resource or network limits this.maxConcurrentChunks = 500; - this.embeddingMaxChunkLength = 1_000; + + // https://platform.openai.com/docs/guides/embeddings/embedding-models + this.embeddingMaxChunkLength = 8_191; } async embedTextInput(textInput) { diff --git a/server/utils/TextSplitter/index.js b/server/utils/TextSplitter/index.js new file mode 100644 index 0000000000000000000000000000000000000000..d7829827ca0ec1c2936e55ce35be95a8eceb22cc --- /dev/null +++ b/server/utils/TextSplitter/index.js @@ -0,0 +1,84 @@ +function isNullOrNaN(value) { + if (value === null) return true; + return isNaN(value); +} + +class TextSplitter { + #splitter; + constructor(config = {}) { + /* + config can be a ton of things depending on what is required or optional by the specific splitter. + Non-splitter related keys + { + splitByFilename: string, // TODO + } + ------ + Default: "RecursiveCharacterTextSplitter" + Config: { + chunkSize: number, + chunkOverlap: number, + } + ------ + */ + this.config = config; + this.#splitter = this.#setSplitter(config); + } + + log(text, ...args) { + console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args); + } + + // Does a quick check to determine the text chunk length limit. 
+ // Embedder models have hard-set limits that cannot be exceeded, just like an LLM context + // so here we want to allow override of the default 1000, but up to the models maximum, which is + // sometimes user defined. + static determineMaxChunkSize(preferred = null, embedderLimit = 1000) { + const prefValue = isNullOrNaN(preferred) + ? Number(embedderLimit) + : Number(preferred); + const limit = Number(embedderLimit); + if (prefValue > limit) + console.log( + `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.` + ); + return prefValue > limit ? limit : prefValue; + } + + #setSplitter(config = {}) { + // if (!config?.splitByFilename) {// TODO do something when specific extension is present? } + return new RecursiveSplitter({ + chunkSize: isNaN(config?.chunkSize) ? 1_000 : Number(config?.chunkSize), + chunkOverlap: isNaN(config?.chunkOverlap) + ? 20 + : Number(config?.chunkOverlap), + }); + } + + async splitText(documentText) { + return this.#splitter._splitText(documentText); + } +} + +// Wrapper for Langchain default RecursiveCharacterTextSplitter class. +class RecursiveSplitter { + constructor({ chunkSize, chunkOverlap }) { + const { + RecursiveCharacterTextSplitter, + } = require("langchain/text_splitter"); + this.log(`Will split with`, { chunkSize, chunkOverlap }); + this.engine = new RecursiveCharacterTextSplitter({ + chunkSize, + chunkOverlap, + }); + } + + log(text, ...args) { + console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args); + } + + async _splitText(documentText) { + return this.engine.splitText(documentText); + } +} + +module.exports.TextSplitter = TextSplitter; diff --git a/server/utils/vectorDbProviders/astra/index.js b/server/utils/vectorDbProviders/astra/index.js index df983d4f488a393375fc3f8c88cb07af17f5e224..b6f8981bb19b674e5fad746538643467e5f42ef5 100644 --- a/server/utils/vectorDbProviders/astra/index.js +++ b/server/utils/vectorDbProviders/astra/index.js @@ -1,5 +1,5 @@ const { AstraDB: AstraClient } = require("@datastax/astra-db-ts"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { @@ -147,10 +147,17 @@ const AstraDB = { return { vectorized: true, error: null }; } - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js index 9e3caa7adca018dde5b498c94925aef2f299be5c..1b9cbb53a4b3b150041e936e0040d58ca35e3634 100644 --- a/server/utils/vectorDbProviders/chroma/index.js +++ b/server/utils/vectorDbProviders/chroma/index.js @@ -1,5 +1,5 @@ const { ChromaClient } = require("chromadb"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { 
storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { @@ -180,10 +180,17 @@ const Chroma = { // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments` // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js index ecf10007f5100763fa9b45dbafb37546023679fd..f2fc8eee10fececee9fca475cfb4bdc1f464bd64 100644 --- a/server/utils/vectorDbProviders/lance/index.js +++ b/server/utils/vectorDbProviders/lance/index.js @@ -5,9 +5,10 @@ const { getEmbeddingEngineSelection, } = require("../../helpers"); const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); +const { SystemSettings } = require("../../../models/systemSettings"); const LanceDb = { uri: `${ @@ -180,10 +181,17 @@ const LanceDb = { // We have to do this manually as opposed to using LangChains `xyz.fromDocuments` // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. 
- const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/milvus/index.js b/server/utils/vectorDbProviders/milvus/index.js index a304e8714e56eb65295f37a138b28e0b075e1e14..3bd5be6d6498414090e3223c4110e07e98b54368 100644 --- a/server/utils/vectorDbProviders/milvus/index.js +++ b/server/utils/vectorDbProviders/milvus/index.js @@ -4,7 +4,7 @@ const { IndexType, MilvusClient, } = require("@zilliz/milvus2-sdk-node"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { v4: uuidv4 } = require("uuid"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { @@ -182,10 +182,17 @@ const Milvus = { return { vectorized: true, error: null }; } - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js index b8f288c06198569d2b82349fb5755ac8d974e1ef..efcecddc9d4a726e9e58d270970aa26ebb3d3e66 100644 --- a/server/utils/vectorDbProviders/pinecone/index.js +++ b/server/utils/vectorDbProviders/pinecone/index.js @@ -1,5 +1,5 @@ const { Pinecone } = require("@pinecone-database/pinecone"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { @@ -125,10 +125,17 @@ const PineconeDB = { // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. 
// https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167 - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js index e7e00fe64d3e3e8207d91fcc2df5b8840b376161..aaca51118f506c603cbd2691353708d7754dd863 100644 --- a/server/utils/vectorDbProviders/qdrant/index.js +++ b/server/utils/vectorDbProviders/qdrant/index.js @@ -1,5 +1,5 @@ const { QdrantClient } = require("@qdrant/js-client-rest"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { @@ -198,10 +198,17 @@ const QDrant = { // We have to do this manually as opposed to using LangChains `Qdrant.fromDocuments` // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js index 13668303f9a520b613ec3dcbf23c596c01816fb8..35112327b4f10a3a8c041a1596c72419ece3ffef 100644 --- a/server/utils/vectorDbProviders/weaviate/index.js +++ b/server/utils/vectorDbProviders/weaviate/index.js @@ -1,5 +1,5 @@ const { default: weaviate } = require("weaviate-ts-client"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { v4: uuidv4 } = require("uuid"); const { @@ -241,10 +241,17 @@ const Weaviate = { // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments` // because we then cannot atomically control our namespace to granularly find/remove documents // from vectordb. 
- const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/zilliz/index.js b/server/utils/vectorDbProviders/zilliz/index.js index be0e9e7d40d5d5174e1713c19bc1ee0564cb9369..72e6829d9a539947055205ad21e365786894eeba 100644 --- a/server/utils/vectorDbProviders/zilliz/index.js +++ b/server/utils/vectorDbProviders/zilliz/index.js @@ -4,7 +4,7 @@ const { IndexType, MilvusClient, } = require("@zilliz/milvus2-sdk-node"); -const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); +const { TextSplitter } = require("../../TextSplitter"); const { v4: uuidv4 } = require("uuid"); const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { @@ -183,10 +183,17 @@ const Zilliz = { return { vectorized: true, error: null }; } - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: - getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, - chunkOverlap: 20, + const textSplitter = new TextSplitter({ + chunkSize: TextSplitter.determineMaxChunkSize( + await SystemSettings.getValueOrFallback({ + label: "text_splitter_chunk_size", + }), + getEmbeddingEngineSelection()?.embeddingMaxChunkLength + ), + chunkOverlap: await SystemSettings.getValueOrFallback( + { label: "text_splitter_chunk_overlap" }, + 20 + ), }); const textChunks = await textSplitter.splitText(pageContent);
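Note for reviewers: below is a minimal, illustrative usage sketch (not part of this diff) of the chunk-size/overlap resolution pattern that every vector DB provider above now shares. It assumes the same relative require paths used inside server/utils/vectorDbProviders/*; the helper name splitForEmbedding is hypothetical, while TextSplitter, TextSplitter.determineMaxChunkSize, SystemSettings.getValueOrFallback, and getEmbeddingEngineSelection are the APIs added or used in this changeset.

// Illustrative sketch only — mirrors the provider hunks above.
const { TextSplitter } = require("../../TextSplitter");
const { SystemSettings } = require("../../../models/systemSettings");
const { getEmbeddingEngineSelection } = require("../../helpers");

// Hypothetical helper showing the shared pattern each provider now follows.
async function splitForEmbedding(pageContent) {
  const textSplitter = new TextSplitter({
    // Clamp the stored user preference to the active embedder's hard limit.
    chunkSize: TextSplitter.determineMaxChunkSize(
      await SystemSettings.getValueOrFallback({
        label: "text_splitter_chunk_size",
      }),
      getEmbeddingEngineSelection()?.embeddingMaxChunkLength
    ),
    // Fall back to 20 characters of overlap when no preference is saved.
    chunkOverlap: await SystemSettings.getValueOrFallback(
      { label: "text_splitter_chunk_overlap" },
      20
    ),
  });
  return await textSplitter.splitText(pageContent);
}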