diff --git a/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx b/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx
index d2846704d74048702d96be4108d833ac5b81e851..87e058827555ba437089fed0a198bc9db54bcff2 100644
--- a/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx
+++ b/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx
@@ -30,7 +30,11 @@ export default function GeminiLLMOptions({ settings }) {
           required={true}
           className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
         >
-          {["gemini-pro", "gemini-1.5-pro-latest"].map((model) => {
+          {[
+            "gemini-pro",
+            "gemini-1.5-pro-latest",
+            "gemini-1.5-flash-latest",
+          ].map((model) => {
             return (
               <option key={model} value={model}>
                 {model}
diff --git a/frontend/src/hooks/useGetProvidersModels.js b/frontend/src/hooks/useGetProvidersModels.js
index 29075c557931a138fabd378fc9a0b011ebc5a974..6687f0a7bb445f27a408cc1d9b17b1ce6cb02072 100644
--- a/frontend/src/hooks/useGetProvidersModels.js
+++ b/frontend/src/hooks/useGetProvidersModels.js
@@ -10,7 +10,7 @@ export const DISABLED_PROVIDERS = [
 ];
 const PROVIDER_DEFAULT_MODELS = {
   openai: [],
-  gemini: ["gemini-pro", "gemini-1.5-pro-latest"],
+  gemini: ["gemini-pro", "gemini-1.5-pro-latest", "gemini-1.5-flash-latest"],
   anthropic: [
     "claude-instant-1.2",
     "claude-2.0",
diff --git a/server/utils/AiProviders/gemini/index.js b/server/utils/AiProviders/gemini/index.js
index 0c2cc7697aca94d034a9a156e2cb4627e4580a9c..30c9ffa35739e9a979bd1169270705148638b53d 100644
--- a/server/utils/AiProviders/gemini/index.js
+++ b/server/utils/AiProviders/gemini/index.js
@@ -17,8 +17,12 @@ class GeminiLLM {
     this.gemini = genAI.getGenerativeModel(
       { model: this.model },
       {
-        // Gemini-1.5-pro is only available on the v1beta API.
-        apiVersion: this.model === "gemini-1.5-pro-latest" ? "v1beta" : "v1",
+        // Gemini-1.5-pro and Gemini-1.5-flash are only available on the v1beta API.
+        apiVersion:
+          this.model === "gemini-1.5-pro-latest" ||
+          this.model === "gemini-1.5-flash-latest"
+            ? "v1beta"
+            : "v1",
       }
     );
     this.limits = {
@@ -95,7 +99,11 @@ class GeminiLLM {
   }
 
   isValidChatCompletionModel(modelName = "") {
-    const validModels = ["gemini-pro", "gemini-1.5-pro-latest"];
+    const validModels = [
+      "gemini-pro",
+      "gemini-1.5-pro-latest",
+      "gemini-1.5-flash-latest",
+    ];
     return validModels.includes(modelName);
   }
 
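Note: with this change the Gemini model list appears in several places at once (the frontend dropdown above, the `PROVIDER_DEFAULT_MODELS` hook defaults, `isValidChatCompletionModel` here, and `validGeminiModel` in `updateENV.js` later in this patch). A minimal sketch of how those spots could share one definition, assuming a hypothetical `geminiModels.js` helper module that is not part of this patch:

```js
// Hypothetical shared module (not part of this patch) to keep the model lists in sync.
const GEMINI_CHAT_MODELS = [
  "gemini-pro",
  "gemini-1.5-pro-latest",
  "gemini-1.5-flash-latest",
];

// The 1.5 models are only served from the v1beta API, so the apiVersion
// ternary above could also be derived from the model name.
function geminiApiVersion(model = "") {
  return model.startsWith("gemini-1.5-") ? "v1beta" : "v1";
}

module.exports = { GEMINI_CHAT_MODELS, geminiApiVersion };
```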
"v1beta" + : "v1", } ); this.limits = { @@ -95,7 +99,11 @@ class GeminiLLM { } isValidChatCompletionModel(modelName = "") { - const validModels = ["gemini-pro", "gemini-1.5-pro-latest"]; + const validModels = [ + "gemini-pro", + "gemini-1.5-pro-latest", + "gemini-1.5-flash-latest", + ]; return validModels.includes(modelName); } diff --git a/server/utils/AiProviders/liteLLM/index.js b/server/utils/AiProviders/liteLLM/index.js index 2c7fa823c27f45d2d407a73118c2eaafa647efe5..28d0b71dc4397846617010d38db40c97dfdbc824 100644 --- a/server/utils/AiProviders/liteLLM/index.js +++ b/server/utils/AiProviders/liteLLM/index.js @@ -1,7 +1,6 @@ const { NativeEmbedder } = require("../../EmbeddingEngines/native"); const { - writeResponseChunk, - clientAbortedHandler, + handleDefaultStreamResponseV2, } = require("../../helpers/chat/responses"); class LiteLLM { @@ -113,45 +112,7 @@ class LiteLLM { } handleStream(response, stream, responseProps) { - const { uuid = uuidv4(), sources = [] } = responseProps; - - return new Promise(async (resolve) => { - let fullText = ""; - - const handleAbort = () => clientAbortedHandler(resolve, fullText); - response.on("close", handleAbort); - - for await (const chunk of stream) { - const message = chunk?.choices?.[0]; - const token = message?.delta?.content; - - if (token) { - fullText += token; - writeResponseChunk(response, { - uuid, - sources: [], - type: "textResponseChunk", - textResponse: token, - close: false, - error: false, - }); - } - - // LiteLLM does not give a finish reason in stream until the final chunk - if (message.finish_reason || message.finish_reason === "stop") { - writeResponseChunk(response, { - uuid, - sources, - type: "textResponseChunk", - textResponse: "", - close: true, - error: false, - }); - response.removeListener("close", handleAbort); - resolve(fullText); - } - } - }); + return handleDefaultStreamResponseV2(response, stream, responseProps); } // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations diff --git a/server/utils/chats/index.js b/server/utils/chats/index.js index 55e8fbe5fd6da14e0e9cb67bf654ba7de8bd8bb4..b6258c2e336cc6dd90738488dcf89e6285aca2f9 100644 --- a/server/utils/chats/index.js +++ b/server/utils/chats/index.js @@ -151,16 +151,27 @@ async function chatWithWorkspace( }; } - contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts]; + const { fillSourceWindow } = require("../helpers/chat"); + const filledSources = fillSourceWindow({ + nDocs: workspace?.topN || 4, + searchResults: vectorSearchResults.sources, + history: rawHistory, + filterIdentifiers: pinnedDocIdentifiers, + }); + + // Why does contextTexts get all the info, but sources only get current search? + // This is to give the ability of the LLM to "comprehend" a contextual response without + // populating the Citations under a response with documents the user "thinks" are irrelevant + // due to how we manage backfilling of the context to keep chats with the LLM more correct in responses. + // If a past citation was used to answer the question - that is visible in the history so it logically makes sense + // and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt. + // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keep answers highly accurate. 
diff --git a/server/utils/chats/stream.js b/server/utils/chats/stream.js
index ec8fdbfac14cdf95f32632fc4a794e429b6932a9..ced9a97109430a7b28a4215b9f17357dd14b7d1b 100644
--- a/server/utils/chats/stream.js
+++ b/server/utils/chats/stream.js
@@ -100,7 +100,6 @@ async function streamChatWithWorkspace(
     workspace,
     thread,
     messageLimit,
-    chatMode,
   });
 
   // Look for pinned documents and see if the user decided to use this feature. We will also do a vector search
@@ -157,16 +156,27 @@ async function streamChatWithWorkspace(
     return;
   }
 
-  contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
+  const { fillSourceWindow } = require("../helpers/chat");
+  const filledSources = fillSourceWindow({
+    nDocs: workspace?.topN || 4,
+    searchResults: vectorSearchResults.sources,
+    history: rawHistory,
+    filterIdentifiers: pinnedDocIdentifiers,
+  });
+
+  // Why does contextTexts get all the info, but sources only get the current search?
+  // This gives the LLM the ability to "comprehend" a contextual response without
+  // populating the Citations under a response with documents the user "thinks" are irrelevant,
+  // given how we backfill context to keep chats with the LLM more correct in responses.
+  // If a past citation was used to answer the question, it is visible in the history, so it logically makes sense
+  // and does not appear to the user that a new response used information that is otherwise irrelevant for a given prompt.
+  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keeping answers highly accurate.
+  contextTexts = [...contextTexts, ...filledSources.contextTexts];
   sources = [...sources, ...vectorSearchResults.sources];
 
-  // If in query mode and no sources are found from the vector search and no pinned documents, do not
+  // If in query mode and no context chunks are found from search, backfill, or pins - do not
   // let the LLM try to hallucinate a response or use general knowledge and exit early
-  if (
-    chatMode === "query" &&
-    sources.length === 0 &&
-    pinnedDocIdentifiers.length === 0
-  ) {
+  if (chatMode === "query" && contextTexts.length === 0) {
     writeResponseChunk(response, {
       id: uuid,
       type: "textResponse",
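The backfill in the next file depends on each stored chat row carrying its sources inside the serialized `response` column, which is also why query mode no longer skips `recentChatHistory`. A sketch of the assumed row shape, inferred from what `fillSourceWindow` reads (`safeJsonParse(chat.response).sources`) rather than taken from this patch:

```js
// Assumed, illustrative shape of one rawHistory row - only `response.sources` matters for backfill.
const exampleHistoryRow = {
  prompt: "What is anythingllm?",
  response: JSON.stringify({
    text: "AnythingLLM is ...",
    sources: [
      // backfill candidates need an `id`, a `text`, and a `score`
      // (pinned documents carry no score, so they are filtered out)
      { id: "doc-1-chunk-3", text: "AnythingLLM is an all-in-one ...", score: 0.87 },
    ],
  }),
};
```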
diff --git a/server/utils/helpers/chat/index.js b/server/utils/helpers/chat/index.js
index 84afd516cde5819c069722e1aa4380757637d52f..6f565efe14e5a67b55413c6e02a85df3ece2231f 100644
--- a/server/utils/helpers/chat/index.js
+++ b/server/utils/helpers/chat/index.js
@@ -1,3 +1,5 @@
+const { sourceIdentifier } = require("../../chats");
+const { safeJsonParse } = require("../../http");
 const { TokenManager } = require("../tiktoken");
 const { convertToPromptHistory } = require("./responses");
 
@@ -343,7 +345,104 @@ function cannonball({
   return truncatedText;
 }
 
+/**
+ * Fill the sources window with the priority of
+ * 1. Pinned documents (handled prior to this function)
+ * 2. VectorSearch results
+ * 3. prevSources in chat history - starting from the most recent.
+ *
+ * This ensures the window always has the desired number of sources so that follow-up questions
+ * in any chat mode have relevant sources, but not infinite sources. The function is used during chatting
+ * and allows follow-up questions within a query chat that would otherwise have zero sources and fail.
+ * The added benefit is that during regular RAG chat we get better coherence of citations that would otherwise
+ * also yield no results, with no need for a ReRanker to run and take much longer to return a response.
+ *
+ * The side effect is that unrelated follow-up questions now have citations that can look totally irrelevant; however,
+ * we would rather optimize for the correctness of a response than hide extraneous sources under a response. Since search
+ * results always take priority, an unrelated question that produces RAG results will still function as desired, and because
+ * sources are backfilled from previous history, "changing context" mid-chat is handled appropriately.
+ * example:
+ * ---previous implementation---
+ * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
+ * prompt 2: "Tell me some features" -> possibly get 0-1 maybe-relevant sources + previous answer response -> bad response due to bad context mgmt
+ * ---next implementation---
+ * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
+ * prompt 2: "Tell me some features" -> possibly get 0-1 maybe-relevant sources + previous answer response -> backfill with 3 good sources from previous -> much better response
+ *
+ * @param {Object} config - params to call
+ * @param {number} config.nDocs - fill size of the window
+ * @param {object[]} config.searchResults - vector similarityResponse results for .sources
+ * @param {object[]} config.history - rawHistory of chat containing sources
+ * @param {string[]} config.filterIdentifiers - Pinned document identifiers to prevent duplicate context
+ * @returns {{
+ *  contextTexts: string[],
+ *  sources: object[],
+ * }} - Sources and their texts that should fill the window
+ */
+function fillSourceWindow({
+  nDocs = 4, // Number of documents
+  searchResults = [], // Sources from similarity search
+  history = [], // Raw history
+  filterIdentifiers = [], // Pinned document sources
+} = {}) {
+  const sources = [...searchResults];
+
+  if (sources.length >= nDocs || history.length === 0) {
+    return {
+      sources,
+      contextTexts: sources.map((src) => src.text),
+    };
+  }
+
+  const log = (text, ...args) => {
+    console.log(`\x1b[36m[fillSourceWindow]\x1b[0m ${text}`, ...args);
+  };
+
+  log(
+    `Need to backfill ${nDocs - searchResults.length} chunks to fill in the source window for RAG!`
+  );
+  const seenChunks = new Set(searchResults.map((source) => source.id));
+
+  // We reverse because we need to iterate from the bottom of the array (most recent chats).
+  // Looking at this function by itself you may think this loop could be extreme for long-history chats,
+  // but that was already handled where `history` was derived. It comes from `recentChatHistory`, which
+  // enforces a limit on history (default: 20). So this loop is not as extreme as it looks at first glance.
+  for (const chat of history.reverse()) {
+    if (sources.length >= nDocs) {
+      log(
+        `Citations backfilled to ${nDocs} references from ${searchResults.length} original citations.`
+      );
+      break;
+    }
+
+    const chatSources =
+      safeJsonParse(chat.response, { sources: [] })?.sources || [];
+    if (!chatSources?.length || !Array.isArray(chatSources)) continue;
+
+    const validSources = chatSources.filter((source) => {
+      return (
+        filterIdentifiers.includes(sourceIdentifier(source)) == false && // source cannot be in current pins
+        source.hasOwnProperty("score") && // source must have a score, so it did not come from a pinned document
+        source.hasOwnProperty("text") && // source has a valid text property we can use
+        seenChunks.has(source.id) == false // is unique
+      );
+    });
+
+    for (const validSource of validSources) {
+      if (sources.length >= nDocs) break;
+      sources.push(validSource);
+      seenChunks.add(validSource.id);
+    }
+  }
+
+  return {
+    sources,
+    contextTexts: sources.map((src) => src.text),
+  };
+}
+
 module.exports = {
   messageArrayCompressor,
   messageStringCompressor,
+  fillSourceWindow,
 };
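A rough usage sketch of `fillSourceWindow` with hypothetical data, showing the window being topped up from history when the fresh search comes back thin:

```js
// Hypothetical call - data shapes and the require path are illustrative, not from this patch.
const { fillSourceWindow } = require("./server/utils/helpers/chat");

const { sources, contextTexts } = fillSourceWindow({
  nDocs: 4,
  // only one fresh hit came back from the vector search
  searchResults: [{ id: "a1", text: "fresh chunk", score: 0.9 }],
  // most recent chats are walked first; their stored sources are reused
  history: [
    {
      response: JSON.stringify({
        sources: [
          { id: "b1", text: "chunk cited last turn", score: 0.8 },
          { id: "b2", text: "another prior chunk", score: 0.7 },
        ],
      }),
    },
  ],
  filterIdentifiers: [], // nothing pinned in this example
});

// sources.length === 3: one fresh hit plus two backfilled chunks (capped at nDocs),
// and contextTexts mirrors sources via each source's `text`.
```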
diff --git a/server/utils/helpers/chat/responses.js b/server/utils/helpers/chat/responses.js
index d49c8a85a995872f3a7fdb59393f7c7e36624889..d07eae308e9575a96449b79635d0b96269dce446 100644
--- a/server/utils/helpers/chat/responses.js
+++ b/server/utils/helpers/chat/responses.js
@@ -38,8 +38,13 @@ function handleDefaultStreamResponseV2(response, stream, responseProps) {
       });
     }
 
-    // LocalAi returns '' and others return null.
-    if (message.finish_reason !== "" && message.finish_reason !== null) {
+    // LocalAi returns '' and others return null on chunks - the last chunk is not "" or null.
+    // Either way, the key `finish_reason` must be present to determine the ending chunk.
+    if (
+      message?.hasOwnProperty("finish_reason") && // Got a valid message and it is an object with finish_reason
+      message.finish_reason !== "" &&
+      message.finish_reason !== null
+    ) {
       writeResponseChunk(response, {
         uuid,
         sources,
@@ -50,6 +55,7 @@ function handleDefaultStreamResponseV2(response, stream, responseProps) {
       });
       response.removeListener("close", handleAbort);
       resolve(fullText);
+      break; // Break streaming when a valid finish_reason is first encountered
     }
   }
 });
 }
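For reference, a small sketch of the chunk shapes this predicate is guarding against; the provider behaviors are assumptions based on the comment above (LocalAI sends `""` mid-stream, most others send `null`, and only the final chunk carries a real `finish_reason`):

```js
// Illustrative chunk payloads - not taken from any specific provider's documentation.
const midStreamNull = { choices: [{ delta: { content: "Hel" }, finish_reason: null }] };
const midStreamEmpty = { choices: [{ delta: { content: "lo" }, finish_reason: "" }] };
const finalChunk = { choices: [{ delta: {}, finish_reason: "stop" }] };

const isFinal = (message) =>
  message?.hasOwnProperty("finish_reason") &&
  message.finish_reason !== "" &&
  message.finish_reason !== null;

console.log(isFinal(midStreamNull.choices[0])); // false
console.log(isFinal(midStreamEmpty.choices[0])); // false
console.log(isFinal(finalChunk.choices[0])); // true -> write the closing chunk and break
```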
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index c8811c9de99d90896bd448d597db80ca1785a64f..c95ccd57d01fc9968bb2deaf391ad777178ddc3d 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -526,7 +526,11 @@ function supportedTranscriptionProvider(input = "") {
 }
 
 function validGeminiModel(input = "") {
-  const validModels = ["gemini-pro", "gemini-1.5-pro-latest"];
+  const validModels = [
+    "gemini-pro",
+    "gemini-1.5-pro-latest",
+    "gemini-1.5-flash-latest",
+  ];
   return validModels.includes(input)
     ? null
     : `Invalid Model type. Must be one of ${validModels.join(", ")}.`;
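As the diff shows, `validGeminiModel` returns `null` when the value is accepted and an error string otherwise. A brief sketch of that return contract:

```js
// Illustrative calls against the function added above (a behavior sketch, not a runnable import).
validGeminiModel("gemini-1.5-flash-latest"); // -> null, value is accepted
validGeminiModel("gemini-ultra");
// -> "Invalid Model type. Must be one of gemini-pro, gemini-1.5-pro-latest, gemini-1.5-flash-latest."
```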