diff --git a/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx b/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx
index d2846704d74048702d96be4108d833ac5b81e851..87e058827555ba437089fed0a198bc9db54bcff2 100644
--- a/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx
+++ b/frontend/src/components/LLMSelection/GeminiLLMOptions/index.jsx
@@ -30,7 +30,11 @@ export default function GeminiLLMOptions({ settings }) {
                 required={true}
                 className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
               >
-                {["gemini-pro", "gemini-1.5-pro-latest"].map((model) => {
+                {[
+                  "gemini-pro",
+                  "gemini-1.5-pro-latest",
+                  "gemini-1.5-flash-latest",
+                ].map((model) => {
                   return (
                     <option key={model} value={model}>
                       {model}
diff --git a/frontend/src/hooks/useGetProvidersModels.js b/frontend/src/hooks/useGetProvidersModels.js
index 29075c557931a138fabd378fc9a0b011ebc5a974..6687f0a7bb445f27a408cc1d9b17b1ce6cb02072 100644
--- a/frontend/src/hooks/useGetProvidersModels.js
+++ b/frontend/src/hooks/useGetProvidersModels.js
@@ -10,7 +10,7 @@ export const DISABLED_PROVIDERS = [
 ];
 const PROVIDER_DEFAULT_MODELS = {
   openai: [],
-  gemini: ["gemini-pro", "gemini-1.5-pro-latest"],
+  gemini: ["gemini-pro", "gemini-1.5-pro-latest", "gemini-1.5-flash-latest"],
   anthropic: [
     "claude-instant-1.2",
     "claude-2.0",
diff --git a/server/utils/AiProviders/gemini/index.js b/server/utils/AiProviders/gemini/index.js
index 0c2cc7697aca94d034a9a156e2cb4627e4580a9c..30c9ffa35739e9a979bd1169270705148638b53d 100644
--- a/server/utils/AiProviders/gemini/index.js
+++ b/server/utils/AiProviders/gemini/index.js
@@ -17,8 +17,12 @@ class GeminiLLM {
     this.gemini = genAI.getGenerativeModel(
       { model: this.model },
       {
-        // Gemini-1.5-pro is only available on the v1beta API.
-        apiVersion: this.model === "gemini-1.5-pro-latest" ? "v1beta" : "v1",
+        // Gemini-1.5-pro and Gemini-1.5-flash are only available on the v1beta API.
+        apiVersion:
+          this.model === "gemini-1.5-pro-latest" ||
+          this.model === "gemini-1.5-flash-latest"
+            ? "v1beta"
+            : "v1",
       }
     );
     this.limits = {
@@ -95,7 +99,11 @@ class GeminiLLM {
   }
 
   isValidChatCompletionModel(modelName = "") {
-    const validModels = ["gemini-pro", "gemini-1.5-pro-latest"];
+    const validModels = [
+      "gemini-pro",
+      "gemini-1.5-pro-latest",
+      "gemini-1.5-flash-latest",
+    ];
     return validModels.includes(modelName);
   }
 
diff --git a/server/utils/AiProviders/liteLLM/index.js b/server/utils/AiProviders/liteLLM/index.js
index 2c7fa823c27f45d2d407a73118c2eaafa647efe5..28d0b71dc4397846617010d38db40c97dfdbc824 100644
--- a/server/utils/AiProviders/liteLLM/index.js
+++ b/server/utils/AiProviders/liteLLM/index.js
@@ -1,7 +1,6 @@
 const { NativeEmbedder } = require("../../EmbeddingEngines/native");
 const {
-  writeResponseChunk,
-  clientAbortedHandler,
+  handleDefaultStreamResponseV2,
 } = require("../../helpers/chat/responses");
 
 class LiteLLM {
@@ -113,45 +112,7 @@ class LiteLLM {
   }
 
   handleStream(response, stream, responseProps) {
-    const { uuid = uuidv4(), sources = [] } = responseProps;
-
-    return new Promise(async (resolve) => {
-      let fullText = "";
-
-      const handleAbort = () => clientAbortedHandler(resolve, fullText);
-      response.on("close", handleAbort);
-
-      for await (const chunk of stream) {
-        const message = chunk?.choices?.[0];
-        const token = message?.delta?.content;
-
-        if (token) {
-          fullText += token;
-          writeResponseChunk(response, {
-            uuid,
-            sources: [],
-            type: "textResponseChunk",
-            textResponse: token,
-            close: false,
-            error: false,
-          });
-        }
-
-        // LiteLLM does not give a finish reason in stream until the final chunk
-        if (message.finish_reason || message.finish_reason === "stop") {
-          writeResponseChunk(response, {
-            uuid,
-            sources,
-            type: "textResponseChunk",
-            textResponse: "",
-            close: true,
-            error: false,
-          });
-          response.removeListener("close", handleAbort);
-          resolve(fullText);
-        }
-      }
-    });
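+    // Defer to the shared default stream handler; the chunks LiteLLM emits follow the same
+    // OpenAI-style shape (choices[0].delta / finish_reason) that handler already expects.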
+    return handleDefaultStreamResponseV2(response, stream, responseProps);
   }
 
   // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
diff --git a/server/utils/chats/index.js b/server/utils/chats/index.js
index 55e8fbe5fd6da14e0e9cb67bf654ba7de8bd8bb4..b6258c2e336cc6dd90738488dcf89e6285aca2f9 100644
--- a/server/utils/chats/index.js
+++ b/server/utils/chats/index.js
@@ -151,16 +151,27 @@ async function chatWithWorkspace(
     };
   }
 
-  contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
+  const { fillSourceWindow } = require("../helpers/chat");
+  const filledSources = fillSourceWindow({
+    nDocs: workspace?.topN || 4,
+    searchResults: vectorSearchResults.sources,
+    history: rawHistory,
+    filterIdentifiers: pinnedDocIdentifiers,
+  });
+
+  // Why does contextTexts get all the info, but sources only get the current search results?
+  // This lets the LLM "comprehend" a contextual response without populating the Citations
+  // under a response with documents the user "thinks" are irrelevant, which would otherwise happen
+  // because we backfill context to keep chats with the LLM more accurate.
+  // If a past citation was used to answer the question, that is already visible in the history, so it
+  // does not appear to the user that a new response used information that is irrelevant to the given prompt.
+  // TLDR; reduces GitHub issues for "LLM citing a document that has no answer in it" while keeping answers highly accurate.
+  contextTexts = [...contextTexts, ...filledSources.contextTexts];
   sources = [...sources, ...vectorSearchResults.sources];
 
-  // If in query mode and no sources are found from the vector search and no pinned documents, do not
+  // If in query mode and no context chunks are found from search, backfill, or pins, do not
   // let the LLM try to hallucinate a response or use general knowledge and exit early
-  if (
-    chatMode === "query" &&
-    vectorSearchResults.sources.length === 0 &&
-    pinnedDocIdentifiers.length === 0
-  ) {
+  if (chatMode === "query" && contextTexts.length === 0) {
     return {
       id: uuid,
       type: "textResponse",
@@ -224,9 +235,7 @@ async function recentChatHistory({
   workspace,
   thread = null,
   messageLimit = 20,
-  chatMode = null,
 }) {
-  if (chatMode === "query") return { rawHistory: [], chatHistory: [] };
   const rawHistory = (
     await WorkspaceChats.where(
       {
diff --git a/server/utils/chats/stream.js b/server/utils/chats/stream.js
index ec8fdbfac14cdf95f32632fc4a794e429b6932a9..ced9a97109430a7b28a4215b9f17357dd14b7d1b 100644
--- a/server/utils/chats/stream.js
+++ b/server/utils/chats/stream.js
@@ -100,7 +100,6 @@ async function streamChatWithWorkspace(
     workspace,
     thread,
     messageLimit,
-    chatMode,
   });
 
   // Look for pinned documents and see if the user decided to use this feature. We will also do a vector search
@@ -157,16 +156,27 @@ async function streamChatWithWorkspace(
     return;
   }
 
-  contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
+  const { fillSourceWindow } = require("../helpers/chat");
+  const filledSources = fillSourceWindow({
+    nDocs: workspace?.topN || 4,
+    searchResults: vectorSearchResults.sources,
+    history: rawHistory,
+    filterIdentifiers: pinnedDocIdentifiers,
+  });
+
+  // Why does contextTexts get all the info, but sources only get the current search results?
+  // This lets the LLM "comprehend" a contextual response without populating the Citations
+  // under a response with documents the user "thinks" are irrelevant, which would otherwise happen
+  // because we backfill context to keep chats with the LLM more accurate.
+  // If a past citation was used to answer the question, that is already visible in the history, so it
+  // does not appear to the user that a new response used information that is irrelevant to the given prompt.
+  // TLDR; reduces GitHub issues for "LLM citing a document that has no answer in it" while keeping answers highly accurate.
+  contextTexts = [...contextTexts, ...filledSources.contextTexts];
   sources = [...sources, ...vectorSearchResults.sources];
 
-  // If in query mode and no sources are found from the vector search and no pinned documents, do not
+  // If in query mode and no context chunks are found from search, backfill, or pins, do not
   // let the LLM try to hallucinate a response or use general knowledge and exit early
-  if (
-    chatMode === "query" &&
-    sources.length === 0 &&
-    pinnedDocIdentifiers.length === 0
-  ) {
+  if (chatMode === "query" && contextTexts.length === 0) {
     writeResponseChunk(response, {
       id: uuid,
       type: "textResponse",
diff --git a/server/utils/helpers/chat/index.js b/server/utils/helpers/chat/index.js
index 84afd516cde5819c069722e1aa4380757637d52f..6f565efe14e5a67b55413c6e02a85df3ece2231f 100644
--- a/server/utils/helpers/chat/index.js
+++ b/server/utils/helpers/chat/index.js
@@ -1,3 +1,5 @@
+const { sourceIdentifier } = require("../../chats");
+const { safeJsonParse } = require("../../http");
 const { TokenManager } = require("../tiktoken");
 const { convertToPromptHistory } = require("./responses");
 
@@ -343,7 +345,104 @@ function cannonball({
   return truncatedText;
 }
 
+/**
+ * Fill the sources window in the following priority order:
+ * 1. Pinned documents (handled before this function is called)
+ * 2. VectorSearch results
+ * 3. prevSources in chat history - starting from the most recent.
+ *
+ * Ensuring the window always has the desired number of sources means follow-up questions
+ * in any chat mode have relevant sources, without letting the source list grow unbounded. This function is used
+ * during chatting and allows follow-up questions within a query chat that would otherwise have zero sources and fail.
+ * The added benefit is that during regular RAG chat we get more coherent citations for prompts that would otherwise
+ * yield no results, without needing a re-ranker to run and take much longer to return a response.
+ *
+ * The side effect is that unrelated follow-up questions may now show citations that look irrelevant; however,
+ * we would rather optimize for the correctness of a response than hide extraneous sources under it. Since fresh search
+ * results always take priority, an unrelated question that produces RAG results will still work as desired, and because
+ * backfill comes from previous history, "changing context" mid-chat is handled appropriately.
+ * example:
+ * ---previous implementation---
+ * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
+ * prompt 2: "Tell me some features" -> possibly get 0-1 maybe-relevant sources + previous answer response -> bad response due to poor context management
+ * ---next implementation---
+ * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
+ * prompt 2: "Tell me some features" -> possibly get 0-1 maybe-relevant sources + previous answer response -> backfill with 3 good sources from previous -> much better response
+ *
+ * @param {Object} config - function parameters
+ * @param {number} config.nDocs - fill size of the window
+ * @param {object[]} config.searchResults - sources from the vector similarity search response
+ * @param {object[]} config.history - rawHistory of chat containing sources
+ * @param {string[]} config.filterIdentifiers - Pinned document identifiers to prevent duplicate context
+ * @returns {{
+ *   contextTexts: string[],
+ *   sources: object[],
+ * }} - The sources that should fill the window and their context texts
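+ *
+ * @example
+ * // Mirrors how server/utils/chats/{index,stream}.js call this helper; `workspace`,
+ * // `vectorSearchResults`, `rawHistory`, and `pinnedDocIdentifiers` are whatever the caller has in scope.
+ * const { contextTexts, sources } = fillSourceWindow({
+ *   nDocs: workspace?.topN || 4,
+ *   searchResults: vectorSearchResults.sources,
+ *   history: rawHistory,
+ *   filterIdentifiers: pinnedDocIdentifiers,
+ * });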
+ */
+function fillSourceWindow({
+  nDocs = 4, // Number of documents
+  searchResults = [], // Sources from similarity search
+  history = [], // Raw history
+  filterIdentifiers = [], // Pinned document identifiers to exclude
+} = {}) {
+  const sources = [...searchResults];
+
+  if (sources.length >= nDocs || history.length === 0) {
+    return {
+      sources,
+      contextTexts: sources.map((src) => src.text),
+    };
+  }
+
+  const log = (text, ...args) => {
+    console.log(`\x1b[36m[fillSourceWindow]\x1b[0m ${text}`, ...args);
+  };
+
+  log(
+    `Need to backfill ${nDocs - searchResults.length} chunks to fill in the source window for RAG!`
+  );
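+  // Track chunk ids already present in the window so backfilled history sources are never duplicated.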
+  const seenChunks = new Set(searchResults.map((source) => source.id));
+
+  // We iterate the history in reverse so the most recent chats are considered first.
+  // Looking at this function by itself you may think this loop could be expensive for long chat histories,
+  // but that is already handled where `history` is derived. It comes from `recentChatHistory`, which
+  // enforces a message limit (default: 20), so this loop is not as expensive as it looks at first glance.
+  for (const chat of [...history].reverse()) { // copy so we do not mutate the caller's history
+    if (sources.length >= nDocs) {
+      log(
+        `Citations backfilled to ${nDocs} references from ${searchResults.length} original citations.`
+      );
+      break;
+    }
+
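+    // Each stored chat response is a JSON string whose payload may include the sources used for that reply.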
+    const chatSources =
+      safeJsonParse(chat.response, { sources: [] })?.sources || [];
+    if (!chatSources?.length || !Array.isArray(chatSources)) continue;
+
+    const validSources = chatSources.filter((source) => {
+      return (
+        !filterIdentifiers.includes(sourceIdentifier(source)) && // source cannot be in current pins
+        source.hasOwnProperty("score") && // pinned sources have no score, so this excludes chunks from previously pinned documents
+        source.hasOwnProperty("text") && // source has a valid text property we can use
+        !seenChunks.has(source.id) // source is not already in the window
+      );
+    });
+
+    for (const validSource of validSources) {
+      if (sources.length >= nDocs) break;
+      sources.push(validSource);
+      seenChunks.add(validSource.id);
+    }
+  }
+
+  return {
+    sources,
+    contextTexts: sources.map((src) => src.text),
+  };
+}
+
 module.exports = {
   messageArrayCompressor,
   messageStringCompressor,
+  fillSourceWindow,
 };
diff --git a/server/utils/helpers/chat/responses.js b/server/utils/helpers/chat/responses.js
index d49c8a85a995872f3a7fdb59393f7c7e36624889..d07eae308e9575a96449b79635d0b96269dce446 100644
--- a/server/utils/helpers/chat/responses.js
+++ b/server/utils/helpers/chat/responses.js
@@ -38,8 +38,13 @@ function handleDefaultStreamResponseV2(response, stream, responseProps) {
         });
       }
 
-      // LocalAi returns '' and others return null.
-      if (message.finish_reason !== "" && message.finish_reason !== null) {
+      // LocalAI returns '' and others return null for finish_reason on intermediate chunks; the final chunk is neither '' nor null.
+      // Either way, the `finish_reason` key must be present to identify the ending chunk.
+      if (
+        message?.hasOwnProperty("finish_reason") && // Got valid message and it is an object with finish_reason
+        message.finish_reason !== "" &&
+        message.finish_reason !== null
+      ) {
         writeResponseChunk(response, {
           uuid,
           sources,
@@ -50,6 +55,7 @@ function handleDefaultStreamResponseV2(response, stream, responseProps) {
         });
         response.removeListener("close", handleAbort);
         resolve(fullText);
+        break; // Break streaming when a valid finish_reason is first encountered
       }
     }
   });
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index c8811c9de99d90896bd448d597db80ca1785a64f..c95ccd57d01fc9968bb2deaf391ad777178ddc3d 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -526,7 +526,11 @@ function supportedTranscriptionProvider(input = "") {
 }
 
 function validGeminiModel(input = "") {
-  const validModels = ["gemini-pro", "gemini-1.5-pro-latest"];
+  const validModels = [
+    "gemini-pro",
+    "gemini-1.5-pro-latest",
+    "gemini-1.5-flash-latest",
+  ];
   return validModels.includes(input)
     ? null
     : `Invalid Model type. Must be one of ${validModels.join(", ")}.`;