From b6be43be95049209dd362ea3adc94f7cf7935128 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Tue, 14 May 2024 11:57:21 -0700
Subject: [PATCH] Add Speech-to-text and Text-to-speech providers (#1394)

* Add Speech-to-text and Text-to-speech providers

* add files and update comment

* update comments

* patch: bad playerRef check
---
 .vscode/settings.json                         |   1 +
 docker/.env.example                           |  13 ++
 frontend/package.json                         |   1 +
 frontend/src/App.jsx                          |   7 +
 .../src/components/SettingsSidebar/index.jsx  |   9 +
 .../SpeechToText/BrowserNative/index.jsx      |   9 +
 .../TextToSpeech/BrowserNative/index.jsx      |   9 +
 .../TextToSpeech/ElevenLabsOptions/index.jsx  | 107 +++++++++
 .../TextToSpeech/OpenAiOptions/index.jsx      |  45 ++++
 .../Actions/TTSButton/asyncTts.jsx            |  94 ++++++++
 .../Actions/TTSButton/index.jsx               |  23 ++
 .../Actions/TTSButton/native.jsx              |  61 +++++
 .../HistoricalMessage/Actions/index.jsx       |  65 +-----
 .../PromptInput/SpeechToText/index.jsx        |  82 +++++++
 .../ChatContainer/PromptInput/index.jsx       |   5 +
 .../src/media/ttsproviders/elevenlabs.png     | Bin 0 -> 6422 bytes
 frontend/src/models/system.js                 |   2 +-
 frontend/src/models/workspace.js              |  17 +-
 .../GeneralSettings/AudioPreference/index.jsx |  45 ++++
 .../GeneralSettings/AudioPreference/stt.jsx   | 191 ++++++++++++++++
 .../GeneralSettings/AudioPreference/tts.jsx   | 209 ++++++++++++++++++
 frontend/src/utils/paths.js                   |   3 +
 frontend/yarn.lock                            |   5 +
 server/.env.example                           |  13 ++
 server/endpoints/workspaces.js                |  50 ++++-
 server/models/systemSettings.js               |  11 +
 server/package.json                           |   1 +
 server/utils/TextToSpeech/elevenLabs/index.js |  54 +++++
 server/utils/TextToSpeech/index.js            |  15 ++
 server/utils/TextToSpeech/openAi/index.js     |  29 +++
 server/utils/helpers/customModels.js          |  30 +++
 server/utils/helpers/updateENV.js             |  31 +++
 server/yarn.lock                              |  65 +++++-
 33 files changed, 1234 insertions(+), 68 deletions(-)
 create mode 100644 frontend/src/components/SpeechToText/BrowserNative/index.jsx
 create mode 100644 frontend/src/components/TextToSpeech/BrowserNative/index.jsx
 create mode 100644 frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx
 create mode 100644 frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
 create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx
 create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
 create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx
 create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
 create mode 100644 frontend/src/media/ttsproviders/elevenlabs.png
 create mode 100644 frontend/src/pages/GeneralSettings/AudioPreference/index.jsx
 create mode 100644 frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx
 create mode 100644 frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
 create mode 100644 server/utils/TextToSpeech/elevenLabs/index.js
 create mode 100644 server/utils/TextToSpeech/index.js
 create mode 100644 server/utils/TextToSpeech/openAi/index.js

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 110c4fa6e..4930aa2d1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,6 +11,7 @@
     "cooldowns",
     "Deduplicator",
     "Dockerized",
+    "elevenlabs",
     "Embeddable",
     "epub",
     "GROQ",
diff --git a/docker/.env.example b/docker/.env.example
index 8cfa2aea8..70059ea51 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -171,6 +171,19 @@ GID='1000'
 # WHISPER_PROVIDER="openai"
 # OPEN_AI_KEY=sk-xxxxxxxx
 
+###########################################
+######## TTS/STT Model Selection ##########
+###########################################
+# TTS_PROVIDER="native"
+
+# TTS_PROVIDER="openai"
+# TTS_OPEN_AI_KEY=sk-example
+# TTS_OPEN_AI_VOICE_MODEL=nova
+
+# TTS_PROVIDER="elevenlabs"
+# TTS_ELEVEN_LABS_KEY=
+# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
+
 # CLOUD DEPLOYMENT VARIRABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # DISABLE_TELEMETRY="false"
diff --git a/frontend/package.json b/frontend/package.json
index ded06aa9c..11e612fcd 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -28,6 +28,7 @@
     "react-dropzone": "^14.2.3",
     "react-loading-skeleton": "^3.1.0",
     "react-router-dom": "^6.3.0",
+    "react-speech-recognition": "^3.10.0",
     "react-tag-input-component": "^2.0.2",
     "react-toastify": "^9.1.3",
     "react-tooltip": "^5.25.2",
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
index 0a5ed65fc..b29e6eea9 100644
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@@ -32,6 +32,9 @@ const GeneralLLMPreference = lazy(
 const GeneralTranscriptionPreference = lazy(
   () => import("@/pages/GeneralSettings/TranscriptionPreference")
 );
+const GeneralAudioPreference = lazy(
+  () => import("@/pages/GeneralSettings/AudioPreference")
+);
 const GeneralEmbeddingPreference = lazy(
   () => import("@/pages/GeneralSettings/EmbeddingPreference")
 );
@@ -85,6 +88,10 @@ export default function App() {
                   <AdminRoute Component={GeneralTranscriptionPreference} />
                 }
               />
+              <Route
+                path="/settings/audio-preference"
+                element={<AdminRoute Component={GeneralAudioPreference} />}
+              />
               <Route
                 path="/settings/embedding-preference"
                 element={<AdminRoute Component={GeneralEmbeddingPreference} />}
diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx
index 67797d266..6b8f79e5e 100644
--- a/frontend/src/components/SettingsSidebar/index.jsx
+++ b/frontend/src/components/SettingsSidebar/index.jsx
@@ -21,6 +21,7 @@ import {
   ClosedCaptioning,
   EyeSlash,
   SplitVertical,
+  Microphone,
 } from "@phosphor-icons/react";
 import useUser from "@/hooks/useUser";
 import { USER_BACKGROUND_COLOR } from "@/utils/constants";
@@ -280,6 +281,14 @@ const SidebarOptions = ({ user = null }) => (
       flex={true}
       allowedRole={["admin"]}
     />
+    <Option
+      href={paths.settings.audioPreference()}
+      btnText="Voice and Speech Support"
+      icon={<Microphone className="h-5 w-5 flex-shrink-0" />}
+      user={user}
+      flex={true}
+      allowedRole={["admin"]}
+    />
     <Option
       href={paths.settings.transcriptionPreference()}
       btnText="Transcription Model"
diff --git a/frontend/src/components/SpeechToText/BrowserNative/index.jsx b/frontend/src/components/SpeechToText/BrowserNative/index.jsx
new file mode 100644
index 000000000..1e9bcb3c2
--- /dev/null
+++ b/frontend/src/components/SpeechToText/BrowserNative/index.jsx
@@ -0,0 +1,9 @@
+export default function BrowserNative() {
+  return (
+    <div className="w-full h-10 items-center flex">
+      <p className="text-sm font-base text-white text-opacity-60">
+        There is no configuration needed for this provider.
+      </p>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TextToSpeech/BrowserNative/index.jsx b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx
new file mode 100644
index 000000000..1e9bcb3c2
--- /dev/null
+++ b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx
@@ -0,0 +1,9 @@
+export default function BrowserNative() {
+  return (
+    <div className="w-full h-10 items-center flex">
+      <p className="text-sm font-base text-white text-opacity-60">
+        There is no configuration needed for this provider.
+      </p>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx
new file mode 100644
index 000000000..ad86caa1c
--- /dev/null
+++ b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx
@@ -0,0 +1,128 @@
+import { useState, useEffect } from "react";
+import System from "@/models/system";
+
+/**
+ * Settings fields for the ElevenLabs text-to-speech provider: the API key
+ * input and, unless `settings.credentialsOnly` is set, the voice selector.
+ *
+ * @param {object} props
+ * @param {object} props.settings - settings object; read here for
+ *   `TTSElevenLabsKey`, `TTSElevenLabsVoiceModel` and `credentialsOnly`.
+ */
+export default function ElevenLabsOptions({ settings }) {
+  const [inputValue, setInputValue] = useState(settings?.TTSElevenLabsKey);
+  // Committed on blur only, so voices are not re-fetched on every keystroke.
+  const [apiKey, setApiKey] = useState(settings?.TTSElevenLabsKey);
+
+  return (
+    <div className="flex gap-x-4">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          API Key
+        </label>
+        <input
+          type="password"
+          name="TTSElevenLabsKey"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="ElevenLabs API Key"
+          defaultValue={settings?.TTSElevenLabsKey ? "*".repeat(20) : ""}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+          onChange={(e) => setInputValue(e.target.value)}
+          onBlur={() => setApiKey(inputValue)}
+        />
+      </div>
+      {!settings?.credentialsOnly && (
+        <ElevenLabsModelSelection settings={settings} apiKey={apiKey} />
+      )}
+    </div>
+  );
+}
+
+/**
+ * Voice dropdown for ElevenLabs. Re-queries
+ * `System.customModels("elevenlabs-tts", apiKey)` whenever `apiKey` changes,
+ * groups the result by `model.organization` and preselects the saved
+ * `settings.TTSElevenLabsVoiceModel`.
+ *
+ * @param {object} props
+ * @param {string|boolean|null} [props.apiKey] - key to query voices with
+ * @param {object} props.settings - read for `TTSElevenLabsVoiceModel`
+ */
+function ElevenLabsModelSelection({ apiKey, settings }) {
+  const [groupedModels, setGroupedModels] = useState({});
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    async function findCustomModels() {
+      setLoading(true);
+      const { models } = await System.customModels(
+        "elevenlabs-tts",
+        // A boolean is not a usable key (presumably a "key already saved"
+        // flag coming from settings - confirm); send no key in that case.
+        typeof apiKey === "boolean" ? null : apiKey
+      );
+
+      if (models?.length > 0) {
+        const modelsByOrganization = models.reduce((acc, model) => {
+          acc[model.organization] = acc[model.organization] || [];
+          acc[model.organization].push(model);
+          return acc;
+        }, {});
+        setGroupedModels(modelsByOrganization);
+      }
+
+      setLoading(false);
+    }
+    findCustomModels();
+  }, [apiKey]);
+
+  if (loading) {
+    return (
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Voice Model Selection
+        </label>
+        <select
+          name="TTSElevenLabsVoiceModel"
+          disabled={true}
+          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+        >
+          <option disabled={true} selected={true}>
+            -- loading available models --
+          </option>
+        </select>
+      </div>
+    );
+  }
+
+  return (
+    <div className="flex flex-col w-60">
+      <label className="text-white text-sm font-semibold block mb-4">
+        Voice Model Selection
+      </label>
+      <select
+        name="TTSElevenLabsVoiceModel"
+        required={true}
+        className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+      >
+        {Object.keys(groupedModels)
+          .sort()
+          .map((organization) => (
+            <optgroup key={organization} label={organization}>
+              {groupedModels[organization].map((model) => (
+                <option
+                  key={model.id}
+                  value={model.id}
+                  selected={settings?.TTSElevenLabsVoiceModel === model.id}
+                >
+                  {model.name}
+                </option>
+              ))}
+            </optgroup>
+          ))}
+      </select>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
new file mode 100644
index 000000000..4183a4e58
--- /dev/null
+++ b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
@@ -0,0 +1,63 @@
+/**
+ * Upper-cases the first character of each word and lower-cases the rest,
+ * e.g. "alloy" -> "Alloy".
+ *
+ * @param {string} string - text to convert
+ * @returns {string} the converted text
+ */
+function toProperCase(string) {
+  return string.replace(/\w\S*/g, function (txt) {
+    // `slice(1)` is the non-deprecated equivalent of `substr(1)`.
+    return txt.charAt(0).toUpperCase() + txt.slice(1).toLowerCase();
+  });
+}
+
+/**
+ * Settings fields for the OpenAI text-to-speech provider: API key input and
+ * a selector for the built-in voices ("alloy" is the default selection).
+ *
+ * @param {object} props
+ * @param {object} props.settings - read for `TTSOpenAIKey` and
+ *   `TTSOpenAIVoiceModel`.
+ */
+export default function OpenAiTextToSpeechOptions({ settings }) {
+  const apiKey = settings?.TTSOpenAIKey;
+
+  return (
+    <div className="flex gap-x-4">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          API Key
+        </label>
+        <input
+          type="password"
+          name="TTSOpenAIKey"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="OpenAI API Key"
+          defaultValue={apiKey ? "*".repeat(20) : ""}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+        />
+      </div>
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Voice Model
+        </label>
+        <select
+          name="TTSOpenAIVoiceModel"
+          defaultValue={settings?.TTSOpenAIVoiceModel ?? "alloy"}
+          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+        >
+          {["alloy", "echo", "fable", "onyx", "nova", "shimmer"].map(
+            (voice) => (
+              <option key={voice} value={voice}>
+                {toProperCase(voice)}
+              </option>
+            )
+          )}
+        </select>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx
new file mode 100644
index 000000000..1947f0057
--- /dev/null
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx
@@ -0,0 +1,108 @@
+import { useEffect, useState, useRef } from "react";
+import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
+import { Tooltip } from "react-tooltip";
+import Workspace from "@/models/workspace";
+import showToast from "@/utils/toast";
+
+/**
+ * Speak/pause button for a chat message whose audio comes from
+ * `Workspace.ttsMessage(slug, chatId)`. The audio is requested on the first
+ * click and the same source is replayed on later clicks.
+ *
+ * @param {object} props
+ * @param {string} props.slug - workspace slug passed to `Workspace.ttsMessage`
+ * @param {string|number} props.chatId - chat id passed to
+ *   `Workspace.ttsMessage`; nothing is rendered when it is falsy
+ */
+export default function AsyncTTSMessage({ slug, chatId }) {
+  const playerRef = useRef(null);
+  const [speaking, setSpeaking] = useState(false);
+  const [loading, setLoading] = useState(false);
+  const [audioSrc, setAudioSrc] = useState(null);
+
+  // `audioSrc` is an object URL; release it once it is replaced or the
+  // component unmounts so the underlying blob can be freed.
+  useEffect(() => {
+    if (!audioSrc) return undefined;
+    return () => URL.revokeObjectURL(audioSrc);
+  }, [audioSrc]);
+
+  async function speakMessage() {
+    if (speaking) {
+      playerRef.current?.pause();
+      return;
+    }
+    // Ignore clicks while the audio request is still in flight.
+    if (loading) return;
+
+    if (audioSrc) {
+      try {
+        await playerRef.current?.play();
+      } catch (e) {
+        console.error(e);
+        setSpeaking(false);
+      }
+      return;
+    }
+
+    setLoading(true);
+    try {
+      const audioUrl = await Workspace.ttsMessage(slug, chatId);
+      if (!audioUrl)
+        throw new Error("Failed to load or play TTS message response.");
+      // The <audio> element has autoPlay set, so assigning the source is
+      // enough to start playback.
+      setAudioSrc(audioUrl);
+    } catch (e) {
+      showToast(e.message, "error", { clear: true });
+    } finally {
+      setLoading(false);
+    }
+  }
+
+  if (!chatId) return null;
+  return (
+    <div className="mt-3 relative">
+      <button
+        onClick={speakMessage}
+        data-tooltip-id="message-to-speech"
+        data-tooltip-content={
+          speaking ? "Pause TTS speech of message" : "TTS Speak message"
+        }
+        className="border-none text-zinc-300"
+        aria-label={speaking ? "Pause speech" : "Speak message"}
+      >
+        {speaking ? (
+          <PauseCircle size={18} className="mb-1" />
+        ) : (
+          <>
+            {loading ? (
+              <CircleNotch size={18} className="mb-1 animate-spin" />
+            ) : (
+              <SpeakerHigh size={18} className="mb-1" />
+            )}
+          </>
+        )}
+        <audio
+          ref={playerRef}
+          hidden={true}
+          src={audioSrc}
+          autoPlay={true}
+          controls={false}
+          onPlay={() => setSpeaking(true)}
+          onPause={(e) => {
+            // Rewind so the next click starts from the beginning.
+            e.currentTarget.currentTime = 0;
+            setSpeaking(false);
+          }}
+        />
+      </button>
+      <Tooltip
+        id="message-to-speech"
+        place="bottom"
+        delayShow={300}
+        className="tooltip !text-xs"
+      />
+    </div>
+  );
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
new file mode 100644
index 000000000..644a57afc
--- /dev/null
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx
@@ -0,0 +1,44 @@
+import { useEffect, useState } from "react";
+import NativeTTSMessage from "./native";
+import AsyncTTSMessage from "./asyncTts";
+import System from "@/models/system";
+
+/**
+ * Picks the text-to-speech button for a chat message based on the
+ * `TextToSpeechProvider` value returned by `System.keys()`: the browser-native
+ * button for "native" (also the fallback), the server-backed one otherwise.
+ * Renders nothing until the settings lookup has finished.
+ *
+ * @param {object} props
+ * @param {string} props.slug - forwarded to `AsyncTTSMessage`
+ * @param {string|number} props.chatId - forwarded to `AsyncTTSMessage`
+ * @param {string} props.message - forwarded to `NativeTTSMessage`
+ */
+export default function TTSMessage({ slug, chatId, message }) {
+  const [provider, setProvider] = useState("native");
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    let active = true;
+    async function getSettings() {
+      try {
+        const _settings = await System.keys();
+        if (active) setProvider(_settings?.TextToSpeechProvider ?? "native");
+      } catch (e) {
+        // Keep the "native" default so the button still appears.
+        console.error(e);
+      } finally {
+        if (active) setLoading(false);
+      }
+    }
+    getSettings();
+    return () => {
+      active = false;
+    };
+  }, []);
+
+  if (loading) return null;
+  if (provider !== "native")
+    return <AsyncTTSMessage slug={slug} chatId={chatId} />;
+  return <NativeTTSMessage message={message} />;
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx
new file mode 100644
index 000000000..5f3bd3f69
--- /dev/null
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx
@@ -0,0 +1,70 @@
+import React, { useEffect, useState } from "react";
+import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react";
+import { Tooltip } from "react-tooltip";
+
+/**
+ * Speak/stop button that reads a chat message aloud with the browser's
+ * `window.speechSynthesis`. Renders nothing when that API is unavailable.
+ *
+ * @param {object} props
+ * @param {string} props.message - text handed to `SpeechSynthesisUtterance`
+ */
+export default function NativeTTSMessage({ message }) {
+  const [speaking, setSpeaking] = useState(false);
+  const [supported, setSupported] = useState(false);
+  useEffect(() => {
+    setSupported("speechSynthesis" in window);
+  }, []);
+
+  function endSpeechUtterance() {
+    window.speechSynthesis?.cancel();
+    setSpeaking(false);
+    return;
+  }
+
+  function speakMessage() {
+    // if the user is pausing this particular message
+    // while the synth is speaking we can end it.
+    // If they are clicking another message's TTS
+    // we need to ignore that until they pause the one that is playing.
+    if (window.speechSynthesis.speaking && speaking) {
+      endSpeechUtterance();
+      return;
+    }
+
+    if (window.speechSynthesis.speaking && !speaking) return;
+    const utterance = new SpeechSynthesisUtterance(message);
+    utterance.addEventListener("end", endSpeechUtterance);
+    // An utterance that fails reports "error" instead of "end"; reset the icon.
+    utterance.addEventListener("error", () => setSpeaking(false));
+    window.speechSynthesis.speak(utterance);
+    setSpeaking(true);
+  }
+
+  if (!supported) return null;
+  return (
+    <div className="mt-3 relative">
+      <button
+        onClick={speakMessage}
+        data-tooltip-id="message-to-speech"
+        data-tooltip-content={
+          speaking ? "Pause TTS speech of message" : "TTS Speak message"
+        }
+        className="border-none text-zinc-300"
+        aria-label={speaking ? "Pause speech" : "Speak message"}
+      >
+        {speaking ? (
+          <PauseCircle size={18} className="mb-1" />
+        ) : (
+          <SpeakerHigh size={18} className="mb-1" />
+        )}
+      </button>
+      <Tooltip
+        id="message-to-speech"
+        place="bottom"
+        delayShow={300}
+        className="tooltip !text-xs"
+      />
+    </div>
+  );
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx
index 3bdee472d..52ae1466a 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx
@@ -1,4 +1,4 @@
-import React, { memo, useEffect, useState } from "react";
+import React, { memo, useState } from "react";
 import useCopyText from "@/hooks/useCopyText";
 import {
   Check,
@@ -6,11 +6,10 @@ import {
   ThumbsUp,
   ThumbsDown,
   ArrowsClockwise,
-  SpeakerHigh,
-  PauseCircle,
 } from "@phosphor-icons/react";
 import { Tooltip } from "react-tooltip";
 import Workspace from "@/models/workspace";
+import TTSMessage from "./TTSButton";
 
 const Actions = ({
   message,
@@ -60,7 +59,7 @@ const Actions = ({
           </>
         )}
       </div>
-      <TTSMessage message={message} />
+      <TTSMessage slug={slug} chatId={chatId} message={message} />
     </div>
   );
 };
@@ -149,62 +148,4 @@ function RegenerateMessage({ regenerateMessage, chatId }) {
   );
 }
 
-function TTSMessage({ message }) {
-  const [speaking, setSpeaking] = useState(false);
-  const [supported, setSupported] = useState(false);
-  useEffect(() => {
-    setSupported("speechSynthesis" in window);
-  }, []);
-
-  function endSpeechUtterance() {
-    window.speechSynthesis?.cancel();
-    setSpeaking(false);
-    return;
-  }
-
-  function speakMessage() {
-    // if the user is pausing this particular message
-    // while the synth if speaking we can end it.
-    // If they are clicking another message's TTS
-    // we need to ignore that until they pause the one that is playing.
-    if (window.speechSynthesis.speaking && speaking) {
-      endSpeechUtterance();
-      return;
-    }
-
-    if (window.speechSynthesis.speaking && !speaking) return;
-    const utterance = new SpeechSynthesisUtterance(message);
-    utterance.addEventListener("end", endSpeechUtterance);
-    window.speechSynthesis.speak(utterance);
-    setSpeaking(true);
-  }
-
-  if (!supported) return null;
-  return (
-    <div className="mt-3 relative">
-      <button
-        onClick={speakMessage}
-        data-tooltip-id="message-to-speech"
-        data-tooltip-content={
-          speaking ? "Pause TTS speech of message" : "TTS Speak message"
-        }
-        className="border-none text-zinc-300"
-        aria-label={speaking ? "Pause speech" : "Speak message"}
-      >
-        {speaking ? (
-          <PauseCircle size={18} className="mb-1" />
-        ) : (
-          <SpeakerHigh size={18} className="mb-1" />
-        )}
-      </button>
-      <Tooltip
-        id="message-to-speech"
-        place="bottom"
-        delayShow={300}
-        className="tooltip !text-xs"
-      />
-    </div>
-  );
-}
-
 export default memo(Actions);
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
new file mode 100644
index 000000000..6cbcfbf8d
--- /dev/null
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
@@ -0,0 +1,102 @@
+import { useEffect, useRef } from "react";
+import { Microphone } from "@phosphor-icons/react";
+import { Tooltip } from "react-tooltip";
+import _regeneratorRuntime from "regenerator-runtime";
+import SpeechRecognition, {
+  useSpeechRecognition,
+} from "react-speech-recognition";
+
+// Milliseconds without a transcript update before the session is closed.
+const SILENCE_INTERVAL = 3_200;
+
+/**
+ * Microphone button that dictates into the prompt through
+ * react-speech-recognition. Each transcript update is forwarded with
+ * `sendCommand(transcript, false)`; when the session ends a non-empty
+ * transcript is forwarded with `sendCommand(transcript, true)`.
+ * Renders nothing when the browser lacks speech recognition support.
+ *
+ * @param {object} props
+ * @param {Function} props.sendCommand - called as `(text, flag)`.
+ *   NOTE(review): the flag is presumably "submit now" - confirm in the caller.
+ */
+export default function SpeechToText({ sendCommand }) {
+  // Kept per instance; a module-level timer would be shared by every mount.
+  const silenceTimer = useRef(null);
+  const {
+    transcript,
+    listening,
+    resetTranscript,
+    browserSupportsSpeechRecognition,
+    browserSupportsContinuousListening,
+    isMicrophoneAvailable,
+  } = useSpeechRecognition({
+    clearTranscriptOnListen: true,
+  });
+
+  function startSTTSession() {
+    if (!isMicrophoneAvailable) {
+      alert(
+        "AnythingLLM does not have access to microphone. Please enable for this site to use this feature."
+      );
+      return;
+    }
+
+    resetTranscript();
+    SpeechRecognition.startListening({
+      continuous: browserSupportsContinuousListening,
+      language: window?.navigator?.language ?? "en-US",
+    });
+  }
+
+  function endSTTSession() {
+    SpeechRecognition.stopListening();
+    if (transcript.length > 0) {
+      sendCommand(transcript, true);
+    }
+
+    resetTranscript();
+    clearTimeout(silenceTimer.current);
+  }
+
+  useEffect(() => {
+    if (transcript?.length > 0) {
+      sendCommand(transcript, false);
+      clearTimeout(silenceTimer.current);
+      // Restart the silence countdown on every transcript update.
+      silenceTimer.current = setTimeout(() => {
+        endSTTSession();
+      }, SILENCE_INTERVAL);
+    }
+  }, [transcript]);
+
+  // Drop a pending countdown on unmount so it cannot fire afterwards.
+  useEffect(() => {
+    return () => clearTimeout(silenceTimer.current);
+  }, []);
+
+  if (!browserSupportsSpeechRecognition) return null;
+  return (
+    <div
+      id="stt-btn"
+      data-tooltip-id="tooltip-stt-btn"
+      data-tooltip-content="Speak your prompt"
+      aria-label="Speak your prompt"
+      onClick={listening ? endSTTSession : startSTTSession}
+      className={`relative flex justify-center items-center opacity-60 hover:opacity-100 cursor-pointer ${
+        listening ? "!opacity-100" : ""
+      }`}
+    >
+      <Microphone
+        weight="fill"
+        className="w-6 h-6 pointer-events-none text-white"
+      />
+      <Tooltip
+        id="tooltip-stt-btn"
+        place="top"
+        delayShow={300}
+        className="tooltip !text-xs z-99"
+      />
+    </div>
+  );
+}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
index 98ad11f8f..df08bcc7c 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
@@ -12,6 +12,7 @@ import AvailableAgentsButton, {
   useAvailableAgents,
 } from "./AgentMenu";
 import TextSizeButton from "./TextSizeMenu";
+import SpeechToText from "./SpeechToText";
 
 export const PROMPT_INPUT_EVENT = "set_prompt_input";
 export default function PromptInput({
@@ -34,6 +35,7 @@ export default function PromptInput({
   function handlePromptUpdate(e) {
     setPromptInput(e?.detail ?? "");
   }
+
   useEffect(() => {
     if (!!window)
       window.addEventListener(PROMPT_INPUT_EVENT, handlePromptUpdate);
@@ -156,6 +158,9 @@ export default function PromptInput({
                 />
                 <TextSizeButton />
               </div>
+              <div className="flex gap-x-2">
+                <SpeechToText sendCommand={sendCommand} />
+              </div>
             </div>
           </div>
         </div>
diff --git a/frontend/src/media/ttsproviders/elevenlabs.png b/frontend/src/media/ttsproviders/elevenlabs.png
new file mode 100644
index 0000000000000000000000000000000000000000..b1047e422e3c7a855caec5ef2bae1fcc647495e5
GIT binary patch
literal 6422
zcmeAS@N?(olHy`uVBq!ia0y~yVAKI&4rT@h2EWC1?hFhJjKx9jPK-BC>eMqZFmM)l
zL>4nJa0`PlBg3pY5)2GXe*=6%T>t<7|L@<w8#ivOTD5BR>eVY&thjsk?vEcojvYI;
za^=c9ckX=u{{8sz<7?NhUAJ!C`t|D{KYsl4=g;fcuO}oVxVyW1czC>c@#5F7U;X|4
zVq#(v5)$I#;`8Rsd-LW^N=nLt1q=TC`SbVh-{;StOG!ydN=p9z{rl9ZQ)+5zEG#Un
ztgP(p><=G4?CR=bVq#)rW1BT=*6Y`=BO)T0nVFwGdlnoVynOlc)2B}}GBPqSFm!Zu
z%$zwhC@5&@(xo$I%+S!#P*+#quwg?(L&KRfXHK3x`S$JGxpU{vnKS48`}enQ-CD9_
zNpo{^X=!O?W##VOyEks!*wfQfQBhG|Uf$l`e);m{$&)8%Wo0EMCg$bk-Mo48$&)7z
z4i3D$ygWQSrlzKCZEg4O-xn4Z77`MYk&(G_<%)=ih^VM&NJxl@iHWMJYC%E4vSrKo
z`1ttw`3nmR?d<G0I5=EfTn-*QxM$BEZf<TzN5{UtzNu5Ee*OB@(9rPRyLXEgEy~Qy
ztgEY2P*6}-R-Qe3_P1}}7A{=a+1YvZ>eY`QKTe!D@ztwWTwGk;-Q82BOffJpxOnm6
zg9i_yqN2RKysWLQKYaL*oSdAUoxOGI*7@`2b8>Qi`SK+!ENuJs?YD2=&d<*m5D<ur
zj1&|Uw6wI;)YO!hmyeB&EiNurR8%~4=+K%qYg}Dj?d|PlWo5T*+g4Ombno812@@uq
zJ9kb+Mdj0{Pc}9-TefUzY;4rk)qVN$<)urPT3cJ?<m3(?K0IyOG#?)yTU%Q#Ev@tC
z&-?oNHZ?UF85soz2BxN_nwy(Hdh|$NUtd~UIv^n6$dMx@B_;9k@%#7h*U`~wX=$mi
zuUAr1a&vRryLa!dUAv;Aqd$NCTvb(-larI1n;RM$x?{(V>C>k_efl&bBV*^zo&Ns*
z+S=NFetzEG-bqPG7cN|giHWhYvT}BIPD@KWaNxk=#fzPsoZ{l*%*@Qn%E~rv+EiU#
zJ!#UU-rnBY+S+~l_MJU@HatB1#EBC}j~=b5skwITnx3AXg@uKur>C*8ae8{XPS2Ne
z1_seho-U3d6?5Ls<&6<JdjCNB1lz}c_eAc0Xtib5xgW6E-N)J2Rr}(Z52o>_4wVbm
zX!<w@pH?{<Qu8wO${C-t<!6I}YgYTHdIx(<+vMZ2*EzrQ_Tk#HZ|@#7e4cOL>HOmR
z-<mypZ_k}M_otP$wefEz)f2JKr>&VqIj68Pq%t~0G6aY*j50O+Be&I_ema9y;;F{z
zeXsxb>6o7iT5#n|UC2_gZ4WZmDK%ti?sb0krA&13o9w$jQ%{6@&EyOZjEXFc=$v{e
zev#3_uWL7@@QQL=3{DQ^e}3}x&d6UTv9`0*e?9LFN;?!fB`nNXOLM-|tWzni7xv$D
zTzXdgvPk6Bmv3X9x6g1}mFBTpD(Oq$x4dmnmOr0rB)o3XmQA<rrg=)#_D7{=-q4@?
zJT&wg^E7Lf{7HY--AI)*`EMCnr>XIJ{!Z>lwUtl&xcQx@Jk{{t_bk3f{nON4^$T>=
zH~B>A?sci*Qj+NCKX`odyUBrJo{?$iw0#T2gi|diKb`w(%jM2Zb55P|C>FcCQfHd%
z&0SMu!zM0jUX-*t!%^mVKx^ZQQd8Zp2VU))wDi2fT}|f;|7#|`U=7=`!2W7r<F$>`
z-)pbp{?D_O#b~YMDW$*viggzTaMVY}n)n2VeNCO69}*uQ75janju%@@mV^JQrx(ov
zySC3duTYg6#~xOE=Y>0~=*qgQem`4wTwbrQRW0~MH+$KgZFw(U&i*!9d+<fZsh(Fa
zf_iVCzAv@4rQ_89!$!p`bMD%2T5+CxwQ^te4eQiPmm}v_{YtrT;GT8r(iJ;x&C_md
zw)xf29#r=3<P+@{Kif+e-m;g!W?Or4?e&9xuh)Ln&678FnRn;H{JM9!YdeGXZ{|Oz
z->WNr;?>3*yA5j69Mif_F8bJ6H^*7LdB#ybUNK3InWg{OQ`gGON$QW<?KscxCg;oV
zyVl4%u9p|!R-UO7SNHaQ`nHTabu&|+F1)*b?YB+hK_boH?mXYL=Mq=B(tS^{ua&AZ
zt2PQKO@FPJ87Y#y<3E?;hg++&Ma+%f{$95A<|58xbEQ<{`dz1F%o8k%{A(!asV%4~
zVL17q`DEiOvzBgq@%N=vUx$lt{|U=yABDXGZx~HioxCkt?WVK7LFe|jXWa}V^Y*lb
zIxIf(^478fp}-n};>p~REl1Wpnv#++U$n{e*`)8OvW6E|d~6BnzV&i#&gsJI&ngNQ
zwT3EQUh#8Q$xXHH3EQgo_H?=Hg|EG<cJ8->CfDn(jf&wCbF%N+9K7~TWTNZyW%I9X
zka}c2;gY23>o4<P9_7>98T(h0ak5;*i?jRkR9}5`y;tvkGv&qhmpYPC)0R}2L@rzM
zd)@!7TfaPV3Tu4g&Of7tDbi@7+6B>+14(P6cKU4G{Io}?bw_lJg16ctTN8EPoliri
z#ko&CbRvkS_w1$9D#hDAeu`AfpW?s9ea;WT?8WQ$@9|2Iy!InByiO`^{{)?DS2oH@
zJlcOxpJl?C2D{wn**D%*t?{1Dozl7S<hyi_n7f%#Wm9I|GMQQRe(kjE<rzLHZrh9_
z|LmBff79i`6|G=S>l+yk|8KXuTnqU+`~B3vbxhL9ujWOZlhXZb?0!?<$fB>l{oF-?
zn?GJ`-uQ}f)pYSU_Q!IL@~)bmHa%4?d#3t_Q`6l_XVxbD(p~=OL{Z%4ALWm2M41!s
zF78~L_=s1oH1go`%MzNI@9V<0-ukH>+MTs;#?i7J6Z+J&7fpS9ammwv8;&p3R@eM`
z+2&UEuGndveqGCR+N*y44G)R*&l7mJbAz5G54XV9$#br)S@28Ay71ic=1n;Zzdkvf
zQI@c6$KLpJwb`nxea`<WGwJ=XOS`o6)W=^%d;I2Y?RmTFZ|CW48<)q;sPS0a?7w#7
zX(MTAj+xgsUMNoAbF+(8pW}PXjZOB_mjeIDRlks{KDX6NPxExV-@H>nGdC<Xc&NVp
z*{ms(<NhtZ)ly=;X5Rlj>mrQ4)NZ<TBk;$o62tt{+FqZIZD`Z{X0dkiERDI^kv^|Z
zaGOQGza)NX&C{G)MxiU`r~RMbQmwa!`&Z<wcUQR%?^nB}J*!z~{x4s>4WA>5_jf3N
zYq(y0bi<M@wL-Q#j!V5u<-1qibDpD0-|u7Mi%*_s-`w3DVeLNaXXy6hng18{SDmU}
zzq)>x@3h-HXEkk|Q~O$G4a24?op0Ja`I;7m)+dyT|Aa;O$MR`L`+vQ4bnlj?^Iv$^
zJea$=KgumdKKY*A8~-CQ=YOdxZ8VlMw4YLRI$`ow4WE43e-kSAhqFZfw=q+ZZSsrU
zedB1cn()6Rs*B4`#LMNJ`>Ha3bMow<lRghK?bF!GjOT00os5#Y{6^!X+UYCVdRxz)
zk4T-#q}8o6d1-~hIholDOhWv7<W3)B;%q*<yKjA1%GUVz$}BfiruMk(TB|?d>xJi2
z{jPmxPL5cvRl;1TE%xf$wCB2#A+wF#e&)rPe|pvO^-Ya=vewxZe?KghTC<S*ZpH41
zCAW7U*50VI_RfEk-@jhyFSqGxjS76Ox>WpVes7XzWY+e5%dV&iZ(sG8_mTCUuR+(e
zTJ@vtRz=Kt!1vE->C~tCc?q%XpC_pvH|A!}$WHszl<Q`(;JBdKv7a`jny;nLygima
z;a26U#N?XI90%oG`6gXq;bOZqH6Sc5;PPpmd&;?D0tdHQHoRG)`MHH5d-|m7fnh<P
zXYg-1)fZ;Fv}a{z!W%8IPL-Z*#}ymzs?G73FS_8%P3<*nu4rF5S#c(CTAjbj?uW;|
zq`hLla%=m7t&^_*Pk4DZ|E$q#ldSL8k8ix#f5h&^DZiX4C*I2*x~21YlOAJ}>B$AL
z;qF<-w=P!Rziz{(Wi3;B%pUAoP&lPjxv<r0Qg)45(KM+AVduYjTV|c&&$jHpsw-vx
z#j;m-oy_O!N{*4Q>@%)(i`p-_dHweq>9RHFR6@j_%l|wRWO!R==9jnS3thH6m?&Of
zJZI5E@%_iF7B2g9_vTLLDZ8$&__!sb&oo(h(H~aR`Pt$(Qciw(9CuC7r{+k3Y0!z;
zUk=Y~@yorpQh)YJgZ>+n)I!e8I$AQN<Lb_%Q&~2?koj|>IcJWxw@~a0H>v!{gDZ<!
zzC32U#@7F7HhbjqPfw>ym2QnaXsO)$;7XdewD7db><C^@-zF0u<E5b?k$wgxE5m)J
z)om-ARrSzun$Fyl;rt%2LS)S?Rwg8UF4~c+wD<i!?ro)CBI5l^okF*?OkST-K3A&#
zmo(dNpOrdcUXd>KZ>C?AbA7cS=S|&;Yn2D*U3?!G;$F8&!v5yIz6)A~(y8;63b#0a
z3SV}A=j(#|eY}}+<?RtWWB<(53Hx#9!}_(;51G8)moxjoO|}cCUPL@n%l%-dCX!<E
z@z*geoxR7e{SxCjmidQ&UY@$eqSW1?j|!4r%y|B^q|EKd>giRiML$}YBCm>X@LoH6
zp4f@KL2aA9@$D}=ex=<^K8bJdkv7daPmk_=WvVANVMo>{3GPDmE+J2)$xL!QTfTE%
z>G{TY<Mck6HE#_{mi@VR(>Ym3Zjw}c-|NjrMox3hmwI1&B2d=Ev*q-x&*@y3zh78+
z$YkM^xjM6UyxtkyvPEfml;CaqZz~t9$$f4)HA86A&GpBk&2HWPka)cM+N(WbRZUMi
z`Q6Wc=1}hwPkPZUD?LBFsa<@g%kr9QXT+Yf&)FMkzW(F+vsve(GdG?yysX}ITeJK9
z^ryE|eMRCz7fqhmf2TjZ{2k+7qg@X_uRo)Z@0#zkxcsLDW4Fok%cl*u-utKAFZAzT
z=pW-$&DZryb;a}a7qy6SJ~UZhe&@lPzQu<^Rl^SKT%UFAFW)JqhD{6$bQm<a8DPxs
zOLgYe7ChN!95-##mu;3-FF0J}Yo_T+z2AL*-aI|$NXOiLCK}xGcU2=6X8x7muGzg@
zr{?9msQ=$JPixM4BXMZindbh3ed@PV9x==ooioY&+kWNKWu@-Un-^}Hlw|5PYx4i<
zO+HpT)^yCZno>GF(kZe*XQ7V6v;Z*$Zw>AVs#91`2%Re0S02X^Fn!za4QsSzQv&Dh
z5cc$Ro6zmLCAQ*3$&M`%&!#Nb@tU-%#b@2+q{9z0*Yn#47;Mv=GF9P-{x%bzpT)nU
zUmJB_F_QY>vAi+*_JykRxl-jOcT@8pXt_sj31j~sruqEJq<tYvQ#UMMdx6h%b=sAy
z&2R3_ytC(O`!Xw=-1YXVz9GV*hvT@{S6vF-cFI>w^?ckF0nfeBZFhek;+0*uKWlGA
z&h)EhUuQqQcAOz4&|4~hRqAfd2_c<!I)zu{K3?o7en0K$($M2J)BCEYx}H3eYZV#3
z!%1xCc6+OJc6FJPuhzX=vN`7A*UK05<L2tGFnoPfUq5F3i>^I;{;l%X3a>nQDL#J2
zfjnE~w55NabyWFFnOErVK72y$w20FG6_ao7l?gLeJH0R=f7{L4n?+y01%1_%{HnKP
zj;d*NSW=|pVW;Dp)n}zH{#s_|S*ud@a*NN?HR+R&zS|bcXEgEb%vIlWC69ey>z%TS
zbMH#FH1-WTUT@y%2=Ba;yO%p>YQK)xte`N>l+1T$kDmW6SEJE+_qv+*WcJTZIn$4}
z*rff~>&08wKP~v^yV)f%7k=;DSUV%x%&qLtIotgI<&*r(Ce7Su8&jY#tzT#FZ}Zs5
zsYd6s3w5S(sM!3x6k&G%d&PSRZ?{IB;Nu$t*Xt`;e|z%#e0aR`habU<Jxb0WJTrCU
z|E66s8(rLed43v%OU0Ki)_%6&w7ZzSU)0%oEApq_Xo_Gtw0m+)yl(Zo*L%}tvZlTK
z{p0)<n_c>`i}y(%`miy5y4=3sC2wvXF0_0jw}C-#!NtxEQ}av@3jh7?wDZKGl(#vH
zLZeoN`lm+9FHp#3P0gG%D|ZR&rk?QiH{Wf$)St1VN0)d0x_ciJ_PeNQ_en1gt^T^T
zefvBA*ekvI@{ylzM^?79{P~o(Z;i~PAhrJq_d|8oEIr4#_pgD;#CWbau@nEVOx5Ly
zeQq+(X#Kih3^%`U9%mKONIf;>gVWP@Q`mUJ#8(?t8YLw?5$D~(mbH5B@3rBLI>pax
zSF+ZAJJ1-v;$K<Y*Zm*VPjB{dTHk+PIH~WGV9fV@4~yO$6c&=-{?z5kt;@ZA)zS6h
z6II(}^i~*7`F=k1ci8O8e--KF+3V*`xbW}xqO^NI%ue52?3n6kd1}_%?SH#>Fi)Dc
zH_Tm6Q(OI;YLv$6=YrRE8hxp8fB!EvKk4<)&4ClXZ1@nk;-KlaQ)}7nlUze}H*WA!
z(ypzz^vmbt;eehQuNAbn3QtarxzjsIZ*%+wuU2N&$wx}pZw}E&wS3oAG->Jmc~d?!
zzs}NHEarE~H|OTod;h0xP43WIeLG3(_4O@Pb39@mez@RJzW-P5(zjLudkYFws(V}{
zf4Od}dGf<N?qt^Wg)b6+L>9A03QxXjc<z1p)Y7e+uf?n^&n;aybMs#D?vC5n&aK!o
zH>h^;H0zrS&fR&^YdleX_U|8_3z;<aw5JwNFjWq;O^C0XY5JzBIihv;-}0<oCn9g`
zKGvbBv#T(5s%xzh|4RR2R{xHWU(a89rDxxh`F3iqQf&M2<E*z?U#xy!wdvu7A7!;`
z(@!XL27D}7ecbNVqTt#3Kd)!$y*NC*f6dN^fos$i+<d=nw<{7dQSKC(oT_xsJ!jL3
zWqz^SOgtjDUlH$T)2J4#urxl9?&c%^jZ0<ctM_^@X6|yj=9+uaf=TjLfwHvJy`Bww
z?@TPeZLqC^+5e37pSA1V6dmhx=B_K#J>)+>{Mv^n6UB_L3ii!A5wH8BuabX$uw|O;
zpYmPeokppl%Z@iB`-EjIProYks(t6%q|=Kwb?jVwJ$>dWqXc&@X&%kx&7Frke3+;G
z;=kN>wK{yQS50r$^~kecoa=bn6%~_T&**YFS@GlKl)ch7b_Uo)G>cDJy3g;jv8vV5
zH~z76-c-gZ>?wRv_q9K<Blo|RazIe(r?|cniG6wVZcU#1M|Jj`X_Ko%_###N`1{Xj
zihticuc7AU@>R;mw&iU}+co)NpVa$h_jj&WOQ~KJ?RzVaS&5_fUqh)x%BM>U6i>fC
z|8Kv=H=doEUrM%dTScv#5GoRRWxnFq-d9Jtf`9aF)>M}Fy*lZ@mq)qN<~&~bMb@Ry
zV_W*--reT9q8T+|tB-DLk9D7W%QA89*T4scv$ZZfk$=Enr?F|<=Zmw~uX?}Kv~bcT
z$?G%a-|cV-_nD?xcsFzHVON>tb~El9htDs`^uFyXKUwLLRpfo+Me(jL4({JLdGG7f
zNz<49OO#BSKJ~E)S9ra(V)lN|hdEkPZe33KdOw$QcTQo%)Tt(2!9~}8zl?r-H#c!g
zSbpi1%bTyvJhWu$$Nby(Ggs$LQc9MZG3VW@_+?u*wap9qn!}y8>6zR1#G@Bq{o7=x
zJXhJWS9nkCHSSpF@>}=fSBcKItN(VqOlY=<&9l`fyfTbul`gY;bL~rp@meX5pEYR#
z@wcPze)%-#gWvX>DXlIRPbd3CexIDZ?7x@x>FV3+r=D+_Bj$50=X-a~UGA&Z8b>eL
zOZ;YduhVtA<FD8Bxjh;8AO22Nzsvq_*0q$2%#AaxnK*YA<$O-mx;)|daXl6F>uJ3w
z%t~TD?)~ZTHk&v8{ja_CiPz0Me!WcVHFTfqx7RYLUdhm2V{ekr#ShaX`-9hLujpi5
z)!%Wu{_3u$HhsTNIZ0{oK4HHaFSe_)Qgo8u>&;huR|!UW|2>`jq%EkkE&TS@sWsuH
zztj2N{|>AAursdn`8|obPDjq3weKuEd6cVk|I*&|_l#2XJKxUiTD0FQV8hH?^QN5B
z@HifocTdVMVYPAPwH2qGHkwp?3-Ujdx4FDRup%-zD)RrKbN?5v`NHS;#<Kh&Z~jxQ
m)r%U@n=ME!m=VqV&%b`P;X}qQ;~)kG1_n=8KbLh*2~7Z0H7_Rs

literal 0
HcmV?d00001

diff --git a/frontend/src/models/system.js b/frontend/src/models/system.js
index e64b01199..f8f123448 100644
--- a/frontend/src/models/system.js
+++ b/frontend/src/models/system.js
@@ -332,7 +332,7 @@ const System = {
       })
       .then((blob) => (blob ? URL.createObjectURL(blob) : null))
       .catch((e) => {
-        console.log(e);
+        // console.log(e);
         return null;
       });
   },
diff --git a/frontend/src/models/workspace.js b/frontend/src/models/workspace.js
index 91f4a2db3..64732c044 100644
--- a/frontend/src/models/workspace.js
+++ b/frontend/src/models/workspace.js
@@ -272,6 +272,21 @@ const Workspace = {
         return false;
       });
   },
+  // Fetches the TTS audio for a chat and resolves to an object URL for it,
+  // or to null when the request fails or returns 204 (no content).
+  ttsMessage: async function (slug, chatId) {
+    const url = `${API_BASE}/workspace/${slug}/tts/${chatId}`;
+    const init = { method: "GET", cache: "no-cache", headers: baseHeaders() };
+    try {
+      const res = await fetch(url, init);
+      const unusable = !res.ok || res.status === 204;
+      if (unusable) throw new Error("Failed to fetch TTS.");
+      const blob = await res.blob();
+      return blob ? URL.createObjectURL(blob) : null;
+    } catch {
+      return null;
+    }
+  },
   threads: WorkspaceThread,
 
   uploadPfp: async function (formData, slug) {
@@ -302,7 +317,7 @@ const Workspace = {
       })
       .then((blob) => (blob ? URL.createObjectURL(blob) : null))
       .catch((e) => {
-        console.log(e);
+        // console.log(e);
         return null;
       });
   },
diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx
new file mode 100644
index 000000000..c4abaf546
--- /dev/null
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx
@@ -0,0 +1,45 @@
+import React, { useEffect, useState, useRef } from "react";
+import { isMobile } from "react-device-detect";
+import Sidebar from "@/components/SettingsSidebar";
+import System from "@/models/system";
+import PreLoader from "@/components/Preloader";
+import SpeechToTextProvider from "./stt";
+import TextToSpeechProvider from "./tts";
+
+/**
+ * Settings page hosting the speech-to-text and text-to-speech provider forms.
+ * System settings are fetched once on mount and passed to both forms; a
+ * preloader is shown until that request settles.
+ */
+export default function AudioPreference() {
+  const [settings, setSettings] = useState(null);
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    System.keys()
+      .then(setSettings)
+      .catch((e) => console.error(e))
+      .finally(() => setLoading(false)); // never strand the preloader
+  }, []);
+
+  return (
+    <div className="w-screen h-screen overflow-hidden bg-sidebar flex">
+      <Sidebar />
+      <div
+        style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
+        className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
+      >
+        {loading ? (
+          <div className="w-full h-full flex justify-center items-center">
+            <PreLoader />
+          </div>
+        ) : (
+          <>
+            <SpeechToTextProvider settings={settings} />
+            <TextToSpeechProvider settings={settings} />
+          </>
+        )}
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx
new file mode 100644
index 000000000..58bb1489b
--- /dev/null
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx
@@ -0,0 +1,191 @@
+import React, { useEffect, useState, useRef } from "react";
+import System from "@/models/system";
+import showToast from "@/utils/toast";
+import LLMItem from "@/components/LLMSelection/LLMItem";
+import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react";
+import CTAButton from "@/components/lib/CTAButton";
+import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
+import BrowserNative from "@/components/SpeechToText/BrowserNative";
+
+const PROVIDERS = [ // speech-to-text providers offered in the picker below
+  {
+    name: "System native", // label shown in the picker; search matches on it
+    value: "native", // sent as SpeechToTextProvider when the form is saved
+    logo: AnythingLLMIcon,
+    options: (settings) => <BrowserNative settings={settings} />, // rendered under the picker
+    description: "Uses your browser's built in STT service if supported.",
+  },
+];
+
+/**
+ * Settings form for choosing the speech-to-text provider and its options.
+ * `settings.SpeechToTextProvider` seeds the selection ("native" when unset).
+ */
+export default function SpeechToTextProvider({ settings }) {
+  const [saving, setSaving] = useState(false);
+  const [hasChanges, setHasChanges] = useState(false);
+  const [searchQuery, setSearchQuery] = useState("");
+  const [selectedProvider, setSelectedProvider] = useState(
+    settings?.SpeechToTextProvider || "native"
+  );
+  const [searchMenuOpen, setSearchMenuOpen] = useState(false);
+  const searchInputRef = useRef(null);
+
+  const handleSubmit = async (e) => {
+    // The save button's onClick calls this without an event; only the form's
+    // onSubmit supplies one, so event-less calls are ignored, not crashed on.
+    if (!e) return;
+    e.preventDefault();
+    const data = { SpeechToTextProvider: selectedProvider };
+    const formData = new FormData(e.target);
+    for (const [key, value] of formData.entries()) data[key] = value;
+    // Raise the flag before awaiting so the "Saving..." label can render.
+    setSaving(true);
+    try {
+      const { error } = await System.updateSystem(data);
+      if (error) {
+        showToast(`Failed to save preferences: ${error}`, "error");
+      } else {
+        showToast("Speech-to-text preferences saved successfully.", "success");
+      }
+      setHasChanges(!!error);
+    } finally {
+      setSaving(false);
+    }
+  };
+
+  const updateProviderChoice = (selection) => {
+    setSearchQuery("");
+    setSelectedProvider(selection);
+    setSearchMenuOpen(false);
+    setHasChanges(true);
+  };
+
+  // Clears the search text if there is any; otherwise toggles the menu.
+  const handleXButton = () => {
+    if (searchQuery.length === 0) return setSearchMenuOpen(!searchMenuOpen);
+    setSearchQuery("");
+    if (searchInputRef.current) searchInputRef.current.value = "";
+  };
+
+  const filteredProviders = PROVIDERS.filter((provider) =>
+    provider.name.toLowerCase().includes(searchQuery.toLowerCase())
+  );
+
+  // Fall back to the first provider when the stored value matches none.
+  const selectedProviderObject =
+    PROVIDERS.find((provider) => provider.value === selectedProvider) ??
+    PROVIDERS[0];
+
+  return (
+    <form onSubmit={handleSubmit} className="flex w-full">
+      <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
+        <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10">
+          <div className="flex gap-x-4 items-center">
+            <p className="text-lg leading-6 font-bold text-white">
+              Speech-to-text Preference
+            </p>
+          </div>
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
+            Here you can specify what kind of text-to-speech and speech-to-text
+            providers you would want to use in your AnythingLLM experience. By
+            default, we use the browser's built in support for these services,
+            but you may want to use others.
+          </p>
+        </div>
+        <div className="w-full justify-end flex">
+          {hasChanges && (
+            <CTAButton
+              onClick={() => handleSubmit()}
+              className="mt-3 mr-0 -mb-14 z-10"
+            >
+              {saving ? "Saving..." : "Save changes"}
+            </CTAButton>
+          )}
+        </div>
+        <div className="text-base font-bold text-white mt-6 mb-4">Provider</div>
+        <div className="relative">
+          {searchMenuOpen && (
+            <div
+              className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10"
+              onClick={() => setSearchMenuOpen(false)}
+            />
+          )}
+          {searchMenuOpen ? (
+            <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20">
+              <div className="w-full flex flex-col gap-y-1">
+                <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]">
+                  <MagnifyingGlass
+                    size={20}
+                    weight="bold"
+                    className="absolute left-4 z-30 text-white -ml-4 my-2"
+                  />
+                  <input
+                    type="text"
+                    name="stt-provider-search"
+                    autoComplete="off"
+                    placeholder="Search speech to text providers"
+                    className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium"
+                    onChange={(e) => setSearchQuery(e.target.value)}
+                    ref={searchInputRef}
+                    onKeyDown={(e) => {
+                      if (e.key === "Enter") e.preventDefault();
+                    }}
+                  />
+                  <X
+                    size={20}
+                    weight="bold"
+                    className="cursor-pointer text-white hover:text-[#9CA3AF]"
+                    onClick={handleXButton}
+                  />
+                </div>
+                <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4">
+                  {filteredProviders.map((provider) => (
+                    <LLMItem
+                      key={provider.name}
+                      name={provider.name}
+                      value={provider.value}
+                      image={provider.logo}
+                      description={provider.description}
+                      checked={selectedProvider === provider.value}
+                      onClick={() => updateProviderChoice(provider.value)}
+                    />
+                  ))}
+                </div>
+              </div>
+            </div>
+          ) : (
+            <button
+              className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent hover:border-[#46C8FF] transition-all duration-300"
+              type="button"
+              onClick={() => setSearchMenuOpen(true)}
+            >
+              <div className="flex gap-x-4 items-center">
+                <img
+                  src={selectedProviderObject.logo}
+                  alt={`${selectedProviderObject.name} logo`}
+                  className="w-10 h-10 rounded-md"
+                />
+                <div className="flex flex-col text-left">
+                  <div className="text-sm font-semibold text-white">
+                    {selectedProviderObject.name}
+                  </div>
+                  <div className="mt-1 text-xs text-[#D2D5DB]">
+                    {selectedProviderObject.description}
+                  </div>
+                </div>
+              </div>
+              <CaretUpDown size={24} weight="bold" className="text-white" />
+            </button>
+          )}
+        </div>
+        <div
+          onChange={() => setHasChanges(true)}
+          className="mt-4 flex flex-col gap-y-1"
+        >
+          {selectedProviderObject.options(settings)}
+        </div>
+      </div>
+    </form>
+  );
+}
diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
new file mode 100644
index 000000000..6b11f1a46
--- /dev/null
+++ b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx
@@ -0,0 +1,209 @@
+import React, { useEffect, useState, useRef } from "react";
+import System from "@/models/system";
+import showToast from "@/utils/toast";
+import LLMItem from "@/components/LLMSelection/LLMItem";
+import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react";
+import CTAButton from "@/components/lib/CTAButton";
+import OpenAiLogo from "@/media/llmprovider/openai.png";
+import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png";
+import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png";
+import BrowserNative from "@/components/TextToSpeech/BrowserNative";
+import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions";
+import ElevenLabsTTSOptions from "@/components/TextToSpeech/ElevenLabsOptions";
+
+const PROVIDERS = [ // text-to-speech providers offered in the picker below
+  {
+    name: "System native", // label shown in the picker; search matches on it
+    value: "native", // sent as TextToSpeechProvider when the form is saved
+    logo: AnythingLLMIcon,
+    options: (settings) => <BrowserNative settings={settings} />, // rendered under the picker
+    description: "Uses your browser's built in TTS service if supported.",
+  },
+  {
+    name: "OpenAI",
+    value: "openai",
+    logo: OpenAiLogo,
+    options: (settings) => <OpenAiTTSOptions settings={settings} />,
+    description: "Use OpenAI's text to speech voices.",
+  },
+  {
+    name: "ElevenLabs",
+    value: "elevenlabs",
+    logo: ElevenLabsIcon,
+    options: (settings) => <ElevenLabsTTSOptions settings={settings} />,
+    description: "Use ElevenLabs's text to speech voices and technology.",
+  },
+];
+
+/**
+ * Settings form for choosing the text-to-speech provider and its options.
+ * `settings.TextToSpeechProvider` seeds the selection ("native" when unset).
+ */
+export default function TextToSpeechProvider({ settings }) {
+  const [saving, setSaving] = useState(false);
+  const [hasChanges, setHasChanges] = useState(false);
+  const [searchQuery, setSearchQuery] = useState("");
+  const [selectedProvider, setSelectedProvider] = useState(
+    settings?.TextToSpeechProvider || "native"
+  );
+  const [searchMenuOpen, setSearchMenuOpen] = useState(false);
+  const searchInputRef = useRef(null);
+
+  const handleSubmit = async (e) => {
+    // The save button's onClick calls this without an event; only the form's
+    // onSubmit supplies one, so event-less calls are ignored, not crashed on.
+    if (!e) return;
+    e.preventDefault();
+    const data = { TextToSpeechProvider: selectedProvider };
+    const formData = new FormData(e.target);
+    for (const [key, value] of formData.entries()) data[key] = value;
+    // Raise the flag before awaiting so the "Saving..." label can render.
+    setSaving(true);
+    try {
+      const { error } = await System.updateSystem(data);
+      if (error) {
+        showToast(`Failed to save preferences: ${error}`, "error");
+      } else {
+        showToast("Text-to-speech preferences saved successfully.", "success");
+      }
+      setHasChanges(!!error);
+    } finally {
+      setSaving(false);
+    }
+  };
+
+  const updateProviderChoice = (selection) => {
+    setSearchQuery("");
+    setSelectedProvider(selection);
+    setSearchMenuOpen(false);
+    setHasChanges(true);
+  };
+
+  // Clears the search text if there is any; otherwise toggles the menu.
+  const handleXButton = () => {
+    if (searchQuery.length === 0) return setSearchMenuOpen(!searchMenuOpen);
+    setSearchQuery("");
+    if (searchInputRef.current) searchInputRef.current.value = "";
+  };
+
+  const filteredProviders = PROVIDERS.filter((provider) =>
+    provider.name.toLowerCase().includes(searchQuery.toLowerCase())
+  );
+
+  // Fall back to the first provider when the stored value matches none.
+  const selectedProviderObject =
+    PROVIDERS.find((provider) => provider.value === selectedProvider) ??
+    PROVIDERS[0];
+
+  return (
+    <form onSubmit={handleSubmit} className="flex w-full">
+      <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
+        <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10">
+          <div className="flex gap-x-4 items-center">
+            <p className="text-lg leading-6 font-bold text-white">
+              Text-to-speech Preference
+            </p>
+          </div>
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
+            Here you can specify what kind of text-to-speech providers you would
+            want to use in your AnythingLLM experience. By default, we use the
+            browser's built in support for these services, but you may want to
+            use others.
+          </p>
+        </div>
+        <div className="w-full justify-end flex">
+          {hasChanges && (
+            <CTAButton
+              onClick={() => handleSubmit()}
+              className="mt-3 mr-0 -mb-14 z-10"
+            >
+              {saving ? "Saving..." : "Save changes"}
+            </CTAButton>
+          )}
+        </div>
+        <div className="text-base font-bold text-white mt-6 mb-4">Provider</div>
+        <div className="relative">
+          {searchMenuOpen && (
+            <div
+              className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10"
+              onClick={() => setSearchMenuOpen(false)}
+            />
+          )}
+          {searchMenuOpen ? (
+            <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20">
+              <div className="w-full flex flex-col gap-y-1">
+                <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]">
+                  <MagnifyingGlass
+                    size={20}
+                    weight="bold"
+                    className="absolute left-4 z-30 text-white -ml-4 my-2"
+                  />
+                  <input
+                    type="text"
+                    name="tts-provider-search"
+                    autoComplete="off"
+                    placeholder="Search text to speech providers"
+                    className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium"
+                    onChange={(e) => setSearchQuery(e.target.value)}
+                    ref={searchInputRef}
+                    onKeyDown={(e) => {
+                      if (e.key === "Enter") e.preventDefault();
+                    }}
+                  />
+                  <X
+                    size={20}
+                    weight="bold"
+                    className="cursor-pointer text-white hover:text-[#9CA3AF]"
+                    onClick={handleXButton}
+                  />
+                </div>
+                <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4">
+                  {filteredProviders.map((provider) => (
+                    <LLMItem
+                      key={provider.name}
+                      name={provider.name}
+                      value={provider.value}
+                      image={provider.logo}
+                      description={provider.description}
+                      checked={selectedProvider === provider.value}
+                      onClick={() => updateProviderChoice(provider.value)}
+                    />
+                  ))}
+                </div>
+              </div>
+            </div>
+          ) : (
+            <button
+              className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent hover:border-[#46C8FF] transition-all duration-300"
+              type="button"
+              onClick={() => setSearchMenuOpen(true)}
+            >
+              <div className="flex gap-x-4 items-center">
+                <img
+                  src={selectedProviderObject.logo}
+                  alt={`${selectedProviderObject.name} logo`}
+                  className="w-10 h-10 rounded-md"
+                />
+                <div className="flex flex-col text-left">
+                  <div className="text-sm font-semibold text-white">
+                    {selectedProviderObject.name}
+                  </div>
+                  <div className="mt-1 text-xs text-[#D2D5DB]">
+                    {selectedProviderObject.description}
+                  </div>
+                </div>
+              </div>
+              <CaretUpDown size={24} weight="bold" className="text-white" />
+            </button>
+          )}
+        </div>
+        <div
+          onChange={() => setHasChanges(true)}
+          className="mt-4 flex flex-col gap-y-1"
+        >
+          {selectedProviderObject.options(settings)}
+        </div>
+      </div>
+    </form>
+  );
+}
diff --git a/frontend/src/utils/paths.js b/frontend/src/utils/paths.js
index 4dc4d5285..cc2b69eee 100644
--- a/frontend/src/utils/paths.js
+++ b/frontend/src/utils/paths.js
@@ -98,6 +98,9 @@ export default {
     transcriptionPreference: () => {
       return "/settings/transcription-preference";
     },
+    // Path of the audio (speech-to-text / text-to-speech) settings page;
+    // a function like the neighbouring entries so call sites stay uniform.
+    audioPreference: () => "/settings/audio-preference",
     embedder: {
       modelPreference: () => "/settings/embedding-preference",
       chunkingPreference: () => "/settings/text-splitter-preference",
diff --git a/frontend/yarn.lock b/frontend/yarn.lock
index bd12e9fa3..93bdc0884 100644
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@@ -2841,6 +2841,11 @@ react-smooth@^4.0.0:
     prop-types "^15.8.1"
     react-transition-group "^4.4.5"
 
+react-speech-recognition@^3.10.0:
+  version "3.10.0"
+  resolved "https://registry.yarnpkg.com/react-speech-recognition/-/react-speech-recognition-3.10.0.tgz#7aa43bb28d78b92671864dabba3a70489ccad27b"
+  integrity sha512-EVSr4Ik8l9urwdPiK2r0+ADrLyDDrjB0qBRdUWO+w2MfwEBrj6NuRmy1GD3x7BU/V6/hab0pl8Lupen0zwlJyw==
+
 react-tag-input-component@^2.0.2:
   version "2.0.2"
   resolved "https://registry.yarnpkg.com/react-tag-input-component/-/react-tag-input-component-2.0.2.tgz#f62f013c6a535141dd1c6c3a88858223170150f1"
diff --git a/server/.env.example b/server/.env.example
index 290a07096..5e0233b7b 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -168,6 +168,19 @@ WHISPER_PROVIDER="local"
 # WHISPER_PROVIDER="openai"
 # OPEN_AI_KEY=sk-xxxxxxxx
 
+###########################################
+######## TTS/STT Model Selection ##########
+###########################################
+TTS_PROVIDER="native"
+
+# TTS_PROVIDER="openai"
+# TTS_OPEN_AI_KEY=sk-example
+# TTS_OPEN_AI_VOICE_MODEL=nova
+
+# TTS_PROVIDER="elevenlabs"
+# TTS_ELEVEN_LABS_KEY=
+# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
+
 # CLOUD DEPLOYMENT VARIRABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # STORAGE_DIR= # absolute filesystem path with no trailing slash
diff --git a/server/endpoints/workspaces.js b/server/endpoints/workspaces.js
index c22c679a0..81cbd6154 100644
--- a/server/endpoints/workspaces.js
+++ b/server/endpoints/workspaces.js
@@ -1,6 +1,11 @@
 const path = require("path");
 const fs = require("fs");
-const { reqBody, multiUserMode, userFromSession } = require("../utils/http");
+const {
+  reqBody,
+  multiUserMode,
+  userFromSession,
+  safeJsonParse,
+} = require("../utils/http");
 const { normalizePath } = require("../utils/files");
 const { Workspace } = require("../models/workspace");
 const { Document } = require("../models/documents");
@@ -25,6 +30,7 @@ const {
   determineWorkspacePfpFilepath,
   fetchPfp,
 } = require("../utils/files/pfp");
+const { getTTSProvider } = require("../utils/TextToSpeech");
 
 function workspaceEndpoints(app) {
   if (!app) return;
@@ -506,6 +512,48 @@ function workspaceEndpoints(app) {
     }
   );
 
+  // Serve TTS audio for a chat: 200 audio/mpeg, 204 when there is no text or
+  // no audio, 404 for an unknown chat. NOTE(review): `responseCache` is not
+  // declared in the hunks visible here - confirm it exists in this module.
+  app.get(
+    "/workspace/:slug/tts/:chatId",
+    [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug],
+    async function (request, response) {
+      try {
+        const { chatId } = request.params;
+        const workspace = response.locals.workspace;
+        const cacheKey = `${workspace.slug}:${chatId}`;
+        // Serve previously generated audio without touching the database.
+        const cachedResponse = responseCache.get(cacheKey);
+        if (cachedResponse) {
+          response.writeHead(200, {
+            "Content-Type": cachedResponse.mime || "audio/mpeg",
+          });
+          return response.end(cachedResponse.buffer);
+        }
+
+        const wsChat = await WorkspaceChats.get({
+          id: Number(chatId),
+          workspaceId: workspace.id,
+        });
+        // Guard: a falsy lookup would otherwise throw on `wsChat.response`.
+        if (!wsChat) return response.sendStatus(404);
+
+        const text = safeJsonParse(wsChat.response, null)?.text;
+        if (!text) return response.sendStatus(204);
+        const buffer = await getTTSProvider().ttsBuffer(text);
+        if (buffer === null) return response.sendStatus(204);
+
+        responseCache.set(cacheKey, { buffer, mime: "audio/mpeg" });
+        response.writeHead(200, { "Content-Type": "audio/mpeg" });
+        response.end(buffer);
+      } catch (error) {
+        console.error("Error processing the TTS request:", error);
+        response.status(500).json({ message: "TTS could not be completed" });
+      }
+    }
+  );
+
   app.get(
     "/workspace/:slug/pfp",
     [validatedRequest, flexUserRoleValid([ROLES.all])],
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index 904c448d5..248ca8cd7 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -131,6 +131,17 @@ const SystemSettings = {
       // --------------------------------------------------------
       WhisperProvider: process.env.WHISPER_PROVIDER || "local",
 
+      // --------------------------------------------------------
+      // TTS/STT Selection Settings & Configs
+      // - Providers: the browser's native (built-in) engine, OpenAI, or ElevenLabs
+      // --------------------------------------------------------
+      TextToSpeechProvider: process.env.TTS_PROVIDER || "native",
+      TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY,
+      TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL,
+      // Eleven Labs TTS
+      TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY,
+      TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL,
+
       // --------------------------------------------------------
       // Agent Settings & Configs
       // --------------------------------------------------------
diff --git a/server/package.json b/server/package.json
index edee71b02..73b947c46 100644
--- a/server/package.json
+++ b/server/package.json
@@ -44,6 +44,7 @@
     "cohere-ai": "^7.9.5",
     "cors": "^2.8.5",
     "dotenv": "^16.0.3",
+    "elevenlabs": "^0.5.0",
     "express": "^4.18.2",
     "express-ws": "^5.0.2",
     "extract-json-from-string": "^1.0.1",
diff --git a/server/utils/TextToSpeech/elevenLabs/index.js b/server/utils/TextToSpeech/elevenLabs/index.js
new file mode 100644
index 000000000..e3d25f3ae
--- /dev/null
+++ b/server/utils/TextToSpeech/elevenLabs/index.js
@@ -0,0 +1,54 @@
+const { ElevenLabsClient, stream } = require("elevenlabs");
+
+// ElevenLabs-backed TTS provider, configured from TTS_ELEVEN_LABS_* env vars.
+class ElevenLabsTTS {
+  constructor() {
+    const apiKey = process.env.TTS_ELEVEN_LABS_KEY;
+    if (!apiKey) throw new Error("No ElevenLabs API key was set.");
+    this.elevenLabs = new ElevenLabsClient({ apiKey });
+    // Rachel as default voice (https://api.elevenlabs.io/v1/voices)
+    // `||` rather than `??` so an empty env value also gets the default.
+    this.voiceId =
+      process.env.TTS_ELEVEN_LABS_VOICE_MODEL || "21m00Tcm4TlvDq8ikWAM";
+    this.modelId = "eleven_multilingual_v2";
+  }
+
+  // Lists voices for `apiKey` (env key when omitted); [] if the lookup fails.
+  static async voices(apiKey = null) {
+    try {
+      const client = new ElevenLabsClient({
+        apiKey: apiKey ?? process.env.TTS_ELEVEN_LABS_KEY ?? null,
+      });
+      return (await client.voices.getAll())?.voices ?? [];
+    } catch (e) {
+      console.error("ElevenLabs voice lookup failed:", e?.message ?? e);
+    }
+    return [];
+  }
+
+  // Gathers every chunk of a readable stream into one Buffer.
+  async #stream2buffer(readable) {
+    const chunks = [];
+    for await (const chunk of readable) chunks.push(chunk);
+    return Buffer.concat(chunks);
+  }
+
+  // Synthesizes `textInput`; resolves to audio bytes, or null on any error.
+  async ttsBuffer(textInput) {
+    try {
+      const audio = await this.elevenLabs.generate({
+        voice: this.voiceId,
+        text: textInput,
+        model_id: this.modelId,
+      });
+      return await this.#stream2buffer(audio);
+    } catch (e) {
+      console.error(e);
+    }
+    return null;
+  }
+}
+
+module.exports = {
+  ElevenLabsTTS,
+};
diff --git a/server/utils/TextToSpeech/index.js b/server/utils/TextToSpeech/index.js
new file mode 100644
index 000000000..155fc9540
--- /dev/null
+++ b/server/utils/TextToSpeech/index.js
@@ -0,0 +1,15 @@
+// Builds the server-side TTS client named by TTS_PROVIDER ("openai" if unset).
+function getTTSProvider() {
+  const provider = process.env.TTS_PROVIDER || "openai";
+  if (provider === "openai") {
+    const { OpenAiTTS } = require("./openAi");
+    return new OpenAiTTS();
+  }
+  if (provider === "elevenlabs") {
+    const { ElevenLabsTTS } = require("./elevenLabs");
+    return new ElevenLabsTTS();
+  }
+  throw new Error(`ENV: Unsupported TTS_PROVIDER value "${provider}"!`);
+}
+
+module.exports = { getTTSProvider };
diff --git a/server/utils/TextToSpeech/openAi/index.js b/server/utils/TextToSpeech/openAi/index.js
new file mode 100644
index 000000000..3c5b4840d
--- /dev/null
+++ b/server/utils/TextToSpeech/openAi/index.js
@@ -0,0 +1,29 @@
+class OpenAiTTS {
+  constructor() {
+    const apiKey = process.env.TTS_OPEN_AI_KEY;
+    if (!apiKey) throw new Error("No OpenAI API key was set.");
+    const { OpenAI: OpenAIApi } = require("openai");
+    this.openai = new OpenAIApi({ apiKey });
+    // `||` rather than `??` so an empty env value also falls back to "alloy".
+    this.voice = process.env.TTS_OPEN_AI_VOICE_MODEL || "alloy";
+  }
+
+  // Synthesizes `textInput`; resolves to audio bytes, or null on any error.
+  async ttsBuffer(textInput) {
+    try {
+      const speech = await this.openai.audio.speech.create({
+        model: "tts-1",
+        voice: this.voice,
+        input: textInput,
+      });
+      return Buffer.from(await speech.arrayBuffer());
+    } catch (e) {
+      console.error(e);
+      return null;
+    }
+  }
+}
+
+module.exports = {
+  OpenAiTTS,
+};
diff --git a/server/utils/helpers/customModels.js b/server/utils/helpers/customModels.js
index b7aae93be..caf5a77c7 100644
--- a/server/utils/helpers/customModels.js
+++ b/server/utils/helpers/customModels.js
@@ -4,6 +4,7 @@ const {
 } = require("../AiProviders/openRouter");
 const { perplexityModels } = require("../AiProviders/perplexity");
 const { togetherAiModels } = require("../AiProviders/togetherAi");
+const { ElevenLabsTTS } = require("../TextToSpeech/elevenLabs");
 const SUPPORT_CUSTOM_MODELS = [
   "openai",
   "localai",
@@ -15,6 +16,7 @@ const SUPPORT_CUSTOM_MODELS = [
   "openrouter",
   "lmstudio",
   "koboldcpp",
+  "elevenlabs-tts",
 ];
 
 async function getCustomModels(provider = "", apiKey = null, basePath = null) {
@@ -42,6 +44,8 @@ async function getCustomModels(provider = "", apiKey = null, basePath = null) {
       return await getLMStudioModels(basePath);
     case "koboldcpp":
       return await getKoboldCPPModels(basePath);
+    case "elevenlabs-tts":
+      return await getElevenLabsModels(apiKey);
     default:
       return { models: [], error: "Invalid provider for custom models" };
   }
@@ -321,6 +325,32 @@ function nativeLLMModels() {
   return { models: files, error: null };
 }
 
+/**
+ * Fetch the ElevenLabs voices available to `apiKey` as selectable models,
+ * or a single default "Rachel" entry when none come back.
+ * Side effect: when voices were found and `apiKey` is truthy, that key is
+ * stored in process.env.TTS_ELEVEN_LABS_KEY.
+ */
+async function getElevenLabsModels(apiKey = null) {
+  const voices = await ElevenLabsTTS.voices(apiKey);
+  if (voices.length === 0) {
+    const fallback = {
+      id: "21m00Tcm4TlvDq8ikWAM",
+      organization: "premade",
+      name: "Rachel (default)",
+    };
+    return { models: [fallback], error: null };
+  }
+
+  const models = voices.map(({ voice_id, category, name }) => ({
+    id: voice_id,
+    organization: category,
+    name,
+  }));
+  if (apiKey) process.env.TTS_ELEVEN_LABS_KEY = apiKey;
+  return { models, error: null };
+}
+
 module.exports = {
   getCustomModels,
 };
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index 947fbc624..e2b1d2e1c 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -366,6 +366,32 @@ const KEY_MAPPING = {
     envKey: "AGENT_SERPER_DEV_KEY",
     checks: [],
   },
+
+  // TTS/STT Integration ENVS
+  TextToSpeechProvider: {
+    envKey: "TTS_PROVIDER",
+    checks: [supportedTTSProvider],
+  },
+
+  // TTS OpenAI
+  TTSOpenAIKey: {
+    envKey: "TTS_OPEN_AI_KEY",
+    checks: [validOpenAIKey],
+  },
+  TTSOpenAIVoiceModel: {
+    envKey: "TTS_OPEN_AI_VOICE_MODEL",
+    checks: [],
+  },
+
+  // TTS ElevenLabs
+  TTSElevenLabsKey: {
+    envKey: "TTS_ELEVEN_LABS_KEY",
+    checks: [isNotEmpty],
+  },
+  TTSElevenLabsVoiceModel: {
+    envKey: "TTS_ELEVEN_LABS_VOICE_MODEL",
+    checks: [],
+  },
 };
 
 function isNotEmpty(input = "") {
@@ -419,6 +445,11 @@ function validOllamaLLMBasePath(input = "") {
   }
 }
 
+function supportedTTSProvider(input = "") {
+  const rejection = `${input} is not a valid TTS provider.`; // null = accepted
+  return ["native", "openai", "elevenlabs"].includes(input) ? null : rejection;
+}
+
 function supportedLLM(input = "") {
   const validSelection = [
     "openai",
diff --git a/server/yarn.lock b/server/yarn.lock
index 5edd09a35..9e4f184d5 100644
--- a/server/yarn.lock
+++ b/server/yarn.lock
@@ -1901,6 +1901,11 @@ combined-stream@^1.0.8:
   dependencies:
     delayed-stream "~1.0.0"
 
+command-exists@^1.2.9:
+  version "1.2.9"
+  resolved "https://registry.yarnpkg.com/command-exists/-/command-exists-1.2.9.tgz#c50725af3808c8ab0260fd60b01fbfa25b954f69"
+  integrity sha512-LTQ/SGc+s0Xc0Fu5WaKnR0YiygZkm9eKFvyS+fRsU7/ZWFF8ykFM6Pc9aCVf1+xasOOZpO3BAVgVrKvsqKHV7w==
+
 command-line-args@5.2.1, command-line-args@^5.2.1:
   version "5.2.1"
   resolved "https://registry.yarnpkg.com/command-line-args/-/command-line-args-5.2.1.tgz#c44c32e437a57d7c51157696893c5909e9cec42e"
@@ -2255,6 +2260,18 @@ ee-first@1.1.1:
   resolved "https://registry.yarnpkg.com/ee-first/-/ee-first-1.1.1.tgz#590c61156b0ae2f4f0255732a158b266bc56b21d"
   integrity sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==
 
+elevenlabs@^0.5.0:
+  version "0.5.0"
+  resolved "https://registry.yarnpkg.com/elevenlabs/-/elevenlabs-0.5.0.tgz#07eb1a943b0ab99b925875bd5c57833a3a024e58"
+  integrity sha512-jfex4ecuWIlyAUuMrMJAJNa5MLziqYQOCDw4ZYuoc9PCYLxtHwaYBWpZoDhnYMcceLI7rRRvmbLMcT9HlVMfHA==
+  dependencies:
+    command-exists "^1.2.9"
+    execa "^5.1.1"
+    form-data "4.0.0"
+    node-fetch "2.7.0"
+    qs "6.11.2"
+    url-join "4.0.1"
+
 emoji-regex@^10.2.1:
   version "10.3.0"
   resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-10.3.0.tgz#76998b9268409eb3dae3de989254d456e70cfe23"
@@ -2605,6 +2622,21 @@ eventemitter3@^4.0.4:
   resolved "https://registry.yarnpkg.com/eventemitter3/-/eventemitter3-4.0.7.tgz#2de9b68f6528d5644ef5c59526a1b4a07306169f"
   integrity sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==
 
+execa@^5.1.1:
+  version "5.1.1"
+  resolved "https://registry.yarnpkg.com/execa/-/execa-5.1.1.tgz#f80ad9cbf4298f7bd1d4c9555c21e93741c411dd"
+  integrity sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==
+  dependencies:
+    cross-spawn "^7.0.3"
+    get-stream "^6.0.0"
+    human-signals "^2.1.0"
+    is-stream "^2.0.0"
+    merge-stream "^2.0.0"
+    npm-run-path "^4.0.1"
+    onetime "^5.1.2"
+    signal-exit "^3.0.3"
+    strip-final-newline "^2.0.0"
+
 expand-template@^2.0.3:
   version "2.0.3"
   resolved "https://registry.yarnpkg.com/expand-template/-/expand-template-2.0.3.tgz#6e14b3fcee0f3a6340ecb57d2e8918692052a47c"
@@ -3024,6 +3056,11 @@ get-stream@^5.1.0:
   dependencies:
     pump "^3.0.0"
 
+get-stream@^6.0.0:
+  version "6.0.1"
+  resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-6.0.1.tgz#a262d8eef67aced57c2852ad6167526a43cbf7b7"
+  integrity sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==
+
 get-symbol-description@^1.0.2:
   version "1.0.2"
   resolved "https://registry.yarnpkg.com/get-symbol-description/-/get-symbol-description-1.0.2.tgz#533744d5aa20aca4e079c8e5daf7fd44202821f5"
@@ -3297,6 +3334,11 @@ https-proxy-agent@^7.0.0:
     agent-base "^7.0.2"
     debug "4"
 
+human-signals@^2.1.0:
+  version "2.1.0"
+  resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0"
+  integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==
+
 humanize-ms@^1.2.1:
   version "1.2.1"
   resolved "https://registry.yarnpkg.com/humanize-ms/-/humanize-ms-1.2.1.tgz#c46e3159a293f6b896da29316d8b6fe8bb79bbed"
@@ -4092,6 +4134,11 @@ merge-descriptors@1.0.1:
   resolved "https://registry.yarnpkg.com/merge-descriptors/-/merge-descriptors-1.0.1.tgz#b00aaa556dd8b44568150ec9d1b953f3f90cbb61"
   integrity sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w==
 
+merge-stream@^2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/merge-stream/-/merge-stream-2.0.0.tgz#52823629a14dd00c9770fb6ad47dc6310f2c1f60"
+  integrity sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==
+
 methods@~1.1.2:
   version "1.1.2"
   resolved "https://registry.yarnpkg.com/methods/-/methods-1.1.2.tgz#5529a4d67654134edcc5266656835b0f851afcee"
@@ -4455,6 +4502,13 @@ normalize-path@^3.0.0, normalize-path@~3.0.0:
   resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
   integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==
 
+npm-run-path@^4.0.1:
+  version "4.0.1"
+  resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"
+  integrity sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==
+  dependencies:
+    path-key "^3.0.0"
+
 npmlog@^5.0.1:
   version "5.0.1"
   resolved "https://registry.yarnpkg.com/npmlog/-/npmlog-5.0.1.tgz#f06678e80e29419ad67ab964e0fa69959c1eb8b0"
@@ -4593,7 +4647,7 @@ one-time@^1.0.0:
   dependencies:
     fn.name "1.x.x"
 
-onetime@^5.1.0:
+onetime@^5.1.0, onetime@^5.1.2:
   version "5.1.2"
   resolved "https://registry.yarnpkg.com/onetime/-/onetime-5.1.2.tgz#d0e96ebb56b07476df1dd9c4806e5237985ca45e"
   integrity sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==
@@ -4774,7 +4828,7 @@ path-is-absolute@^1.0.0:
   resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f"
   integrity sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==
 
-path-key@^3.1.0:
+path-key@^3.0.0, path-key@^3.1.0:
   version "3.1.1"
   resolved "https://registry.yarnpkg.com/path-key/-/path-key-3.1.1.tgz#581f6ade658cbba65a0d3380de7753295054f375"
   integrity sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==
@@ -5322,7 +5376,7 @@ side-channel@^1.0.4, side-channel@^1.0.6:
     get-intrinsic "^1.2.4"
     object-inspect "^1.13.1"
 
-signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.7:
+signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.3, signal-exit@^3.0.7:
   version "3.0.7"
   resolved "https://registry.yarnpkg.com/signal-exit/-/signal-exit-3.0.7.tgz#a9a1767f8af84155114eaabd73f99273c8f59ad9"
   integrity sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==
@@ -5559,6 +5613,11 @@ strip-ansi@^7.0.1, strip-ansi@^7.1.0:
   dependencies:
     ansi-regex "^6.0.1"
 
+strip-final-newline@^2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad"
+  integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==
+
 strip-json-comments@^3.1.1:
   version "3.1.1"
   resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-3.1.1.tgz#31f1281b3832630434831c310c01cccda8cbe006"
-- 
GitLab