From b6be43be95049209dd362ea3adc94f7cf7935128 Mon Sep 17 00:00:00 2001 From: Timothy Carambat <rambat1010@gmail.com> Date: Tue, 14 May 2024 11:57:21 -0700 Subject: [PATCH] Add Speech-to-text and Text-to-speech providers (#1394) * Add Speech-to-text and Text-to-speech providers * add files and update comment * update comments * patch: bad playerRef check --- .vscode/settings.json | 1 + docker/.env.example | 13 ++ frontend/package.json | 1 + frontend/src/App.jsx | 7 + .../src/components/SettingsSidebar/index.jsx | 9 + .../SpeechToText/BrowserNative/index.jsx | 9 + .../TextToSpeech/BrowserNative/index.jsx | 9 + .../TextToSpeech/ElevenLabsOptions/index.jsx | 107 +++++++++ .../TextToSpeech/OpenAiOptions/index.jsx | 45 ++++ .../Actions/TTSButton/asyncTts.jsx | 94 ++++++++ .../Actions/TTSButton/index.jsx | 23 ++ .../Actions/TTSButton/native.jsx | 61 +++++ .../HistoricalMessage/Actions/index.jsx | 65 +----- .../PromptInput/SpeechToText/index.jsx | 82 +++++++ .../ChatContainer/PromptInput/index.jsx | 5 + .../src/media/ttsproviders/elevenlabs.png | Bin 0 -> 6422 bytes frontend/src/models/system.js | 2 +- frontend/src/models/workspace.js | 17 +- .../GeneralSettings/AudioPreference/index.jsx | 45 ++++ .../GeneralSettings/AudioPreference/stt.jsx | 191 ++++++++++++++++ .../GeneralSettings/AudioPreference/tts.jsx | 209 ++++++++++++++++++ frontend/src/utils/paths.js | 3 + frontend/yarn.lock | 5 + server/.env.example | 13 ++ server/endpoints/workspaces.js | 50 ++++- server/models/systemSettings.js | 11 + server/package.json | 1 + server/utils/TextToSpeech/elevenLabs/index.js | 54 +++++ server/utils/TextToSpeech/index.js | 15 ++ server/utils/TextToSpeech/openAi/index.js | 29 +++ server/utils/helpers/customModels.js | 30 +++ server/utils/helpers/updateENV.js | 31 +++ server/yarn.lock | 65 +++++- 33 files changed, 1234 insertions(+), 68 deletions(-) create mode 100644 frontend/src/components/SpeechToText/BrowserNative/index.jsx create mode 100644 
frontend/src/components/TextToSpeech/BrowserNative/index.jsx create mode 100644 frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx create mode 100644 frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx create mode 100644 frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx create mode 100644 frontend/src/media/ttsproviders/elevenlabs.png create mode 100644 frontend/src/pages/GeneralSettings/AudioPreference/index.jsx create mode 100644 frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx create mode 100644 frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx create mode 100644 server/utils/TextToSpeech/elevenLabs/index.js create mode 100644 server/utils/TextToSpeech/index.js create mode 100644 server/utils/TextToSpeech/openAi/index.js diff --git a/.vscode/settings.json b/.vscode/settings.json index 110c4fa6e..4930aa2d1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -11,6 +11,7 @@ "cooldowns", "Deduplicator", "Dockerized", + "elevenlabs", "Embeddable", "epub", "GROQ", diff --git a/docker/.env.example b/docker/.env.example index 8cfa2aea8..70059ea51 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -171,6 +171,19 @@ GID='1000' # WHISPER_PROVIDER="openai" # OPEN_AI_KEY=sk-xxxxxxxx +########################################### +######## TTS/STT Model Selection ########## +########################################### +# TTS_PROVIDER="native" + +# TTS_PROVIDER="openai" +# TTS_OPEN_AI_KEY=sk-example +# TTS_OPEN_AI_VOICE_MODEL=nova + +# TTS_PROVIDER="elevenlabs" +# TTS_ELEVEN_LABS_KEY= +# 
TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel + # CLOUD DEPLOYMENT VARIRABLES ONLY # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. # DISABLE_TELEMETRY="false" diff --git a/frontend/package.json b/frontend/package.json index ded06aa9c..11e612fcd 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -28,6 +28,7 @@ "react-dropzone": "^14.2.3", "react-loading-skeleton": "^3.1.0", "react-router-dom": "^6.3.0", + "react-speech-recognition": "^3.10.0", "react-tag-input-component": "^2.0.2", "react-toastify": "^9.1.3", "react-tooltip": "^5.25.2", diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index 0a5ed65fc..b29e6eea9 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -32,6 +32,9 @@ const GeneralLLMPreference = lazy( const GeneralTranscriptionPreference = lazy( () => import("@/pages/GeneralSettings/TranscriptionPreference") ); +const GeneralAudioPreference = lazy( + () => import("@/pages/GeneralSettings/AudioPreference") +); const GeneralEmbeddingPreference = lazy( () => import("@/pages/GeneralSettings/EmbeddingPreference") ); @@ -85,6 +88,10 @@ export default function App() { <AdminRoute Component={GeneralTranscriptionPreference} /> } /> + <Route + path="/settings/audio-preference" + element={<AdminRoute Component={GeneralAudioPreference} />} + /> <Route path="/settings/embedding-preference" element={<AdminRoute Component={GeneralEmbeddingPreference} />} diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx index 67797d266..6b8f79e5e 100644 --- a/frontend/src/components/SettingsSidebar/index.jsx +++ b/frontend/src/components/SettingsSidebar/index.jsx @@ -21,6 +21,7 @@ import { ClosedCaptioning, EyeSlash, SplitVertical, + Microphone, } from "@phosphor-icons/react"; import useUser from "@/hooks/useUser"; import { USER_BACKGROUND_COLOR } from "@/utils/constants"; @@ -280,6 +281,14 @@ const SidebarOptions = ({ user = null }) => ( 
flex={true} allowedRole={["admin"]} /> + <Option + href={paths.settings.audioPreference()} + btnText="Voice and Speech Support" + icon={<Microphone className="h-5 w-5 flex-shrink-0" />} + user={user} + flex={true} + allowedRole={["admin"]} + /> <Option href={paths.settings.transcriptionPreference()} btnText="Transcription Model" diff --git a/frontend/src/components/SpeechToText/BrowserNative/index.jsx b/frontend/src/components/SpeechToText/BrowserNative/index.jsx new file mode 100644 index 000000000..1e9bcb3c2 --- /dev/null +++ b/frontend/src/components/SpeechToText/BrowserNative/index.jsx @@ -0,0 +1,9 @@ +export default function BrowserNative() { + return ( + <div className="w-full h-10 items-center flex"> + <p className="text-sm font-base text-white text-opacity-60"> + There is no configuration needed for this provider. + </p> + </div> + ); +} diff --git a/frontend/src/components/TextToSpeech/BrowserNative/index.jsx b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx new file mode 100644 index 000000000..1e9bcb3c2 --- /dev/null +++ b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx @@ -0,0 +1,9 @@ +export default function BrowserNative() { + return ( + <div className="w-full h-10 items-center flex"> + <p className="text-sm font-base text-white text-opacity-60"> + There is no configuration needed for this provider. 
+ </p> + </div> + ); +} diff --git a/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx new file mode 100644 index 000000000..ad86caa1c --- /dev/null +++ b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx @@ -0,0 +1,107 @@ +import { useState, useEffect } from "react"; +import System from "@/models/system"; + +export default function ElevenLabsOptions({ settings }) { + const [inputValue, setInputValue] = useState(settings?.TTSElevenLabsKey); + const [openAIKey, setOpenAIKey] = useState(settings?.TTSElevenLabsKey); + + return ( + <div className="flex gap-x-4"> + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + API Key + </label> + <input + type="password" + name="TTSElevenLabsKey" + className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5" + placeholder="ElevenLabs API Key" + defaultValue={settings?.TTSElevenLabsKey ? "*".repeat(20) : ""} + required={true} + autoComplete="off" + spellCheck={false} + onChange={(e) => setInputValue(e.target.value)} + onBlur={() => setOpenAIKey(inputValue)} + /> + </div> + {!settings?.credentialsOnly && ( + <ElevenLabsModelSelection settings={settings} apiKey={openAIKey} /> + )} + </div> + ); +} + +function ElevenLabsModelSelection({ apiKey, settings }) { + const [groupedModels, setGroupedModels] = useState({}); + const [loading, setLoading] = useState(true); + + useEffect(() => { + async function findCustomModels() { + setLoading(true); + const { models } = await System.customModels( + "elevenlabs-tts", + typeof apiKey === "boolean" ? 
null : apiKey + ); + + if (models?.length > 0) { + const modelsByOrganization = models.reduce((acc, model) => { + acc[model.organization] = acc[model.organization] || []; + acc[model.organization].push(model); + return acc; + }, {}); + setGroupedModels(modelsByOrganization); + } + + setLoading(false); + } + findCustomModels(); + }, [apiKey]); + + if (loading) { + return ( + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + Chat Model Selection + </label> + <select + name="TTSElevenLabsVoiceModel" + disabled={true} + className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" + > + <option disabled={true} selected={true}> + -- loading available models -- + </option> + </select> + </div> + ); + } + + return ( + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + Chat Model Selection + </label> + <select + name="TTSElevenLabsVoiceModel" + required={true} + className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" + > + {Object.keys(groupedModels) + .sort() + .map((organization) => ( + <optgroup key={organization} label={organization}> + {groupedModels[organization].map((model) => ( + <option + key={model.id} + value={model.id} + selected={settings?.OpenAiModelPref === model.id} + > + {model.name} + </option> + ))} + </optgroup> + ))} + </select> + </div> + ); +} diff --git a/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx new file mode 100644 index 000000000..4183a4e58 --- /dev/null +++ b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx @@ -0,0 +1,45 @@ +function toProperCase(string) { + return string.replace(/\w\S*/g, function (txt) { + return txt.charAt(0).toUpperCase() + txt.substr(1).toLowerCase(); + }); +} + +export default function OpenAiTextToSpeechOptions({ settings }) { + const apiKey = 
settings?.TTSOpenAIKey; + + return ( + <div className="flex gap-x-4"> + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + API Key + </label> + <input + type="password" + name="TTSOpenAIKey" + className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5" + placeholder="OpenAI API Key" + defaultValue={apiKey ? "*".repeat(20) : ""} + required={true} + autoComplete="off" + spellCheck={false} + /> + </div> + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + Voice Model + </label> + <select + name="TTSOpenAIVoiceModel" + defaultValue={settings?.TTSOpenAIVoiceModel ?? "alloy"} + className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" + > + {["alloy", "echo", "fable", "onyx", "nova", "shimmer"].map( + (voice) => { + return <option value={voice}>{toProperCase(voice)}</option>; + } + )} + </select> + </div> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx new file mode 100644 index 000000000..1947f0057 --- /dev/null +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx @@ -0,0 +1,94 @@ +import { useEffect, useState, useRef } from "react"; +import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react"; +import { Tooltip } from "react-tooltip"; +import Workspace from "@/models/workspace"; +import showToast from "@/utils/toast"; + +export default function AsyncTTSMessage({ slug, chatId }) { + const playerRef = useRef(null); + const [speaking, setSpeaking] = useState(false); + const [loading, setLoading] = useState(false); + const [audioSrc, setAudioSrc] = useState(null); + + function speakMessage() 
{ + if (speaking) { + playerRef?.current?.pause(); + return; + } + + try { + if (!audioSrc) { + setLoading(true); + Workspace.ttsMessage(slug, chatId) + .then((audioBlob) => { + if (!audioBlob) + throw new Error("Failed to load or play TTS message response."); + setAudioSrc(audioBlob); + }) + .catch((e) => showToast(e.message, "error", { clear: true })) + .finally(() => setLoading(false)); + } else { + playerRef.current.play(); + } + } catch (e) { + console.error(e); + setLoading(false); + setSpeaking(false); + } + } + + useEffect(() => { + function setupPlayer() { + if (!playerRef?.current) return; + playerRef.current.addEventListener("play", () => { + setSpeaking(true); + }); + + playerRef.current.addEventListener("pause", () => { + playerRef.current.currentTime = 0; + setSpeaking(false); + }); + } + setupPlayer(); + }, []); + + if (!chatId) return null; + return ( + <div className="mt-3 relative"> + <button + onClick={speakMessage} + data-tooltip-id="message-to-speech" + data-tooltip-content={ + speaking ? "Pause TTS speech of message" : "TTS Speak message" + } + className="border-none text-zinc-300" + aria-label={speaking ? "Pause speech" : "Speak message"} + > + {speaking ? ( + <PauseCircle size={18} className="mb-1" /> + ) : ( + <> + {loading ? 
( + <CircleNotch size={18} className="mb-1 animate-spin" /> + ) : ( + <SpeakerHigh size={18} className="mb-1" /> + )} + </> + )} + <audio + ref={playerRef} + hidden={true} + src={audioSrc} + autoPlay={true} + controls={false} + /> + </button> + <Tooltip + id="message-to-speech" + place="bottom" + delayShow={300} + className="tooltip !text-xs" + /> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx new file mode 100644 index 000000000..644a57afc --- /dev/null +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx @@ -0,0 +1,23 @@ +import { useEffect, useState } from "react"; +import NativeTTSMessage from "./native"; +import AsyncTTSMessage from "./asyncTts"; +import System from "@/models/system"; + +export default function TTSMessage({ slug, chatId, message }) { + const [provider, setProvider] = useState("native"); + const [loading, setLoading] = useState(true); + + useEffect(() => { + async function getSettings() { + const _settings = await System.keys(); + setProvider(_settings?.TextToSpeechProvider ?? 
"native"); + setLoading(false); + } + getSettings(); + }, []); + + if (loading) return null; + if (provider !== "native") + return <AsyncTTSMessage slug={slug} chatId={chatId} />; + return <NativeTTSMessage message={message} />; +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx new file mode 100644 index 000000000..5f3bd3f69 --- /dev/null +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx @@ -0,0 +1,61 @@ +import React, { useEffect, useState } from "react"; +import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react"; +import { Tooltip } from "react-tooltip"; + +export default function NativeTTSMessage({ message }) { + const [speaking, setSpeaking] = useState(false); + const [supported, setSupported] = useState(false); + useEffect(() => { + setSupported("speechSynthesis" in window); + }, []); + + function endSpeechUtterance() { + window.speechSynthesis?.cancel(); + setSpeaking(false); + return; + } + + function speakMessage() { + // if the user is pausing this particular message + // while the synth is speaking we can end it. + // If they are clicking another message's TTS + // we need to ignore that until they pause the one that is playing. + if (window.speechSynthesis.speaking && speaking) { + endSpeechUtterance(); + return; + } + + if (window.speechSynthesis.speaking && !speaking) return; + const utterance = new SpeechSynthesisUtterance(message); + utterance.addEventListener("end", endSpeechUtterance); + window.speechSynthesis.speak(utterance); + setSpeaking(true); + } + + if (!supported) return null; + return ( + <div className="mt-3 relative"> + <button + onClick={speakMessage} + data-tooltip-id="message-to-speech" + data-tooltip-content={ + speaking ? 
"Pause TTS speech of message" : "TTS Speak message" + } + className="border-none text-zinc-300" + aria-label={speaking ? "Pause speech" : "Speak message"} + > + {speaking ? ( + <PauseCircle size={18} className="mb-1" /> + ) : ( + <SpeakerHigh size={18} className="mb-1" /> + )} + </button> + <Tooltip + id="message-to-speech" + place="bottom" + delayShow={300} + className="tooltip !text-xs" + /> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx index 3bdee472d..52ae1466a 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx @@ -1,4 +1,4 @@ -import React, { memo, useEffect, useState } from "react"; +import React, { memo, useState } from "react"; import useCopyText from "@/hooks/useCopyText"; import { Check, @@ -6,11 +6,10 @@ import { ThumbsUp, ThumbsDown, ArrowsClockwise, - SpeakerHigh, - PauseCircle, } from "@phosphor-icons/react"; import { Tooltip } from "react-tooltip"; import Workspace from "@/models/workspace"; +import TTSMessage from "./TTSButton"; const Actions = ({ message, @@ -60,7 +59,7 @@ const Actions = ({ </> )} </div> - <TTSMessage message={message} /> + <TTSMessage slug={slug} chatId={chatId} message={message} /> </div> ); }; @@ -149,62 +148,4 @@ function RegenerateMessage({ regenerateMessage, chatId }) { ); } -function TTSMessage({ message }) { - const [speaking, setSpeaking] = useState(false); - const [supported, setSupported] = useState(false); - useEffect(() => { - setSupported("speechSynthesis" in window); - }, []); - - function endSpeechUtterance() { - window.speechSynthesis?.cancel(); - setSpeaking(false); - return; - } - - function speakMessage() { - // if the user is pausing this particular message - // 
while the synth if speaking we can end it. - // If they are clicking another message's TTS - // we need to ignore that until they pause the one that is playing. - if (window.speechSynthesis.speaking && speaking) { - endSpeechUtterance(); - return; - } - - if (window.speechSynthesis.speaking && !speaking) return; - const utterance = new SpeechSynthesisUtterance(message); - utterance.addEventListener("end", endSpeechUtterance); - window.speechSynthesis.speak(utterance); - setSpeaking(true); - } - - if (!supported) return null; - return ( - <div className="mt-3 relative"> - <button - onClick={speakMessage} - data-tooltip-id="message-to-speech" - data-tooltip-content={ - speaking ? "Pause TTS speech of message" : "TTS Speak message" - } - className="border-none text-zinc-300" - aria-label={speaking ? "Pause speech" : "Speak message"} - > - {speaking ? ( - <PauseCircle size={18} className="mb-1" /> - ) : ( - <SpeakerHigh size={18} className="mb-1" /> - )} - </button> - <Tooltip - id="message-to-speech" - place="bottom" - delayShow={300} - className="tooltip !text-xs" - /> - </div> - ); -} - export default memo(Actions); diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx new file mode 100644 index 000000000..6cbcfbf8d --- /dev/null +++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx @@ -0,0 +1,82 @@ +import { useEffect } from "react"; +import { Microphone } from "@phosphor-icons/react"; +import { Tooltip } from "react-tooltip"; +import _regeneratorRuntime from "regenerator-runtime"; +import SpeechRecognition, { + useSpeechRecognition, +} from "react-speech-recognition"; + +let timeout; +const SILENCE_INTERVAL = 3_200; // milliseconds of silence to wait before closing.
+export default function SpeechToText({ sendCommand }) { + const { + transcript, + listening, + resetTranscript, + browserSupportsSpeechRecognition, + browserSupportsContinuousListening, + isMicrophoneAvailable, + } = useSpeechRecognition({ + clearTranscriptOnListen: true, + }); + + function startSTTSession() { + if (!isMicrophoneAvailable) { + alert( + "AnythingLLM does not have access to microphone. Please enable for this site to use this feature." + ); + return; + } + + resetTranscript(); + SpeechRecognition.startListening({ + continuous: browserSupportsContinuousListening, + language: window?.navigator?.language ?? "en-US", + }); + } + + function endTTSSession() { + SpeechRecognition.stopListening(); + if (transcript.length > 0) { + sendCommand(transcript, true); + } + + resetTranscript(); + clearTimeout(timeout); + } + + useEffect(() => { + if (transcript?.length > 0) { + sendCommand(transcript, false); + clearTimeout(timeout); + timeout = setTimeout(() => { + endTTSSession(); + }, SILENCE_INTERVAL); + } + }, [transcript]); + + if (!browserSupportsSpeechRecognition) return null; + return ( + <div + id="text-size-btn" + data-tooltip-id="tooltip-text-size-btn" + data-tooltip-content="Speak your prompt" + aria-label="Speak your prompt" + onClick={listening ? endTTSSession : startSTTSession} + className={`relative flex justify-center items-center opacity-60 hover:opacity-100 cursor-pointer ${ + !!listening ? 
"!opacity-100" : "" + }`} + > + <Microphone + weight="fill" + className="w-6 h-6 pointer-events-none text-white" + /> + <Tooltip + id="tooltip-text-size-btn" + place="top" + delayShow={300} + className="tooltip !text-xs z-99" + /> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx index 98ad11f8f..df08bcc7c 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx @@ -12,6 +12,7 @@ import AvailableAgentsButton, { useAvailableAgents, } from "./AgentMenu"; import TextSizeButton from "./TextSizeMenu"; +import SpeechToText from "./SpeechToText"; export const PROMPT_INPUT_EVENT = "set_prompt_input"; export default function PromptInput({ @@ -34,6 +35,7 @@ export default function PromptInput({ function handlePromptUpdate(e) { setPromptInput(e?.detail ?? ""); } + useEffect(() => { if (!!window) window.addEventListener(PROMPT_INPUT_EVENT, handlePromptUpdate); @@ -156,6 +158,9 @@ export default function PromptInput({ /> <TextSizeButton /> </div> + <div className="flex gap-x-2"> + <SpeechToText sendCommand={sendCommand} /> + </div> </div> </div> </div> diff --git a/frontend/src/media/ttsproviders/elevenlabs.png b/frontend/src/media/ttsproviders/elevenlabs.png new file mode 100644 index 0000000000000000000000000000000000000000..b1047e422e3c7a855caec5ef2bae1fcc647495e5 GIT binary patch literal 6422 zcmeAS@N?(olHy`uVBq!ia0y~yVAKI&4rT@h2EWC1?hFhJjKx9jPK-BC>eMqZFmM)l zL>4nJa0`PlBg3pY5)2GXe*=6%T>t<7|L@<w8#ivOTD5BR>eVY&thjsk?vEcojvYI; za^=c9ckX=u{{8sz<7?NhUAJ!C`t|D{KYsl4=g;fcuO}oVxVyW1czC>c@#5F7U;X|4 zVq#(v5)$I#;`8Rsd-LW^N=nLt1q=TC`SbVh-{;StOG!ydN=p9z{rl9ZQ)+5zEG#Un ztgP(p><=G4?CR=bVq#)rW1BT=*6Y`=BO)T0nVFwGdlnoVynOlc)2B}}GBPqSFm!Zu z%$zwhC@5&@(xo$I%+S!#P*+#quwg?(L&KRfXHK3x`S$JGxpU{vnKS48`}enQ-CD9_ 
zNpo{^X=!O?W##VOyEks!*wfQfQBhG|Uf$l`e);m{$&)8%Wo0EMCg$bk-Mo48$&)7z z4i3D$ygWQSrlzKCZEg4O-xn4Z77`MYk&(G_<%)=ih^VM&NJxl@iHWMJYC%E4vSrKo z`1ttw`3nmR?d<G0I5=EfTn-*QxM$BEZf<TzN5{UtzNu5Ee*OB@(9rPRyLXEgEy~Qy ztgEY2P*6}-R-Qe3_P1}}7A{=a+1YvZ>eY`QKTe!D@ztwWTwGk;-Q82BOffJpxOnm6 zg9i_yqN2RKysWLQKYaL*oSdAUoxOGI*7@`2b8>Qi`SK+!ENuJs?YD2=&d<*m5D<ur zj1&|Uw6wI;)YO!hmyeB&EiNurR8%~4=+K%qYg}Dj?d|PlWo5T*+g4Ombno812@@uq zJ9kb+Mdj0{Pc}9-TefUzY;4rk)qVN$<)urPT3cJ?<m3(?K0IyOG#?)yTU%Q#Ev@tC z&-?oNHZ?UF85soz2BxN_nwy(Hdh|$NUtd~UIv^n6$dMx@B_;9k@%#7h*U`~wX=$mi zuUAr1a&vRryLa!dUAv;Aqd$NCTvb(-larI1n;RM$x?{(V>C>k_efl&bBV*^zo&Ns* z+S=NFetzEG-bqPG7cN|giHWhYvT}BIPD@KWaNxk=#fzPsoZ{l*%*@Qn%E~rv+EiU# zJ!#UU-rnBY+S+~l_MJU@HatB1#EBC}j~=b5skwITnx3AXg@uKur>C*8ae8{XPS2Ne z1_seho-U3d6?5Ls<&6<JdjCNB1lz}c_eAc0Xtib5xgW6E-N)J2Rr}(Z52o>_4wVbm zX!<w@pH?{<Qu8wO${C-t<!6I}YgYTHdIx(<+vMZ2*EzrQ_Tk#HZ|@#7e4cOL>HOmR z-<mypZ_k}M_otP$wefEz)f2JKr>&VqIj68Pq%t~0G6aY*j50O+Be&I_ema9y;;F{z zeXsxb>6o7iT5#n|UC2_gZ4WZmDK%ti?sb0krA&13o9w$jQ%{6@&EyOZjEXFc=$v{e zev#3_uWL7@@QQL=3{DQ^e}3}x&d6UTv9`0*e?9LFN;?!fB`nNXOLM-|tWzni7xv$D zTzXdgvPk6Bmv3X9x6g1}mFBTpD(Oq$x4dmnmOr0rB)o3XmQA<rrg=)#_D7{=-q4@? 
zJT&wg^E7Lf{7HY--AI)*`EMCnr>XIJ{!Z>lwUtl&xcQx@Jk{{t_bk3f{nON4^$T>= zH~B>A?sci*Qj+NCKX`odyUBrJo{?$iw0#T2gi|diKb`w(%jM2Zb55P|C>FcCQfHd% z&0SMu!zM0jUX-*t!%^mVKx^ZQQd8Zp2VU))wDi2fT}|f;|7#|`U=7=`!2W7r<F$>` z-)pbp{?D_O#b~YMDW$*viggzTaMVY}n)n2VeNCO69}*uQ75janju%@@mV^JQrx(ov zySC3duTYg6#~xOE=Y>0~=*qgQem`4wTwbrQRW0~MH+$KgZFw(U&i*!9d+<fZsh(Fa zf_iVCzAv@4rQ_89!$!p`bMD%2T5+CxwQ^te4eQiPmm}v_{YtrT;GT8r(iJ;x&C_md zw)xf29#r=3<P+@{Kif+e-m;g!W?Or4?e&9xuh)Ln&678FnRn;H{JM9!YdeGXZ{|Oz z->WNr;?>3*yA5j69Mif_F8bJ6H^*7LdB#ybUNK3InWg{OQ`gGON$QW<?KscxCg;oV zyVl4%u9p|!R-UO7SNHaQ`nHTabu&|+F1)*b?YB+hK_boH?mXYL=Mq=B(tS^{ua&AZ zt2PQKO@FPJ87Y#y<3E?;hg++&Ma+%f{$95A<|58xbEQ<{`dz1F%o8k%{A(!asV%4~ zVL17q`DEiOvzBgq@%N=vUx$lt{|U=yABDXGZx~HioxCkt?WVK7LFe|jXWa}V^Y*lb zIxIf(^478fp}-n};>p~REl1Wpnv#++U$n{e*`)8OvW6E|d~6BnzV&i#&gsJI&ngNQ zwT3EQUh#8Q$xXHH3EQgo_H?=Hg|EG<cJ8->CfDn(jf&wCbF%N+9K7~TWTNZyW%I9X zka}c2;gY23>o4<P9_7>98T(h0ak5;*i?jRkR9}5`y;tvkGv&qhmpYPC)0R}2L@rzM zd)@!7TfaPV3Tu4g&Of7tDbi@7+6B>+14(P6cKU4G{Io}?bw_lJg16ctTN8EPoliri z#ko&CbRvkS_w1$9D#hDAeu`AfpW?s9ea;WT?8WQ$@9|2Iy!InByiO`^{{)?DS2oH@ zJlcOxpJl?C2D{wn**D%*t?{1Dozl7S<hyi_n7f%#Wm9I|GMQQRe(kjE<rzLHZrh9_ z|LmBff79i`6|G=S>l+yk|8KXuTnqU+`~B3vbxhL9ujWOZlhXZb?0!?<$fB>l{oF-? 
zn?GJ`-uQ}f)pYSU_Q!IL@~)bmHa%4?d#3t_Q`6l_XVxbD(p~=OL{Z%4ALWm2M41!s zF78~L_=s1oH1go`%MzNI@9V<0-ukH>+MTs;#?i7J6Z+J&7fpS9ammwv8;&p3R@eM` z+2&UEuGndveqGCR+N*y44G)R*&l7mJbAz5G54XV9$#br)S@28Ay71ic=1n;Zzdkvf zQI@c6$KLpJwb`nxea`<WGwJ=XOS`o6)W=^%d;I2Y?RmTFZ|CW48<)q;sPS0a?7w#7 zX(MTAj+xgsUMNoAbF+(8pW}PXjZOB_mjeIDRlks{KDX6NPxExV-@H>nGdC<Xc&NVp z*{ms(<NhtZ)ly=;X5Rlj>mrQ4)NZ<TBk;$o62tt{+FqZIZD`Z{X0dkiERDI^kv^|Z zaGOQGza)NX&C{G)MxiU`r~RMbQmwa!`&Z<wcUQR%?^nB}J*!z~{x4s>4WA>5_jf3N zYq(y0bi<M@wL-Q#j!V5u<-1qibDpD0-|u7Mi%*_s-`w3DVeLNaXXy6hng18{SDmU} zzq)>x@3h-HXEkk|Q~O$G4a24?op0Ja`I;7m)+dyT|Aa;O$MR`L`+vQ4bnlj?^Iv$^ zJea$=KgumdKKY*A8~-CQ=YOdxZ8VlMw4YLRI$`ow4WE43e-kSAhqFZfw=q+ZZSsrU zedB1cn()6Rs*B4`#LMNJ`>Ha3bMow<lRghK?bF!GjOT00os5#Y{6^!X+UYCVdRxz) zk4T-#q}8o6d1-~hIholDOhWv7<W3)B;%q*<yKjA1%GUVz$}BfiruMk(TB|?d>xJi2 z{jPmxPL5cvRl;1TE%xf$wCB2#A+wF#e&)rPe|pvO^-Ya=vewxZe?KghTC<S*ZpH41 zCAW7U*50VI_RfEk-@jhyFSqGxjS76Ox>WpVes7XzWY+e5%dV&iZ(sG8_mTCUuR+(e zTJ@vtRz=Kt!1vE->C~tCc?q%XpC_pvH|A!}$WHszl<Q`(;JBdKv7a`jny;nLygima z;a26U#N?XI90%oG`6gXq;bOZqH6Sc5;PPpmd&;?D0tdHQHoRG)`MHH5d-|m7fnh<P zXYg-1)fZ;Fv}a{z!W%8IPL-Z*#}ymzs?G73FS_8%P3<*nu4rF5S#c(CTAjbj?uW;| zq`hLla%=m7t&^_*Pk4DZ|E$q#ldSL8k8ix#f5h&^DZiX4C*I2*x~21YlOAJ}>B$AL z;qF<-w=P!Rziz{(Wi3;B%pUAoP&lPjxv<r0Qg)45(KM+AVduYjTV|c&&$jHpsw-vx z#j;m-oy_O!N{*4Q>@%)(i`p-_dHweq>9RHFR6@j_%l|wRWO!R==9jnS3thH6m?&Of zJZI5E@%_iF7B2g9_vTLLDZ8$&__!sb&oo(h(H~aR`Pt$(Qciw(9CuC7r{+k3Y0!z; zUk=Y~@yorpQh)YJgZ>+n)I!e8I$AQN<Lb_%Q&~2?koj|>IcJWxw@~a0H>v!{gDZ<! 
zzC32U#@7F7HhbjqPfw>ym2QnaXsO)$;7XdewD7db><C^@-zF0u<E5b?k$wgxE5m)J z)om-ARrSzun$Fyl;rt%2LS)S?Rwg8UF4~c+wD<i!?ro)CBI5l^okF*?OkST-K3A&# zmo(dNpOrdcUXd>KZ>C?AbA7cS=S|&;Yn2D*U3?!G;$F8&!v5yIz6)A~(y8;63b#0a z3SV}A=j(#|eY}}+<?RtWWB<(53Hx#9!}_(;51G8)moxjoO|}cCUPL@n%l%-dCX!<E z@z*geoxR7e{SxCjmidQ&UY@$eqSW1?j|!4r%y|B^q|EKd>giRiML$}YBCm>X@LoH6 zp4f@KL2aA9@$D}=ex=<^K8bJdkv7daPmk_=WvVANVMo>{3GPDmE+J2)$xL!QTfTE% z>G{TY<Mck6HE#_{mi@VR(>Ym3Zjw}c-|NjrMox3hmwI1&B2d=Ev*q-x&*@y3zh78+ z$YkM^xjM6UyxtkyvPEfml;CaqZz~t9$$f4)HA86A&GpBk&2HWPka)cM+N(WbRZUMi z`Q6Wc=1}hwPkPZUD?LBFsa<@g%kr9QXT+Yf&)FMkzW(F+vsve(GdG?yysX}ITeJK9 z^ryE|eMRCz7fqhmf2TjZ{2k+7qg@X_uRo)Z@0#zkxcsLDW4Fok%cl*u-utKAFZAzT z=pW-$&DZryb;a}a7qy6SJ~UZhe&@lPzQu<^Rl^SKT%UFAFW)JqhD{6$bQm<a8DPxs zOLgYe7ChN!95-##mu;3-FF0J}Yo_T+z2AL*-aI|$NXOiLCK}xGcU2=6X8x7muGzg@ zr{?9msQ=$JPixM4BXMZindbh3ed@PV9x==ooioY&+kWNKWu@-Un-^}Hlw|5PYx4i< zO+HpT)^yCZno>GF(kZe*XQ7V6v;Z*$Zw>AVs#91`2%Re0S02X^Fn!za4QsSzQv&Dh z5cc$Ro6zmLCAQ*3$&M`%&!#Nb@tU-%#b@2+q{9z0*Yn#47;Mv=GF9P-{x%bzpT)nU zUmJB_F_QY>vAi+*_JykRxl-jOcT@8pXt_sj31j~sruqEJq<tYvQ#UMMdx6h%b=sAy z&2R3_ytC(O`!Xw=-1YXVz9GV*hvT@{S6vF-cFI>w^?ckF0nfeBZFhek;+0*uKWlGA z&h)EhUuQqQcAOz4&|4~hRqAfd2_c<!I)zu{K3?o7en0K$($M2J)BCEYx}H3eYZV#3 z!%1xCc6+OJc6FJPuhzX=vN`7A*UK05<L2tGFnoPfUq5F3i>^I;{;l%X3a>nQDL#J2 zfjnE~w55NabyWFFnOErVK72y$w20FG6_ao7l?gLeJH0R=f7{L4n?+y01%1_%{HnKP zj;d*NSW=|pVW;Dp)n}zH{#s_|S*ud@a*NN?HR+R&zS|bcXEgEb%vIlWC69ey>z%TS zbMH#FH1-WTUT@y%2=Ba;yO%p>YQK)xte`N>l+1T$kDmW6SEJE+_qv+*WcJTZIn$4} z*rff~>&08wKP~v^yV)f%7k=;DSUV%x%&qLtIotgI<&*r(Ce7Su8&jY#tzT#FZ}Zs5 zsYd6s3w5S(sM!3x6k&G%d&PSRZ?{IB;Nu$t*Xt`;e|z%#e0aR`habU<Jxb0WJTrCU z|E66s8(rLed43v%OU0Ki)_%6&w7ZzSU)0%oEApq_Xo_Gtw0m+)yl(Zo*L%}tvZlTK z{p0)<n_c>`i}y(%`miy5y4=3sC2wvXF0_0jw}C-#!NtxEQ}av@3jh7?wDZKGl(#vH zLZeoN`lm+9FHp#3P0gG%D|ZR&rk?QiH{Wf$)St1VN0)d0x_ciJ_PeNQ_en1gt^T^T zefvBA*ekvI@{ylzM^?79{P~o(Z;i~PAhrJq_d|8oEIr4#_pgD;#CWbau@nEVOx5Ly zeQq+(X#Kih3^%`U9%mKONIf;>gVWP@Q`mUJ#8(?t8YLw?5$D~(mbH5B@3rBLI>pax 
zSF+ZAJJ1-v;$K<Y*Zm*VPjB{dTHk+PIH~WGV9fV@4~yO$6c&=-{?z5kt;@ZA)zS6h z6II(}^i~*7`F=k1ci8O8e--KF+3V*`xbW}xqO^NI%ue52?3n6kd1}_%?SH#>Fi)Dc zH_Tm6Q(OI;YLv$6=YrRE8hxp8fB!EvKk4<)&4ClXZ1@nk;-KlaQ)}7nlUze}H*WA! z(ypzz^vmbt;eehQuNAbn3QtarxzjsIZ*%+wuU2N&$wx}pZw}E&wS3oAG->Jmc~d?! zzs}NHEarE~H|OTod;h0xP43WIeLG3(_4O@Pb39@mez@RJzW-P5(zjLudkYFws(V}{ zf4Od}dGf<N?qt^Wg)b6+L>9A03QxXjc<z1p)Y7e+uf?n^&n;aybMs#D?vC5n&aK!o zH>h^;H0zrS&fR&^YdleX_U|8_3z;<aw5JwNFjWq;O^C0XY5JzBIihv;-}0<oCn9g` zKGvbBv#T(5s%xzh|4RR2R{xHWU(a89rDxxh`F3iqQf&M2<E*z?U#xy!wdvu7A7!;` z(@!XL27D}7ecbNVqTt#3Kd)!$y*NC*f6dN^fos$i+<d=nw<{7dQSKC(oT_xsJ!jL3 zWqz^SOgtjDUlH$T)2J4#urxl9?&c%^jZ0<ctM_^@X6|yj=9+uaf=TjLfwHvJy`Bww z?@TPeZLqC^+5e37pSA1V6dmhx=B_K#J>)+>{Mv^n6UB_L3ii!A5wH8BuabX$uw|O; zpYmPeokppl%Z@iB`-EjIProYks(t6%q|=Kwb?jVwJ$>dWqXc&@X&%kx&7Frke3+;G z;=kN>wK{yQS50r$^~kecoa=bn6%~_T&**YFS@GlKl)ch7b_Uo)G>cDJy3g;jv8vV5 zH~z76-c-gZ>?wRv_q9K<Blo|RazIe(r?|cniG6wVZcU#1M|Jj`X_Ko%_###N`1{Xj zihticuc7AU@>R;mw&iU}+co)NpVa$h_jj&WOQ~KJ?RzVaS&5_fUqh)x%BM>U6i>fC z|8Kv=H=doEUrM%dTScv#5GoRRWxnFq-d9Jtf`9aF)>M}Fy*lZ@mq)qN<~&~bMb@Ry zV_W*--reT9q8T+|tB-DLk9D7W%QA89*T4scv$ZZfk$=Enr?F|<=Zmw~uX?}Kv~bcT z$?G%a-|cV-_nD?xcsFzHVON>tb~El9htDs`^uFyXKUwLLRpfo+Me(jL4({JLdGG7f zNz<49OO#BSKJ~E)S9ra(V)lN|hdEkPZe33KdOw$QcTQo%)Tt(2!9~}8zl?r-H#c!g zSbpi1%bTyvJhWu$$Nby(Ggs$LQc9MZG3VW@_+?u*wap9qn!}y8>6zR1#G@Bq{o7=x zJXhJWS9nkCHSSpF@>}=fSBcKItN(VqOlY=<&9l`fyfTbul`gY;bL~rp@meX5pEYR# z@wcPze)%-#gWvX>DXlIRPbd3CexIDZ?7x@x>FV3+r=D+_Bj$50=X-a~UGA&Z8b>eL zOZ;YduhVtA<FD8Bxjh;8AO22Nzsvq_*0q$2%#AaxnK*YA<$O-mx;)|daXl6F>uJ3w z%t~TD?)~ZTHk&v8{ja_CiPz0Me!WcVHFTfqx7RYLUdhm2V{ekr#ShaX`-9hLujpi5 z)!%Wu{_3u$HhsTNIZ0{oK4HHaFSe_)Qgo8u>&;huR|!UW|2>`jq%EkkE&TS@sWsuH zztj2N{|>AAursdn`8|obPDjq3weKuEd6cVk|I*&|_l#2XJKxUiTD0FQV8hH?^QN5B z@HifocTdVMVYPAPwH2qGHkwp?3-Ujdx4FDRup%-zD)RrKbN?5v`NHS;#<Kh&Z~jxQ m)r%U@n=ME!m=VqV&%b`P;X}qQ;~)kG1_n=8KbLh*2~7Z0H7_Rs literal 0 HcmV?d00001 diff --git a/frontend/src/models/system.js b/frontend/src/models/system.js index e64b01199..f8f123448 100644 --- 
a/frontend/src/models/system.js +++ b/frontend/src/models/system.js @@ -332,7 +332,7 @@ const System = { }) .then((blob) => (blob ? URL.createObjectURL(blob) : null)) .catch((e) => { - console.log(e); + // console.log(e); return null; }); }, diff --git a/frontend/src/models/workspace.js b/frontend/src/models/workspace.js index 91f4a2db3..64732c044 100644 --- a/frontend/src/models/workspace.js +++ b/frontend/src/models/workspace.js @@ -272,6 +272,21 @@ const Workspace = { return false; }); }, + ttsMessage: async function (slug, chatId) { + return await fetch(`${API_BASE}/workspace/${slug}/tts/${chatId}`, { + method: "GET", + cache: "no-cache", + headers: baseHeaders(), + }) + .then((res) => { + if (res.ok && res.status !== 204) return res.blob(); + throw new Error("Failed to fetch TTS."); + }) + .then((blob) => (blob ? URL.createObjectURL(blob) : null)) + .catch((e) => { + return null; + }); + }, threads: WorkspaceThread, uploadPfp: async function (formData, slug) { @@ -302,7 +317,7 @@ const Workspace = { }) .then((blob) => (blob ? 
URL.createObjectURL(blob) : null)) .catch((e) => { - console.log(e); + // console.log(e); return null; }); }, diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx new file mode 100644 index 000000000..c4abaf546 --- /dev/null +++ b/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx @@ -0,0 +1,45 @@ +import React, { useEffect, useState, useRef } from "react"; +import { isMobile } from "react-device-detect"; +import Sidebar from "@/components/SettingsSidebar"; +import System from "@/models/system"; +import PreLoader from "@/components/Preloader"; +import SpeechToTextProvider from "./stt"; +import TextToSpeechProvider from "./tts"; + +export default function AudioPreference() { + const [settings, setSettings] = useState(null); + const [loading, setLoading] = useState(true); + + useEffect(() => { + async function fetchKeys() { + const _settings = await System.keys(); + setSettings(_settings); + setLoading(false); + } + fetchKeys(); + }, []); + + return ( + <div className="w-screen h-screen overflow-hidden bg-sidebar flex"> + <Sidebar /> + {loading ? ( + <div + style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }} + className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll" + > + <div className="w-full h-full flex justify-center items-center"> + <PreLoader /> + </div> + </div> + ) : ( + <div + style={{ height: isMobile ? 
"100%" : "calc(100% - 32px)" }} + className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll" + > + <SpeechToTextProvider settings={settings} /> + <TextToSpeechProvider settings={settings} /> + </div> + )} + </div> + ); +} diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx new file mode 100644 index 000000000..58bb1489b --- /dev/null +++ b/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx @@ -0,0 +1,191 @@ +import React, { useEffect, useState, useRef } from "react"; +import System from "@/models/system"; +import showToast from "@/utils/toast"; +import LLMItem from "@/components/LLMSelection/LLMItem"; +import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react"; +import CTAButton from "@/components/lib/CTAButton"; +import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png"; +import BrowserNative from "@/components/SpeechToText/BrowserNative"; + +const PROVIDERS = [ + { + name: "System native", + value: "native", + logo: AnythingLLMIcon, + options: (settings) => <BrowserNative settings={settings} />, + description: "Uses your browser's built in STT service if supported.", + }, +]; + +export default function SpeechToTextProvider({ settings }) { + const [saving, setSaving] = useState(false); + const [hasChanges, setHasChanges] = useState(false); + const [searchQuery, setSearchQuery] = useState(""); + const [filteredProviders, setFilteredProviders] = useState([]); + const [selectedProvider, setSelectedProvider] = useState( + settings?.SpeechToTextProvider || "native" + ); + const [searchMenuOpen, setSearchMenuOpen] = useState(false); + const searchInputRef = useRef(null); + + const handleSubmit = async (e) => { + e.preventDefault(); + const form = e.target; + const data = { SpeechToTextProvider: selectedProvider }; + const formData = new FormData(form); + + for (var [key, value] of 
formData.entries()) data[key] = value; + const { error } = await System.updateSystem(data); + setSaving(true); + + if (error) { + showToast(`Failed to save preferences: ${error}`, "error"); + } else { + showToast("Speech-to-text preferences saved successfully.", "success"); + } + setSaving(false); + setHasChanges(!!error); + }; + + const updateProviderChoice = (selection) => { + setSearchQuery(""); + setSelectedProvider(selection); + setSearchMenuOpen(false); + setHasChanges(true); + }; + + const handleXButton = () => { + if (searchQuery.length > 0) { + setSearchQuery(""); + if (searchInputRef.current) searchInputRef.current.value = ""; + } else { + setSearchMenuOpen(!searchMenuOpen); + } + }; + + useEffect(() => { + const filtered = PROVIDERS.filter((provider) => + provider.name.toLowerCase().includes(searchQuery.toLowerCase()) + ); + setFilteredProviders(filtered); + }, [searchQuery, selectedProvider]); + + const selectedProviderObject = PROVIDERS.find( + (provider) => provider.value === selectedProvider + ); + + return ( + <form onSubmit={handleSubmit} className="flex w-full"> + <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16"> + <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10"> + <div className="flex gap-x-4 items-center"> + <p className="text-lg leading-6 font-bold text-white"> + Speech-to-text Preference + </p> + </div> + <p className="text-xs leading-[18px] font-base text-white text-opacity-60"> + Here you can specify what kind of text-to-speech and speech-to-text + providers you would want to use in your AnythingLLM experience. By + default, we use the browser's built in support for these services, + but you may want to use others. + </p> + </div> + <div className="w-full justify-end flex"> + {hasChanges && ( + <CTAButton + onClick={() => handleSubmit()} + className="mt-3 mr-0 -mb-14 z-10" + > + {saving ? "Saving..." 
: "Save changes"} + </CTAButton> + )} + </div> + <div className="text-base font-bold text-white mt-6 mb-4">Provider</div> + <div className="relative"> + {searchMenuOpen && ( + <div + className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10" + onClick={() => setSearchMenuOpen(false)} + /> + )} + {searchMenuOpen ? ( + <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20"> + <div className="w-full flex flex-col gap-y-1"> + <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]"> + <MagnifyingGlass + size={20} + weight="bold" + className="absolute left-4 z-30 text-white -ml-4 my-2" + /> + <input + type="text" + name="stt-provider-search" + autoComplete="off" + placeholder="Search speech to text providers" + className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium" + onChange={(e) => setSearchQuery(e.target.value)} + ref={searchInputRef} + onKeyDown={(e) => { + if (e.key === "Enter") e.preventDefault(); + }} + /> + <X + size={20} + weight="bold" + className="cursor-pointer text-white hover:text-[#9CA3AF]" + onClick={handleXButton} + /> + </div> + <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4"> + {filteredProviders.map((provider) => ( + <LLMItem + key={provider.name} + name={provider.name} + value={provider.value} + image={provider.logo} + description={provider.description} + checked={selectedProvider === provider.value} + onClick={() => updateProviderChoice(provider.value)} + /> + ))} + </div> + </div> + </div> + ) : ( + <button + className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent 
hover:border-[#46C8FF] transition-all duration-300" + type="button" + onClick={() => setSearchMenuOpen(true)} + > + <div className="flex gap-x-4 items-center"> + <img + src={selectedProviderObject.logo} + alt={`${selectedProviderObject.name} logo`} + className="w-10 h-10 rounded-md" + /> + <div className="flex flex-col text-left"> + <div className="text-sm font-semibold text-white"> + {selectedProviderObject.name} + </div> + <div className="mt-1 text-xs text-[#D2D5DB]"> + {selectedProviderObject.description} + </div> + </div> + </div> + <CaretUpDown size={24} weight="bold" className="text-white" /> + </button> + )} + </div> + <div + onChange={() => setHasChanges(true)} + className="mt-4 flex flex-col gap-y-1" + > + {selectedProvider && + PROVIDERS.find( + (provider) => provider.value === selectedProvider + )?.options(settings)} + </div> + </div> + </form> + ); +} diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx new file mode 100644 index 000000000..6b11f1a46 --- /dev/null +++ b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx @@ -0,0 +1,209 @@ +import React, { useEffect, useState, useRef } from "react"; +import System from "@/models/system"; +import showToast from "@/utils/toast"; +import LLMItem from "@/components/LLMSelection/LLMItem"; +import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react"; +import CTAButton from "@/components/lib/CTAButton"; +import OpenAiLogo from "@/media/llmprovider/openai.png"; +import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png"; +import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png"; +import BrowserNative from "@/components/TextToSpeech/BrowserNative"; +import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions"; +import ElevenLabsTTSOptions from "@/components/TextToSpeech/ElevenLabsOptions"; + +const PROVIDERS = [ + { + name: "System native", + value: "native", + logo: AnythingLLMIcon, + 
options: (settings) => <BrowserNative settings={settings} />, + description: "Uses your browser's built in TTS service if supported.", + }, + { + name: "OpenAI", + value: "openai", + logo: OpenAiLogo, + options: (settings) => <OpenAiTTSOptions settings={settings} />, + description: "Use OpenAI's text to speech voices.", + }, + { + name: "ElevenLabs", + value: "elevenlabs", + logo: ElevenLabsIcon, + options: (settings) => <ElevenLabsTTSOptions settings={settings} />, + description: "Use ElevenLabs's text to speech voices and technology.", + }, +]; + +export default function TextToSpeechProvider({ settings }) { + const [saving, setSaving] = useState(false); + const [hasChanges, setHasChanges] = useState(false); + const [searchQuery, setSearchQuery] = useState(""); + const [filteredProviders, setFilteredProviders] = useState([]); + const [selectedProvider, setSelectedProvider] = useState( + settings?.TextToSpeechProvider || "native" + ); + const [searchMenuOpen, setSearchMenuOpen] = useState(false); + const searchInputRef = useRef(null); + + const handleSubmit = async (e) => { + e.preventDefault(); + const form = e.target; + const data = { TextToSpeechProvider: selectedProvider }; + const formData = new FormData(form); + + for (var [key, value] of formData.entries()) data[key] = value; + const { error } = await System.updateSystem(data); + setSaving(true); + + if (error) { + showToast(`Failed to save preferences: ${error}`, "error"); + } else { + showToast("Text-to-speech preferences saved successfully.", "success"); + } + setSaving(false); + setHasChanges(!!error); + }; + + const updateProviderChoice = (selection) => { + setSearchQuery(""); + setSelectedProvider(selection); + setSearchMenuOpen(false); + setHasChanges(true); + }; + + const handleXButton = () => { + if (searchQuery.length > 0) { + setSearchQuery(""); + if (searchInputRef.current) searchInputRef.current.value = ""; + } else { + setSearchMenuOpen(!searchMenuOpen); + } + }; + + useEffect(() => { + const 
filtered = PROVIDERS.filter((provider) => + provider.name.toLowerCase().includes(searchQuery.toLowerCase()) + ); + setFilteredProviders(filtered); + }, [searchQuery, selectedProvider]); + + const selectedProviderObject = PROVIDERS.find( + (provider) => provider.value === selectedProvider + ); + + return ( + <form onSubmit={handleSubmit} className="flex w-full"> + <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16"> + <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10"> + <div className="flex gap-x-4 items-center"> + <p className="text-lg leading-6 font-bold text-white"> + Text-to-speech Preference + </p> + </div> + <p className="text-xs leading-[18px] font-base text-white text-opacity-60"> + Here you can specify what kind of text-to-speech providers you would + want to use in your AnythingLLM experience. By default, we use the + browser's built in support for these services, but you may want to + use others. + </p> + </div> + <div className="w-full justify-end flex"> + {hasChanges && ( + <CTAButton + onClick={() => handleSubmit()} + className="mt-3 mr-0 -mb-14 z-10" + > + {saving ? "Saving..." : "Save changes"} + </CTAButton> + )} + </div> + <div className="text-base font-bold text-white mt-6 mb-4">Provider</div> + <div className="relative"> + {searchMenuOpen && ( + <div + className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10" + onClick={() => setSearchMenuOpen(false)} + /> + )} + {searchMenuOpen ? 
( + <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20"> + <div className="w-full flex flex-col gap-y-1"> + <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]"> + <MagnifyingGlass + size={20} + weight="bold" + className="absolute left-4 z-30 text-white -ml-4 my-2" + /> + <input + type="text" + name="tts-provider-search" + autoComplete="off" + placeholder="Search text to speech providers" + className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium" + onChange={(e) => setSearchQuery(e.target.value)} + ref={searchInputRef} + onKeyDown={(e) => { + if (e.key === "Enter") e.preventDefault(); + }} + /> + <X + size={20} + weight="bold" + className="cursor-pointer text-white hover:text-[#9CA3AF]" + onClick={handleXButton} + /> + </div> + <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4"> + {filteredProviders.map((provider) => ( + <LLMItem + key={provider.name} + name={provider.name} + value={provider.value} + image={provider.logo} + description={provider.description} + checked={selectedProvider === provider.value} + onClick={() => updateProviderChoice(provider.value)} + /> + ))} + </div> + </div> + </div> + ) : ( + <button + className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent hover:border-[#46C8FF] transition-all duration-300" + type="button" + onClick={() => setSearchMenuOpen(true)} + > + <div className="flex gap-x-4 items-center"> + <img + src={selectedProviderObject.logo} + alt={`${selectedProviderObject.name} logo`} + className="w-10 h-10 rounded-md" + /> + <div className="flex flex-col text-left"> + <div 
className="text-sm font-semibold text-white"> + {selectedProviderObject.name} + </div> + <div className="mt-1 text-xs text-[#D2D5DB]"> + {selectedProviderObject.description} + </div> + </div> + </div> + <CaretUpDown size={24} weight="bold" className="text-white" /> + </button> + )} + </div> + <div + onChange={() => setHasChanges(true)} + className="mt-4 flex flex-col gap-y-1" + > + {selectedProvider && + PROVIDERS.find( + (provider) => provider.value === selectedProvider + )?.options(settings)} + </div> + </div> + </form> + ); +} diff --git a/frontend/src/utils/paths.js b/frontend/src/utils/paths.js index 4dc4d5285..cc2b69eee 100644 --- a/frontend/src/utils/paths.js +++ b/frontend/src/utils/paths.js @@ -98,6 +98,9 @@ export default { transcriptionPreference: () => { return "/settings/transcription-preference"; }, + audioPreference: () => { + return "/settings/audio-preference"; + }, embedder: { modelPreference: () => "/settings/embedding-preference", chunkingPreference: () => "/settings/text-splitter-preference", diff --git a/frontend/yarn.lock b/frontend/yarn.lock index bd12e9fa3..93bdc0884 100644 --- a/frontend/yarn.lock +++ b/frontend/yarn.lock @@ -2841,6 +2841,11 @@ react-smooth@^4.0.0: prop-types "^15.8.1" react-transition-group "^4.4.5" +react-speech-recognition@^3.10.0: + version "3.10.0" + resolved "https://registry.yarnpkg.com/react-speech-recognition/-/react-speech-recognition-3.10.0.tgz#7aa43bb28d78b92671864dabba3a70489ccad27b" + integrity sha512-EVSr4Ik8l9urwdPiK2r0+ADrLyDDrjB0qBRdUWO+w2MfwEBrj6NuRmy1GD3x7BU/V6/hab0pl8Lupen0zwlJyw== + react-tag-input-component@^2.0.2: version "2.0.2" resolved "https://registry.yarnpkg.com/react-tag-input-component/-/react-tag-input-component-2.0.2.tgz#f62f013c6a535141dd1c6c3a88858223170150f1" diff --git a/server/.env.example b/server/.env.example index 290a07096..5e0233b7b 100644 --- a/server/.env.example +++ b/server/.env.example @@ -168,6 +168,19 @@ WHISPER_PROVIDER="local" # WHISPER_PROVIDER="openai" # 
OPEN_AI_KEY=sk-xxxxxxxx +########################################### +######## TTS/STT Model Selection ########## +########################################### +TTS_PROVIDER="native" + +# TTS_PROVIDER="openai" +# TTS_OPEN_AI_KEY=sk-example +# TTS_OPEN_AI_VOICE_MODEL=nova + +# TTS_PROVIDER="elevenlabs" +# TTS_ELEVEN_LABS_KEY= +# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel + # CLOUD DEPLOYMENT VARIRABLES ONLY # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. # STORAGE_DIR= # absolute filesystem path with no trailing slash diff --git a/server/endpoints/workspaces.js b/server/endpoints/workspaces.js index c22c679a0..81cbd6154 100644 --- a/server/endpoints/workspaces.js +++ b/server/endpoints/workspaces.js @@ -1,6 +1,11 @@ const path = require("path"); const fs = require("fs"); -const { reqBody, multiUserMode, userFromSession } = require("../utils/http"); +const { + reqBody, + multiUserMode, + userFromSession, + safeJsonParse, +} = require("../utils/http"); const { normalizePath } = require("../utils/files"); const { Workspace } = require("../models/workspace"); const { Document } = require("../models/documents"); @@ -25,6 +30,7 @@ const { determineWorkspacePfpFilepath, fetchPfp, } = require("../utils/files/pfp"); +const { getTTSProvider } = require("../utils/TextToSpeech"); function workspaceEndpoints(app) { if (!app) return; @@ -506,6 +512,48 @@ function workspaceEndpoints(app) { } ); + app.get( + "/workspace/:slug/tts/:chatId", + [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug], + async function (request, response) { + try { + const { chatId } = request.params; + const workspace = response.locals.workspace; + const cacheKey = `${workspace.slug}:${chatId}`; + const wsChat = await WorkspaceChats.get({ + id: Number(chatId), + workspaceId: workspace.id, + }); + + const cachedResponse = responseCache.get(cacheKey); + if (cachedResponse) { + response.writeHead(200, { + "Content-Type": cachedResponse.mime 
|| "audio/mpeg", + }); + response.end(cachedResponse.buffer); + return; + } + + const text = safeJsonParse(wsChat.response, null)?.text; + if (!text) return response.sendStatus(204).end(); + + const TTSProvider = getTTSProvider(); + const buffer = await TTSProvider.ttsBuffer(text); + if (buffer === null) return response.sendStatus(204).end(); + + responseCache.set(cacheKey, { buffer, mime: "audio/mpeg" }); + response.writeHead(200, { + "Content-Type": "audio/mpeg", + }); + response.end(buffer); + return; + } catch (error) { + console.error("Error processing the TTS request:", error); + response.status(500).json({ message: "TTS could not be completed" }); + } + } + ); + app.get( "/workspace/:slug/pfp", [validatedRequest, flexUserRoleValid([ROLES.all])], diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js index 904c448d5..248ca8cd7 100644 --- a/server/models/systemSettings.js +++ b/server/models/systemSettings.js @@ -131,6 +131,17 @@ const SystemSettings = { // -------------------------------------------------------- WhisperProvider: process.env.WHISPER_PROVIDER || "local", + // -------------------------------------------------------- + // TTS/STT Selection Settings & Configs + // - Currently the only 3rd party is OpenAI or the native browser-built in + // -------------------------------------------------------- + TextToSpeechProvider: process.env.TTS_PROVIDER || "native", + TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY, + TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL, + // Eleven Labs TTS + TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY, + TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL, + // -------------------------------------------------------- // Agent Settings & Configs // -------------------------------------------------------- diff --git a/server/package.json b/server/package.json index edee71b02..73b947c46 100644 --- a/server/package.json +++ b/server/package.json @@ -44,6 +44,7 @@ "cohere-ai": 
"^7.9.5", "cors": "^2.8.5", "dotenv": "^16.0.3", + "elevenlabs": "^0.5.0", "express": "^4.18.2", "express-ws": "^5.0.2", "extract-json-from-string": "^1.0.1", diff --git a/server/utils/TextToSpeech/elevenLabs/index.js b/server/utils/TextToSpeech/elevenLabs/index.js new file mode 100644 index 000000000..e3d25f3ae --- /dev/null +++ b/server/utils/TextToSpeech/elevenLabs/index.js @@ -0,0 +1,54 @@ +const { ElevenLabsClient, stream } = require("elevenlabs"); + +class ElevenLabsTTS { + constructor() { + if (!process.env.TTS_ELEVEN_LABS_KEY) + throw new Error("No ElevenLabs API key was set."); + this.elevenLabs = new ElevenLabsClient({ + apiKey: process.env.TTS_ELEVEN_LABS_KEY, + }); + + // Rachel as default voice + // https://api.elevenlabs.io/v1/voices + this.voiceId = + process.env.TTS_ELEVEN_LABS_VOICE_MODEL ?? "21m00Tcm4TlvDq8ikWAM"; + this.modelId = "eleven_multilingual_v2"; + } + + static async voices(apiKey = null) { + try { + const client = new ElevenLabsClient({ + apiKey: apiKey ?? process.env.TTS_ELEVEN_LABS_KEY ?? null, + }); + return (await client.voices.getAll())?.voices ?? 
[]; + } catch {} + return []; + } + + #stream2buffer(stream) { + return new Promise((resolve, reject) => { + const _buf = []; + stream.on("data", (chunk) => _buf.push(chunk)); + stream.on("end", () => resolve(Buffer.concat(_buf))); + stream.on("error", (err) => reject(err)); + }); + } + + async ttsBuffer(textInput) { + try { + const audio = await this.elevenLabs.generate({ + voice: this.voiceId, + text: textInput, + model_id: "eleven_multilingual_v2", + }); + return Buffer.from(await this.#stream2buffer(audio)); + } catch (e) { + console.error(e); + } + return null; + } +} + +module.exports = { + ElevenLabsTTS, +}; diff --git a/server/utils/TextToSpeech/index.js b/server/utils/TextToSpeech/index.js new file mode 100644 index 000000000..155fc9540 --- /dev/null +++ b/server/utils/TextToSpeech/index.js @@ -0,0 +1,15 @@ +function getTTSProvider() { + const provider = process.env.TTS_PROVIDER || "openai"; + switch (provider) { + case "openai": + const { OpenAiTTS } = require("./openAi"); + return new OpenAiTTS(); + case "elevenlabs": + const { ElevenLabsTTS } = require("./elevenLabs"); + return new ElevenLabsTTS(); + default: + throw new Error("ENV: No TTS_PROVIDER value found in environment!"); + } +} + +module.exports = { getTTSProvider }; diff --git a/server/utils/TextToSpeech/openAi/index.js b/server/utils/TextToSpeech/openAi/index.js new file mode 100644 index 000000000..3c5b4840d --- /dev/null +++ b/server/utils/TextToSpeech/openAi/index.js @@ -0,0 +1,29 @@ +class OpenAiTTS { + constructor() { + if (!process.env.TTS_OPEN_AI_KEY) + throw new Error("No OpenAI API key was set."); + const { OpenAI: OpenAIApi } = require("openai"); + this.openai = new OpenAIApi({ + apiKey: process.env.TTS_OPEN_AI_KEY, + }); + this.voice = process.env.TTS_OPEN_AI_VOICE_MODEL ?? 
"alloy"; + } + + async ttsBuffer(textInput) { + try { + const result = await this.openai.audio.speech.create({ + model: "tts-1", + voice: this.voice, + input: textInput, + }); + return Buffer.from(await result.arrayBuffer()); + } catch (e) { + console.error(e); + } + return null; + } +} + +module.exports = { + OpenAiTTS, +}; diff --git a/server/utils/helpers/customModels.js b/server/utils/helpers/customModels.js index b7aae93be..caf5a77c7 100644 --- a/server/utils/helpers/customModels.js +++ b/server/utils/helpers/customModels.js @@ -4,6 +4,7 @@ const { } = require("../AiProviders/openRouter"); const { perplexityModels } = require("../AiProviders/perplexity"); const { togetherAiModels } = require("../AiProviders/togetherAi"); +const { ElevenLabsTTS } = require("../TextToSpeech/elevenLabs"); const SUPPORT_CUSTOM_MODELS = [ "openai", "localai", @@ -15,6 +16,7 @@ const SUPPORT_CUSTOM_MODELS = [ "openrouter", "lmstudio", "koboldcpp", + "elevenlabs-tts", ]; async function getCustomModels(provider = "", apiKey = null, basePath = null) { @@ -42,6 +44,8 @@ async function getCustomModels(provider = "", apiKey = null, basePath = null) { return await getLMStudioModels(basePath); case "koboldcpp": return await getKoboldCPPModels(basePath); + case "elevenlabs-tts": + return await getElevenLabsModels(apiKey); default: return { models: [], error: "Invalid provider for custom models" }; } @@ -321,6 +325,32 @@ function nativeLLMModels() { return { models: files, error: null }; } +async function getElevenLabsModels(apiKey = null) { + const models = (await ElevenLabsTTS.voices(apiKey)).map((model) => { + return { + id: model.voice_id, + organization: model.category, + name: model.name, + }; + }); + + if (models.length === 0) { + return { + models: [ + { + id: "21m00Tcm4TlvDq8ikWAM", + organization: "premade", + name: "Rachel (default)", + }, + ], + error: null, + }; + } + + if (models.length > 0 && !!apiKey) process.env.TTS_ELEVEN_LABS_KEY = apiKey; + return { models, error: null }; 
+} + module.exports = { getCustomModels, }; diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index 947fbc624..e2b1d2e1c 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -366,6 +366,32 @@ const KEY_MAPPING = { envKey: "AGENT_SERPER_DEV_KEY", checks: [], }, + + // TTS/STT Integration ENVS + TextToSpeechProvider: { + envKey: "TTS_PROVIDER", + checks: [supportedTTSProvider], + }, + + // TTS OpenAI + TTSOpenAIKey: { + envKey: "TTS_OPEN_AI_KEY", + checks: [validOpenAIKey], + }, + TTSOpenAIVoiceModel: { + envKey: "TTS_OPEN_AI_VOICE_MODEL", + checks: [], + }, + + // TTS ElevenLabs + TTSElevenLabsKey: { + envKey: "TTS_ELEVEN_LABS_KEY", + checks: [isNotEmpty], + }, + TTSElevenLabsVoiceModel: { + envKey: "TTS_ELEVEN_LABS_VOICE_MODEL", + checks: [], + }, }; function isNotEmpty(input = "") { @@ -419,6 +445,11 @@ function validOllamaLLMBasePath(input = "") { } } +function supportedTTSProvider(input = "") { + const validSelection = ["native", "openai", "elevenlabs"].includes(input); + return validSelection ? 
null : `${input} is not a valid TTS provider.`; +} + function supportedLLM(input = "") { const validSelection = [ "openai", diff --git a/server/yarn.lock b/server/yarn.lock index 5edd09a35..9e4f184d5 100644 --- a/server/yarn.lock +++ b/server/yarn.lock @@ -1901,6 +1901,11 @@ combined-stream@^1.0.8: dependencies: delayed-stream "~1.0.0" +command-exists@^1.2.9: + version "1.2.9" + resolved "https://registry.yarnpkg.com/command-exists/-/command-exists-1.2.9.tgz#c50725af3808c8ab0260fd60b01fbfa25b954f69" + integrity sha512-LTQ/SGc+s0Xc0Fu5WaKnR0YiygZkm9eKFvyS+fRsU7/ZWFF8ykFM6Pc9aCVf1+xasOOZpO3BAVgVrKvsqKHV7w== + command-line-args@5.2.1, command-line-args@^5.2.1: version "5.2.1" resolved "https://registry.yarnpkg.com/command-line-args/-/command-line-args-5.2.1.tgz#c44c32e437a57d7c51157696893c5909e9cec42e" @@ -2255,6 +2260,18 @@ ee-first@1.1.1: resolved "https://registry.yarnpkg.com/ee-first/-/ee-first-1.1.1.tgz#590c61156b0ae2f4f0255732a158b266bc56b21d" integrity sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow== +elevenlabs@^0.5.0: + version "0.5.0" + resolved "https://registry.yarnpkg.com/elevenlabs/-/elevenlabs-0.5.0.tgz#07eb1a943b0ab99b925875bd5c57833a3a024e58" + integrity sha512-jfex4ecuWIlyAUuMrMJAJNa5MLziqYQOCDw4ZYuoc9PCYLxtHwaYBWpZoDhnYMcceLI7rRRvmbLMcT9HlVMfHA== + dependencies: + command-exists "^1.2.9" + execa "^5.1.1" + form-data "4.0.0" + node-fetch "2.7.0" + qs "6.11.2" + url-join "4.0.1" + emoji-regex@^10.2.1: version "10.3.0" resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-10.3.0.tgz#76998b9268409eb3dae3de989254d456e70cfe23" @@ -2605,6 +2622,21 @@ eventemitter3@^4.0.4: resolved "https://registry.yarnpkg.com/eventemitter3/-/eventemitter3-4.0.7.tgz#2de9b68f6528d5644ef5c59526a1b4a07306169f" integrity sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw== +execa@^5.1.1: + version "5.1.1" + resolved 
"https://registry.yarnpkg.com/execa/-/execa-5.1.1.tgz#f80ad9cbf4298f7bd1d4c9555c21e93741c411dd" + integrity sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg== + dependencies: + cross-spawn "^7.0.3" + get-stream "^6.0.0" + human-signals "^2.1.0" + is-stream "^2.0.0" + merge-stream "^2.0.0" + npm-run-path "^4.0.1" + onetime "^5.1.2" + signal-exit "^3.0.3" + strip-final-newline "^2.0.0" + expand-template@^2.0.3: version "2.0.3" resolved "https://registry.yarnpkg.com/expand-template/-/expand-template-2.0.3.tgz#6e14b3fcee0f3a6340ecb57d2e8918692052a47c" @@ -3024,6 +3056,11 @@ get-stream@^5.1.0: dependencies: pump "^3.0.0" +get-stream@^6.0.0: + version "6.0.1" + resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-6.0.1.tgz#a262d8eef67aced57c2852ad6167526a43cbf7b7" + integrity sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg== + get-symbol-description@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/get-symbol-description/-/get-symbol-description-1.0.2.tgz#533744d5aa20aca4e079c8e5daf7fd44202821f5" @@ -3297,6 +3334,11 @@ https-proxy-agent@^7.0.0: agent-base "^7.0.2" debug "4" +human-signals@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" + integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== + humanize-ms@^1.2.1: version "1.2.1" resolved "https://registry.yarnpkg.com/humanize-ms/-/humanize-ms-1.2.1.tgz#c46e3159a293f6b896da29316d8b6fe8bb79bbed" @@ -4092,6 +4134,11 @@ merge-descriptors@1.0.1: resolved "https://registry.yarnpkg.com/merge-descriptors/-/merge-descriptors-1.0.1.tgz#b00aaa556dd8b44568150ec9d1b953f3f90cbb61" integrity sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w== +merge-stream@^2.0.0: + version "2.0.0" + resolved 
"https://registry.yarnpkg.com/merge-stream/-/merge-stream-2.0.0.tgz#52823629a14dd00c9770fb6ad47dc6310f2c1f60" + integrity sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w== + methods@~1.1.2: version "1.1.2" resolved "https://registry.yarnpkg.com/methods/-/methods-1.1.2.tgz#5529a4d67654134edcc5266656835b0f851afcee" @@ -4455,6 +4502,13 @@ normalize-path@^3.0.0, normalize-path@~3.0.0: resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65" integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA== +npm-run-path@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea" + integrity sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw== + dependencies: + path-key "^3.0.0" + npmlog@^5.0.1: version "5.0.1" resolved "https://registry.yarnpkg.com/npmlog/-/npmlog-5.0.1.tgz#f06678e80e29419ad67ab964e0fa69959c1eb8b0" @@ -4593,7 +4647,7 @@ one-time@^1.0.0: dependencies: fn.name "1.x.x" -onetime@^5.1.0: +onetime@^5.1.0, onetime@^5.1.2: version "5.1.2" resolved "https://registry.yarnpkg.com/onetime/-/onetime-5.1.2.tgz#d0e96ebb56b07476df1dd9c4806e5237985ca45e" integrity sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg== @@ -4774,7 +4828,7 @@ path-is-absolute@^1.0.0: resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f" integrity sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg== -path-key@^3.1.0: +path-key@^3.0.0, path-key@^3.1.0: version "3.1.1" resolved "https://registry.yarnpkg.com/path-key/-/path-key-3.1.1.tgz#581f6ade658cbba65a0d3380de7753295054f375" integrity 
sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q== @@ -5322,7 +5376,7 @@ side-channel@^1.0.4, side-channel@^1.0.6: get-intrinsic "^1.2.4" object-inspect "^1.13.1" -signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.7: +signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.3, signal-exit@^3.0.7: version "3.0.7" resolved "https://registry.yarnpkg.com/signal-exit/-/signal-exit-3.0.7.tgz#a9a1767f8af84155114eaabd73f99273c8f59ad9" integrity sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ== @@ -5559,6 +5613,11 @@ strip-ansi@^7.0.1, strip-ansi@^7.1.0: dependencies: ansi-regex "^6.0.1" +strip-final-newline@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad" + integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA== + strip-json-comments@^3.1.1: version "3.1.1" resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-3.1.1.tgz#31f1281b3832630434831c310c01cccda8cbe006" -- GitLab