diff --git a/.vscode/settings.json b/.vscode/settings.json index 110c4fa6ed555efea6bb471413860b140c7bdc72..4930aa2d1536d873952d1a10fd765ee5ac9f1289 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -11,6 +11,7 @@ "cooldowns", "Deduplicator", "Dockerized", + "elevenlabs", "Embeddable", "epub", "GROQ", diff --git a/docker/.env.example b/docker/.env.example index 8cfa2aea8723ecce749453ca2556f4ab7504109f..70059ea5135a54ebe375a026fc5b636f2ccdb636 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -171,6 +171,19 @@ GID='1000' # WHISPER_PROVIDER="openai" # OPEN_AI_KEY=sk-xxxxxxxx +########################################### +######## TTS/STT Model Selection ########## +########################################### +# TTS_PROVIDER="native" + +# TTS_PROVIDER="openai" +# TTS_OPEN_AI_KEY=sk-example +# TTS_OPEN_AI_VOICE_MODEL=nova + +# TTS_PROVIDER="elevenlabs" +# TTS_ELEVEN_LABS_KEY= +# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel + # CLOUD DEPLOYMENT VARIRABLES ONLY # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. # DISABLE_TELEMETRY="false" diff --git a/frontend/package.json b/frontend/package.json index ded06aa9c77217a1374a885c52538909eb62e320..11e612fcdfd4295116ec45d765fd7546d76420a4 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -28,6 +28,7 @@ "react-dropzone": "^14.2.3", "react-loading-skeleton": "^3.1.0", "react-router-dom": "^6.3.0", + "react-speech-recognition": "^3.10.0", "react-tag-input-component": "^2.0.2", "react-toastify": "^9.1.3", "react-tooltip": "^5.25.2", diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index 0a5ed65fc85b5345a7429dbd025523a9f09c5d6b..b29e6eea925e1a9d7cdf1745aa02001daf99a967 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -32,6 +32,9 @@ const GeneralLLMPreference = lazy( const GeneralTranscriptionPreference = lazy( () => import("@/pages/GeneralSettings/TranscriptionPreference") ); +const GeneralAudioPreference = lazy( + () => import("@/pages/GeneralSettings/AudioPreference") +); const GeneralEmbeddingPreference = lazy( () => import("@/pages/GeneralSettings/EmbeddingPreference") ); @@ -85,6 +88,10 @@ export default function App() { <AdminRoute Component={GeneralTranscriptionPreference} /> } /> + <Route + path="/settings/audio-preference" + element={<AdminRoute Component={GeneralAudioPreference} />} + /> <Route path="/settings/embedding-preference" element={<AdminRoute Component={GeneralEmbeddingPreference} />} diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx index 67797d266190570339b0a6e64f14213828085eb5..6b8f79e5eddfe8b28f530d65ed28b09f5644d09a 100644 --- a/frontend/src/components/SettingsSidebar/index.jsx +++ b/frontend/src/components/SettingsSidebar/index.jsx @@ -21,6 +21,7 @@ import { ClosedCaptioning, EyeSlash, SplitVertical, + Microphone, } from "@phosphor-icons/react"; import useUser from "@/hooks/useUser"; import { USER_BACKGROUND_COLOR } from "@/utils/constants"; @@ -280,6 +281,14 @@ const SidebarOptions = ({ user = null }) => ( flex={true} allowedRole={["admin"]} /> + <Option + href={paths.settings.audioPreference()} + btnText="Voice and Speech Support" + icon={<Microphone className="h-5 w-5 flex-shrink-0" />} + user={user} + flex={true} + allowedRole={["admin"]} + /> <Option href={paths.settings.transcriptionPreference()} btnText="Transcription Model" diff --git a/frontend/src/components/SpeechToText/BrowserNative/index.jsx 
b/frontend/src/components/SpeechToText/BrowserNative/index.jsx new file mode 100644 index 0000000000000000000000000000000000000000..1e9bcb3c2e58b7917538c93c0c690b1e90d73171 --- /dev/null +++ b/frontend/src/components/SpeechToText/BrowserNative/index.jsx @@ -0,0 +1,9 @@ +export default function BrowserNative() { + return ( + <div className="w-full h-10 items-center flex"> + <p className="text-sm font-base text-white text-opacity-60"> + There is no configuration needed for this provider. + </p> + </div> + ); +} diff --git a/frontend/src/components/TextToSpeech/BrowserNative/index.jsx b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx new file mode 100644 index 0000000000000000000000000000000000000000..1e9bcb3c2e58b7917538c93c0c690b1e90d73171 --- /dev/null +++ b/frontend/src/components/TextToSpeech/BrowserNative/index.jsx @@ -0,0 +1,9 @@ +export default function BrowserNative() { + return ( + <div className="w-full h-10 items-center flex"> + <p className="text-sm font-base text-white text-opacity-60"> + There is no configuration needed for this provider. + </p> + </div> + ); +} diff --git a/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx new file mode 100644 index 0000000000000000000000000000000000000000..ad86caa1cbe78bd7e997047d5908344d1a5e289c --- /dev/null +++ b/frontend/src/components/TextToSpeech/ElevenLabsOptions/index.jsx @@ -0,0 +1,107 @@ +import { useState, useEffect } from "react"; +import System from "@/models/system"; + +export default function ElevenLabsOptions({ settings }) { + const [inputValue, setInputValue] = useState(settings?.TTSElevenLabsKey); + const [openAIKey, setOpenAIKey] = useState(settings?.TTSElevenLabsKey); + + return ( + <div className="flex gap-x-4"> + <div className="flex flex-col w-60"> + <label className="text-white text-sm font-semibold block mb-4"> + API Key + </label> + <input + type="password" + name="TTSElevenLabsKey" + className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5" + placeholder="ElevenLabs API Key" + defaultValue={settings?.TTSElevenLabsKey ? "*".repeat(20) : ""} + required={true} + autoComplete="off" + spellCheck={false} + onChange={(e) => setInputValue(e.target.value)} + onBlur={() => setOpenAIKey(inputValue)} + /> + </div> + {!settings?.credentialsOnly && ( + <ElevenLabsModelSelection settings={settings} apiKey={openAIKey} /> + )} + </div> + ); +} + +function ElevenLabsModelSelection({ apiKey, settings }) { + const [groupedModels, setGroupedModels] = useState({}); + const [loading, setLoading] = useState(true); + + useEffect(() => { + async function findCustomModels() { + setLoading(true); + const { models } = await System.customModels( + "elevenlabs-tts", + typeof apiKey === "boolean" ? 
null : apiKey
+      );
+
+      if (models?.length > 0) {
+        const modelsByOrganization = models.reduce((acc, model) => {
+          acc[model.organization] = acc[model.organization] || [];
+          acc[model.organization].push(model);
+          return acc;
+        }, {});
+        setGroupedModels(modelsByOrganization);
+      }
+
+      setLoading(false);
+    }
+    findCustomModels();
+  }, [apiKey]);
+
+  if (loading) {
+    return (
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Voice Model Selection
+        </label>
+        <select
+          name="TTSElevenLabsVoiceModel"
+          disabled={true}
+          className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+        >
+          <option disabled={true} selected={true}>
+            -- loading available models --
+          </option>
+        </select>
+      </div>
+    );
+  }
+
+  return (
+    <div className="flex flex-col w-60">
+      <label className="text-white text-sm font-semibold block mb-4">
+        Voice Model Selection
+      </label>
+      <select
+        name="TTSElevenLabsVoiceModel"
+        required={true}
+        className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+      >
+        {Object.keys(groupedModels)
+          .sort()
+          .map((organization) => (
+            <optgroup key={organization} label={organization}>
+              {groupedModels[organization].map((model) => (
+                <option
+                  key={model.id}
+                  value={model.id}
+                  selected={settings?.TTSElevenLabsVoiceModel === model.id}
+                >
+                  {model.name}
+                </option>
+              ))}
+            </optgroup>
+          ))}
+      </select>
+    </div>
+  );
+}
diff --git a/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
new file mode 100644
index 0000000000000000000000000000000000000000..4183a4e5800cf4dbc085c6bc3157dc9689a17b3a
--- /dev/null
+++ b/frontend/src/components/TextToSpeech/OpenAiOptions/index.jsx
@@ -0,0 +1,45 @@
+function toProperCase(string) {
+  return string.replace(/\w\S*/g, function (txt) {
+    return txt.charAt(0).toUpperCase() + txt.substr(1).toLowerCase();
+  });
+}
+
+export default function OpenAiTextToSpeechOptions({ settings }) {
+  const apiKey = settings?.TTSOpenAIKey;
+
+  return (
+    <div className="flex gap-x-4">
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          API Key
+        </label>
+        <input
+          type="password"
+          name="TTSOpenAIKey"
+          className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="OpenAI API Key"
+          defaultValue={apiKey ? "*".repeat(20) : ""}
+          required={true}
+          autoComplete="off"
+          spellCheck={false}
+        />
+      </div>
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Voice Model
+        </label>
+        <select
+          name="TTSOpenAIVoiceModel"
+          defaultValue={settings?.TTSOpenAIVoiceModel ??
"alloy"} + className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" + > + {["alloy", "echo", "fable", "onyx", "nova", "shimmer"].map( + (voice) => { + return <option value={voice}>{toProperCase(voice)}</option>; + } + )} + </select> + </div> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx new file mode 100644 index 0000000000000000000000000000000000000000..1947f0057f4086b9784e15c3155e64a4b5291363 --- /dev/null +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/asyncTts.jsx @@ -0,0 +1,94 @@ +import { useEffect, useState, useRef } from "react"; +import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react"; +import { Tooltip } from "react-tooltip"; +import Workspace from "@/models/workspace"; +import showToast from "@/utils/toast"; + +export default function AsyncTTSMessage({ slug, chatId }) { + const playerRef = useRef(null); + const [speaking, setSpeaking] = useState(false); + const [loading, setLoading] = useState(false); + const [audioSrc, setAudioSrc] = useState(null); + + function speakMessage() { + if (speaking) { + playerRef?.current?.pause(); + return; + } + + try { + if (!audioSrc) { + setLoading(true); + Workspace.ttsMessage(slug, chatId) + .then((audioBlob) => { + if (!audioBlob) + throw new Error("Failed to load or play TTS message response."); + setAudioSrc(audioBlob); + }) + .catch((e) => showToast(e.message, "error", { clear: true })) + .finally(() => setLoading(false)); + } else { + playerRef.current.play(); + } + } catch (e) { + console.error(e); + setLoading(false); + setSpeaking(false); + } + } + + useEffect(() => { + function setupPlayer() { + if (!playerRef?.current) return; + playerRef.current.addEventListener("play", () => { + setSpeaking(true); + }); + + playerRef.current.addEventListener("pause", () => { + playerRef.current.currentTime = 0; + setSpeaking(false); + }); + } + setupPlayer(); + }, []); + + if (!chatId) return null; + return ( + <div className="mt-3 relative"> + <button + onClick={speakMessage} + data-tooltip-id="message-to-speech" + data-tooltip-content={ + speaking ? "Pause TTS speech of message" : "TTS Speak message" + } + className="border-none text-zinc-300" + aria-label={speaking ? "Pause speech" : "Speak message"} + > + {speaking ? ( + <PauseCircle size={18} className="mb-1" /> + ) : ( + <> + {loading ? 
( + <CircleNotch size={18} className="mb-1 animate-spin" /> + ) : ( + <SpeakerHigh size={18} className="mb-1" /> + )} + </> + )} + <audio + ref={playerRef} + hidden={true} + src={audioSrc} + autoPlay={true} + controls={false} + /> + </button> + <Tooltip + id="message-to-speech" + place="bottom" + delayShow={300} + className="tooltip !text-xs" + /> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx new file mode 100644 index 0000000000000000000000000000000000000000..644a57afc384a5389770a7055dcd392790cf51ef --- /dev/null +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/index.jsx @@ -0,0 +1,23 @@ +import { useEffect, useState } from "react"; +import NativeTTSMessage from "./native"; +import AsyncTTSMessage from "./asyncTts"; +import System from "@/models/system"; + +export default function TTSMessage({ slug, chatId, message }) { + const [provider, setProvider] = useState("native"); + const [loading, setLoading] = useState(true); + + useEffect(() => { + async function getSettings() { + const _settings = await System.keys(); + setProvider(_settings?.TextToSpeechProvider ?? "native"); + setLoading(false); + } + getSettings(); + }, []); + + if (loading) return null; + if (provider !== "native") + return <AsyncTTSMessage slug={slug} chatId={chatId} />; + return <NativeTTSMessage message={message} />; +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx new file mode 100644 index 0000000000000000000000000000000000000000..5f3bd3f69cdad0c5bae51dffa26e21959d3795cc --- /dev/null +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx @@ -0,0 +1,61 @@ +import React, { useEffect, useState } from "react"; +import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react"; +import { Tooltip } from "react-tooltip"; + +export default function NativeTTSMessage({ message }) { + const [speaking, setSpeaking] = useState(false); + const [supported, setSupported] = useState(false); + useEffect(() => { + setSupported("speechSynthesis" in window); + }, []); + + function endSpeechUtterance() { + window.speechSynthesis?.cancel(); + setSpeaking(false); + return; + } + + function speakMessage() { + // if the user is pausing this particular message + // while the synth is speaking we can end it. + // If they are clicking another message's TTS + // we need to ignore that until they pause the one that is playing. + if (window.speechSynthesis.speaking && speaking) { + endSpeechUtterance(); + return; + } + + if (window.speechSynthesis.speaking && !speaking) return; + const utterance = new SpeechSynthesisUtterance(message); + utterance.addEventListener("end", endSpeechUtterance); + window.speechSynthesis.speak(utterance); + setSpeaking(true); + } + + if (!supported) return null; + return ( + <div className="mt-3 relative"> + <button + onClick={speakMessage} + data-tooltip-id="message-to-speech" + data-tooltip-content={ + speaking ? "Pause TTS speech of message" : "TTS Speak message" + } + className="border-none text-zinc-300" + aria-label={speaking ? "Pause speech" : "Speak message"} + > + {speaking ? 
( + <PauseCircle size={18} className="mb-1" /> + ) : ( + <SpeakerHigh size={18} className="mb-1" /> + )} + </button> + <Tooltip + id="message-to-speech" + place="bottom" + delayShow={300} + className="tooltip !text-xs" + /> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx index 3bdee472d8e917e1b3deb7800c66a0934b5b5449..52ae1466a82ba0e82095a74318c1c3cd3aa74cd8 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/index.jsx @@ -1,4 +1,4 @@ -import React, { memo, useEffect, useState } from "react"; +import React, { memo, useState } from "react"; import useCopyText from "@/hooks/useCopyText"; import { Check, @@ -6,11 +6,10 @@ import { ThumbsUp, ThumbsDown, ArrowsClockwise, - SpeakerHigh, - PauseCircle, } from "@phosphor-icons/react"; import { Tooltip } from "react-tooltip"; import Workspace from "@/models/workspace"; +import TTSMessage from "./TTSButton"; const Actions = ({ message, @@ -60,7 +59,7 @@ const Actions = ({ </> )} </div> - <TTSMessage message={message} /> + <TTSMessage slug={slug} chatId={chatId} message={message} /> </div> ); }; @@ -149,62 +148,4 @@ function RegenerateMessage({ regenerateMessage, chatId }) { ); } -function TTSMessage({ message }) { - const [speaking, setSpeaking] = useState(false); - const [supported, setSupported] = useState(false); - useEffect(() => { - setSupported("speechSynthesis" in window); - }, []); - - function endSpeechUtterance() { - window.speechSynthesis?.cancel(); - setSpeaking(false); - return; - } - - function speakMessage() { - // if the user is pausing this particular message - // while the synth if speaking we can end it. - // If they are clicking another message's TTS - // we need to ignore that until they pause the one that is playing. - if (window.speechSynthesis.speaking && speaking) { - endSpeechUtterance(); - return; - } - - if (window.speechSynthesis.speaking && !speaking) return; - const utterance = new SpeechSynthesisUtterance(message); - utterance.addEventListener("end", endSpeechUtterance); - window.speechSynthesis.speak(utterance); - setSpeaking(true); - } - - if (!supported) return null; - return ( - <div className="mt-3 relative"> - <button - onClick={speakMessage} - data-tooltip-id="message-to-speech" - data-tooltip-content={ - speaking ? "Pause TTS speech of message" : "TTS Speak message" - } - className="border-none text-zinc-300" - aria-label={speaking ? "Pause speech" : "Speak message"} - > - {speaking ? 
(
-          <PauseCircle size={18} className="mb-1" />
-        ) : (
-          <SpeakerHigh size={18} className="mb-1" />
-        )}
-      </button>
-      <Tooltip
-        id="message-to-speech"
-        place="bottom"
-        delayShow={300}
-        className="tooltip !text-xs"
-      />
-    </div>
-  );
-}
-
 export default memo(Actions);
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
new file mode 100644
index 0000000000000000000000000000000000000000..6cbcfbf8d5a42c58138520b26f9ada6635b5bccb
--- /dev/null
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
@@ -0,0 +1,82 @@
+import { useEffect } from "react";
+import { Microphone } from "@phosphor-icons/react";
+import { Tooltip } from "react-tooltip";
+import _regeneratorRuntime from "regenerator-runtime";
+import SpeechRecognition, {
+  useSpeechRecognition,
+} from "react-speech-recognition";
+
+let timeout;
+const SILENCE_INTERVAL = 3_200; // milliseconds of silence to wait before ending the STT session.
+export default function SpeechToText({ sendCommand }) {
+  const {
+    transcript,
+    listening,
+    resetTranscript,
+    browserSupportsSpeechRecognition,
+    browserSupportsContinuousListening,
+    isMicrophoneAvailable,
+  } = useSpeechRecognition({
+    clearTranscriptOnListen: true,
+  });
+
+  function startSTTSession() {
+    if (!isMicrophoneAvailable) {
+      alert(
+        "AnythingLLM does not have access to the microphone. Please enable microphone access for this site to use this feature."
+      );
+      return;
+    }
+
+    resetTranscript();
+    SpeechRecognition.startListening({
+      continuous: browserSupportsContinuousListening,
+      language: window?.navigator?.language ?? "en-US",
+    });
+  }
+
+  function endSTTSession() {
+    SpeechRecognition.stopListening();
+    if (transcript.length > 0) {
+      sendCommand(transcript, true);
+    }
+
+    resetTranscript();
+    clearTimeout(timeout);
+  }
+
+  useEffect(() => {
+    if (transcript?.length > 0) {
+      sendCommand(transcript, false);
+      clearTimeout(timeout);
+      timeout = setTimeout(() => {
+        endSTTSession();
+      }, SILENCE_INTERVAL);
+    }
+  }, [transcript]);
+
+  if (!browserSupportsSpeechRecognition) return null;
+  return (
+    <div
+      id="text-size-btn"
+      data-tooltip-id="tooltip-text-size-btn"
+      data-tooltip-content="Speak your prompt"
+      aria-label="Speak your prompt"
+      onClick={listening ? endSTTSession : startSTTSession}
+      className={`relative flex justify-center items-center opacity-60 hover:opacity-100 cursor-pointer ${
+        !!listening ?
"!opacity-100" : "" + }`} + > + <Microphone + weight="fill" + className="w-6 h-6 pointer-events-none text-white" + /> + <Tooltip + id="tooltip-text-size-btn" + place="top" + delayShow={300} + className="tooltip !text-xs z-99" + /> + </div> + ); +} diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx index 98ad11f8f9864b6c9a7e4b6d516fb9fa99437863..df08bcc7c4651fa1f07a3fb2950847d0ce4b7306 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx @@ -12,6 +12,7 @@ import AvailableAgentsButton, { useAvailableAgents, } from "./AgentMenu"; import TextSizeButton from "./TextSizeMenu"; +import SpeechToText from "./SpeechToText"; export const PROMPT_INPUT_EVENT = "set_prompt_input"; export default function PromptInput({ @@ -34,6 +35,7 @@ export default function PromptInput({ function handlePromptUpdate(e) { setPromptInput(e?.detail ?? ""); } + useEffect(() => { if (!!window) window.addEventListener(PROMPT_INPUT_EVENT, handlePromptUpdate); @@ -156,6 +158,9 @@ export default function PromptInput({ /> <TextSizeButton /> </div> + <div className="flex gap-x-2"> + <SpeechToText sendCommand={sendCommand} /> + </div> </div> </div> </div> diff --git a/frontend/src/media/ttsproviders/elevenlabs.png b/frontend/src/media/ttsproviders/elevenlabs.png new file mode 100644 index 0000000000000000000000000000000000000000..b1047e422e3c7a855caec5ef2bae1fcc647495e5 Binary files /dev/null and b/frontend/src/media/ttsproviders/elevenlabs.png differ diff --git a/frontend/src/models/system.js b/frontend/src/models/system.js index e64b01199864eb29bf7dd2e326b9d17429943520..f8f123448494c49d8170679440299d27cda99353 100644 --- a/frontend/src/models/system.js +++ b/frontend/src/models/system.js @@ -332,7 +332,7 @@ const System = { }) .then((blob) => (blob ? URL.createObjectURL(blob) : null)) .catch((e) => { - console.log(e); + // console.log(e); return null; }); }, diff --git a/frontend/src/models/workspace.js b/frontend/src/models/workspace.js index 91f4a2db332b26b91055be3dd639782c251d1935..64732c0441e0c4d9bd6cafb0bf04fc50b576300f 100644 --- a/frontend/src/models/workspace.js +++ b/frontend/src/models/workspace.js @@ -272,6 +272,21 @@ const Workspace = { return false; }); }, + ttsMessage: async function (slug, chatId) { + return await fetch(`${API_BASE}/workspace/${slug}/tts/${chatId}`, { + method: "GET", + cache: "no-cache", + headers: baseHeaders(), + }) + .then((res) => { + if (res.ok && res.status !== 204) return res.blob(); + throw new Error("Failed to fetch TTS."); + }) + .then((blob) => (blob ? URL.createObjectURL(blob) : null)) + .catch((e) => { + return null; + }); + }, threads: WorkspaceThread, uploadPfp: async function (formData, slug) { @@ -302,7 +317,7 @@ const Workspace = { }) .then((blob) => (blob ? 
URL.createObjectURL(blob) : null)) .catch((e) => { - console.log(e); + // console.log(e); return null; }); }, diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx new file mode 100644 index 0000000000000000000000000000000000000000..c4abaf5461ee597dbfd243490e5f9ad170d55f9d --- /dev/null +++ b/frontend/src/pages/GeneralSettings/AudioPreference/index.jsx @@ -0,0 +1,45 @@ +import React, { useEffect, useState, useRef } from "react"; +import { isMobile } from "react-device-detect"; +import Sidebar from "@/components/SettingsSidebar"; +import System from "@/models/system"; +import PreLoader from "@/components/Preloader"; +import SpeechToTextProvider from "./stt"; +import TextToSpeechProvider from "./tts"; + +export default function AudioPreference() { + const [settings, setSettings] = useState(null); + const [loading, setLoading] = useState(true); + + useEffect(() => { + async function fetchKeys() { + const _settings = await System.keys(); + setSettings(_settings); + setLoading(false); + } + fetchKeys(); + }, []); + + return ( + <div className="w-screen h-screen overflow-hidden bg-sidebar flex"> + <Sidebar /> + {loading ? ( + <div + style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }} + className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll" + > + <div className="w-full h-full flex justify-center items-center"> + <PreLoader /> + </div> + </div> + ) : ( + <div + style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }} + className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll" + > + <SpeechToTextProvider settings={settings} /> + <TextToSpeechProvider settings={settings} /> + </div> + )} + </div> + ); +} diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx new file mode 100644 index 0000000000000000000000000000000000000000..58bb1489be11465abfe7f05a4e04294d20796aec --- /dev/null +++ b/frontend/src/pages/GeneralSettings/AudioPreference/stt.jsx @@ -0,0 +1,191 @@ +import React, { useEffect, useState, useRef } from "react"; +import System from "@/models/system"; +import showToast from "@/utils/toast"; +import LLMItem from "@/components/LLMSelection/LLMItem"; +import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react"; +import CTAButton from "@/components/lib/CTAButton"; +import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png"; +import BrowserNative from "@/components/SpeechToText/BrowserNative"; + +const PROVIDERS = [ + { + name: "System native", + value: "native", + logo: AnythingLLMIcon, + options: (settings) => <BrowserNative settings={settings} />, + description: "Uses your browser's built in STT service if supported.", + }, +]; + +export default function SpeechToTextProvider({ settings }) { + const [saving, setSaving] = useState(false); + const [hasChanges, setHasChanges] = useState(false); + const [searchQuery, setSearchQuery] = useState(""); + const [filteredProviders, setFilteredProviders] = useState([]); + const [selectedProvider, setSelectedProvider] = useState( + settings?.SpeechToTextProvider || "native" + ); + const [searchMenuOpen, setSearchMenuOpen] = useState(false); + const searchInputRef = useRef(null); + + const handleSubmit = async (e) => { + e.preventDefault(); + const form = e.target; + const data = { SpeechToTextProvider: selectedProvider }; + const 
formData = new FormData(form);
+
+    for (var [key, value] of formData.entries()) data[key] = value;
+    setSaving(true);
+    const { error } = await System.updateSystem(data);
+
+    if (error) {
+      showToast(`Failed to save preferences: ${error}`, "error");
+    } else {
+      showToast("Speech-to-text preferences saved successfully.", "success");
+    }
+    setSaving(false);
+    setHasChanges(!!error);
+  };
+
+  const updateProviderChoice = (selection) => {
+    setSearchQuery("");
+    setSelectedProvider(selection);
+    setSearchMenuOpen(false);
+    setHasChanges(true);
+  };
+
+  const handleXButton = () => {
+    if (searchQuery.length > 0) {
+      setSearchQuery("");
+      if (searchInputRef.current) searchInputRef.current.value = "";
+    } else {
+      setSearchMenuOpen(!searchMenuOpen);
+    }
+  };
+
+  useEffect(() => {
+    const filtered = PROVIDERS.filter((provider) =>
+      provider.name.toLowerCase().includes(searchQuery.toLowerCase())
+    );
+    setFilteredProviders(filtered);
+  }, [searchQuery, selectedProvider]);
+
+  const selectedProviderObject = PROVIDERS.find(
+    (provider) => provider.value === selectedProvider
+  );
+
+  return (
+    <form onSubmit={handleSubmit} className="flex w-full">
+      <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
+        <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10">
+          <div className="flex gap-x-4 items-center">
+            <p className="text-lg leading-6 font-bold text-white">
+              Speech-to-text Preference
+            </p>
+          </div>
+          <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
+            Here you can specify what kind of speech-to-text provider you would
+            like to use in your AnythingLLM experience. By default, we use your
+            browser's built-in speech recognition, but you may want to use
+            another provider.
+          </p>
+        </div>
+        <div className="w-full justify-end flex">
+          {hasChanges && (
+            <CTAButton
+              onClick={() => handleSubmit()}
+              className="mt-3 mr-0 -mb-14 z-10"
+            >
+              {saving ? "Saving..." : "Save changes"}
+            </CTAButton>
+          )}
+        </div>
+        <div className="text-base font-bold text-white mt-6 mb-4">Provider</div>
+        <div className="relative">
+          {searchMenuOpen && (
+            <div
+              className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10"
+              onClick={() => setSearchMenuOpen(false)}
+            />
+          )}
+          {searchMenuOpen ?
( + <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20"> + <div className="w-full flex flex-col gap-y-1"> + <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]"> + <MagnifyingGlass + size={20} + weight="bold" + className="absolute left-4 z-30 text-white -ml-4 my-2" + /> + <input + type="text" + name="stt-provider-search" + autoComplete="off" + placeholder="Search speech to text providers" + className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium" + onChange={(e) => setSearchQuery(e.target.value)} + ref={searchInputRef} + onKeyDown={(e) => { + if (e.key === "Enter") e.preventDefault(); + }} + /> + <X + size={20} + weight="bold" + className="cursor-pointer text-white hover:text-[#9CA3AF]" + onClick={handleXButton} + /> + </div> + <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4"> + {filteredProviders.map((provider) => ( + <LLMItem + key={provider.name} + name={provider.name} + value={provider.value} + image={provider.logo} + description={provider.description} + checked={selectedProvider === provider.value} + onClick={() => updateProviderChoice(provider.value)} + /> + ))} + </div> + </div> + </div> + ) : ( + <button + className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent hover:border-[#46C8FF] transition-all duration-300" + type="button" + onClick={() => setSearchMenuOpen(true)} + > + <div className="flex gap-x-4 items-center"> + <img + src={selectedProviderObject.logo} + alt={`${selectedProviderObject.name} logo`} + className="w-10 h-10 rounded-md" + /> + <div className="flex flex-col text-left"> + <div className="text-sm font-semibold text-white"> + {selectedProviderObject.name} + </div> + <div className="mt-1 text-xs text-[#D2D5DB]"> + {selectedProviderObject.description} + </div> + </div> + </div> + <CaretUpDown size={24} weight="bold" className="text-white" /> + </button> + )} + </div> + <div + onChange={() => setHasChanges(true)} + className="mt-4 flex flex-col gap-y-1" + > + {selectedProvider && + PROVIDERS.find( + (provider) => provider.value === selectedProvider + )?.options(settings)} + </div> + </div> + </form> + ); +} diff --git a/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx new file mode 100644 index 0000000000000000000000000000000000000000..6b11f1a462e294108a293a1d0f8d9cf920aec186 --- /dev/null +++ b/frontend/src/pages/GeneralSettings/AudioPreference/tts.jsx @@ -0,0 +1,209 @@ +import React, { useEffect, useState, useRef } from "react"; +import System from "@/models/system"; +import showToast from "@/utils/toast"; +import LLMItem from "@/components/LLMSelection/LLMItem"; +import { CaretUpDown, MagnifyingGlass, X } from "@phosphor-icons/react"; +import CTAButton from "@/components/lib/CTAButton"; +import OpenAiLogo from "@/media/llmprovider/openai.png"; +import AnythingLLMIcon from "@/media/logo/anything-llm-icon.png"; +import ElevenLabsIcon from "@/media/ttsproviders/elevenlabs.png"; +import BrowserNative from "@/components/TextToSpeech/BrowserNative"; +import OpenAiTTSOptions from "@/components/TextToSpeech/OpenAiOptions"; +import ElevenLabsTTSOptions 
from "@/components/TextToSpeech/ElevenLabsOptions"; + +const PROVIDERS = [ + { + name: "System native", + value: "native", + logo: AnythingLLMIcon, + options: (settings) => <BrowserNative settings={settings} />, + description: "Uses your browser's built in TTS service if supported.", + }, + { + name: "OpenAI", + value: "openai", + logo: OpenAiLogo, + options: (settings) => <OpenAiTTSOptions settings={settings} />, + description: "Use OpenAI's text to speech voices.", + }, + { + name: "ElevenLabs", + value: "elevenlabs", + logo: ElevenLabsIcon, + options: (settings) => <ElevenLabsTTSOptions settings={settings} />, + description: "Use ElevenLabs's text to speech voices and technology.", + }, +]; + +export default function TextToSpeechProvider({ settings }) { + const [saving, setSaving] = useState(false); + const [hasChanges, setHasChanges] = useState(false); + const [searchQuery, setSearchQuery] = useState(""); + const [filteredProviders, setFilteredProviders] = useState([]); + const [selectedProvider, setSelectedProvider] = useState( + settings?.TextToSpeechProvider || "native" + ); + const [searchMenuOpen, setSearchMenuOpen] = useState(false); + const searchInputRef = useRef(null); + + const handleSubmit = async (e) => { + e.preventDefault(); + const form = e.target; + const data = { TextToSpeechProvider: selectedProvider }; + const formData = new FormData(form); + + for (var [key, value] of formData.entries()) data[key] = value; + const { error } = await System.updateSystem(data); + setSaving(true); + + if (error) { + showToast(`Failed to save preferences: ${error}`, "error"); + } else { + showToast("Text-to-speech preferences saved successfully.", "success"); + } + setSaving(false); + setHasChanges(!!error); + }; + + const updateProviderChoice = (selection) => { + setSearchQuery(""); + setSelectedProvider(selection); + setSearchMenuOpen(false); + setHasChanges(true); + }; + + const handleXButton = () => { + if (searchQuery.length > 0) { + setSearchQuery(""); + if (searchInputRef.current) searchInputRef.current.value = ""; + } else { + setSearchMenuOpen(!searchMenuOpen); + } + }; + + useEffect(() => { + const filtered = PROVIDERS.filter((provider) => + provider.name.toLowerCase().includes(searchQuery.toLowerCase()) + ); + setFilteredProviders(filtered); + }, [searchQuery, selectedProvider]); + + const selectedProviderObject = PROVIDERS.find( + (provider) => provider.value === selectedProvider + ); + + return ( + <form onSubmit={handleSubmit} className="flex w-full"> + <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16"> + <div className="w-full flex flex-col gap-y-1 pb-6 border-white border-b-2 border-opacity-10"> + <div className="flex gap-x-4 items-center"> + <p className="text-lg leading-6 font-bold text-white"> + Text-to-speech Preference + </p> + </div> + <p className="text-xs leading-[18px] font-base text-white text-opacity-60"> + Here you can specify what kind of text-to-speech providers you would + want to use in your AnythingLLM experience. By default, we use the + browser's built in support for these services, but you may want to + use others. + </p> + </div> + <div className="w-full justify-end flex"> + {hasChanges && ( + <CTAButton + onClick={() => handleSubmit()} + className="mt-3 mr-0 -mb-14 z-10" + > + {saving ? "Saving..." 
: "Save changes"} + </CTAButton> + )} + </div> + <div className="text-base font-bold text-white mt-6 mb-4">Provider</div> + <div className="relative"> + {searchMenuOpen && ( + <div + className="fixed top-0 left-0 w-full h-full bg-black bg-opacity-70 backdrop-blur-sm z-10" + onClick={() => setSearchMenuOpen(false)} + /> + )} + {searchMenuOpen ? ( + <div className="absolute top-0 left-0 w-full max-w-[640px] max-h-[310px] overflow-auto white-scrollbar min-h-[64px] bg-[#18181B] rounded-lg flex flex-col justify-between cursor-pointer border-2 border-[#46C8FF] z-20"> + <div className="w-full flex flex-col gap-y-1"> + <div className="flex items-center sticky top-0 border-b border-[#9CA3AF] mx-4 bg-[#18181B]"> + <MagnifyingGlass + size={20} + weight="bold" + className="absolute left-4 z-30 text-white -ml-4 my-2" + /> + <input + type="text" + name="tts-provider-search" + autoComplete="off" + placeholder="Search text to speech providers" + className="-ml-4 my-2 bg-transparent z-20 pl-12 h-[38px] w-full px-4 py-1 text-sm outline-none focus:border-white text-white placeholder:text-white placeholder:font-medium" + onChange={(e) => setSearchQuery(e.target.value)} + ref={searchInputRef} + onKeyDown={(e) => { + if (e.key === "Enter") e.preventDefault(); + }} + /> + <X + size={20} + weight="bold" + className="cursor-pointer text-white hover:text-[#9CA3AF]" + onClick={handleXButton} + /> + </div> + <div className="flex-1 pl-4 pr-2 flex flex-col gap-y-1 overflow-y-auto white-scrollbar pb-4"> + {filteredProviders.map((provider) => ( + <LLMItem + key={provider.name} + name={provider.name} + value={provider.value} + image={provider.logo} + description={provider.description} + checked={selectedProvider === provider.value} + onClick={() => updateProviderChoice(provider.value)} + /> + ))} + </div> + </div> + </div> + ) : ( + <button + className="w-full max-w-[640px] h-[64px] bg-[#18181B] rounded-lg flex items-center p-[14px] justify-between cursor-pointer border-2 border-transparent hover:border-[#46C8FF] transition-all duration-300" + type="button" + onClick={() => setSearchMenuOpen(true)} + > + <div className="flex gap-x-4 items-center"> + <img + src={selectedProviderObject.logo} + alt={`${selectedProviderObject.name} logo`} + className="w-10 h-10 rounded-md" + /> + <div className="flex flex-col text-left"> + <div className="text-sm font-semibold text-white"> + {selectedProviderObject.name} + </div> + <div className="mt-1 text-xs text-[#D2D5DB]"> + {selectedProviderObject.description} + </div> + </div> + </div> + <CaretUpDown size={24} weight="bold" className="text-white" /> + </button> + )} + </div> + <div + onChange={() => setHasChanges(true)} + className="mt-4 flex flex-col gap-y-1" + > + {selectedProvider && + PROVIDERS.find( + (provider) => provider.value === selectedProvider + )?.options(settings)} + </div> + </div> + </form> + ); +} diff --git a/frontend/src/utils/paths.js b/frontend/src/utils/paths.js index 4dc4d52850462d7918dd36e93df7540e5e6dc486..cc2b69eee57fb6f04e49983894ce66e28bec6db1 100644 --- a/frontend/src/utils/paths.js +++ b/frontend/src/utils/paths.js @@ -98,6 +98,9 @@ export default { transcriptionPreference: () => { return "/settings/transcription-preference"; }, + audioPreference: () => { + return "/settings/audio-preference"; + }, embedder: { modelPreference: () => "/settings/embedding-preference", chunkingPreference: () => "/settings/text-splitter-preference", diff --git a/frontend/yarn.lock b/frontend/yarn.lock index 
bd12e9fa348d5f2dd7d2e2d169458dcbfe238aae..93bdc0884fe36388aa0d759d5e3404adc1b4c9c5 100644
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@@ -2841,6 +2841,11 @@ react-smooth@^4.0.0:
     prop-types "^15.8.1"
     react-transition-group "^4.4.5"
 
+react-speech-recognition@^3.10.0:
+  version "3.10.0"
+  resolved "https://registry.yarnpkg.com/react-speech-recognition/-/react-speech-recognition-3.10.0.tgz#7aa43bb28d78b92671864dabba3a70489ccad27b"
+  integrity sha512-EVSr4Ik8l9urwdPiK2r0+ADrLyDDrjB0qBRdUWO+w2MfwEBrj6NuRmy1GD3x7BU/V6/hab0pl8Lupen0zwlJyw==
+
 react-tag-input-component@^2.0.2:
   version "2.0.2"
   resolved "https://registry.yarnpkg.com/react-tag-input-component/-/react-tag-input-component-2.0.2.tgz#f62f013c6a535141dd1c6c3a88858223170150f1"
diff --git a/server/.env.example b/server/.env.example
index 290a07096fcf5fd1cad7390fffb895eb7df1d8f3..5e0233b7b4eb252d4758e11786d462723c00f6cd 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -168,6 +168,19 @@ WHISPER_PROVIDER="local"
 # WHISPER_PROVIDER="openai"
 # OPEN_AI_KEY=sk-xxxxxxxx
 
+###########################################
+######## TTS/STT Model Selection ##########
+###########################################
+TTS_PROVIDER="native"
+
+# TTS_PROVIDER="openai"
+# TTS_OPEN_AI_KEY=sk-example
+# TTS_OPEN_AI_VOICE_MODEL=nova
+
+# TTS_PROVIDER="elevenlabs"
+# TTS_ELEVEN_LABS_KEY=
+# TTS_ELEVEN_LABS_VOICE_MODEL=21m00Tcm4TlvDq8ikWAM # Rachel
+
 # CLOUD DEPLOYMENT VARIRABLES ONLY
 # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
 # STORAGE_DIR= # absolute filesystem path with no trailing slash
diff --git a/server/endpoints/workspaces.js b/server/endpoints/workspaces.js
index c22c679a0c834080218bb6b2813e883a3f888490..81cbd615418ebd899902a8d85e64420d20f68794 100644
--- a/server/endpoints/workspaces.js
+++ b/server/endpoints/workspaces.js
@@ -1,6 +1,11 @@
 const path = require("path");
 const fs = require("fs");
-const { reqBody, multiUserMode, userFromSession } = require("../utils/http");
+const {
+  reqBody,
+  multiUserMode,
+  userFromSession,
+  safeJsonParse,
+} = require("../utils/http");
 const { normalizePath } = require("../utils/files");
 const { Workspace } = require("../models/workspace");
 const { Document } = require("../models/documents");
@@ -25,6 +30,7 @@ const {
   determineWorkspacePfpFilepath,
   fetchPfp,
 } = require("../utils/files/pfp");
+const { getTTSProvider } = require("../utils/TextToSpeech");
 
 function workspaceEndpoints(app) {
   if (!app) return;
@@ -506,6 +512,51 @@
     }
   );
 
+  // In-memory cache of generated TTS audio, keyed per workspace chat.
+  const responseCache = new Map();
+
+  app.get(
+    "/workspace/:slug/tts/:chatId",
+    [validatedRequest, flexUserRoleValid([ROLES.all]), validWorkspaceSlug],
+    async function (request, response) {
+      try {
+        const { chatId } = request.params;
+        const workspace = response.locals.workspace;
+        const cacheKey = `${workspace.slug}:${chatId}`;
+        const wsChat = await WorkspaceChats.get({
+          id: Number(chatId),
+          workspaceId: workspace.id,
+        });
+
+        const cachedResponse = responseCache.get(cacheKey);
+        if (cachedResponse) {
+          response.writeHead(200, {
+            "Content-Type": cachedResponse.mime || "audio/mpeg",
+          });
+          response.end(cachedResponse.buffer);
+          return;
+        }
+
+        const text = safeJsonParse(wsChat?.response, null)?.text;
+        if (!text) return response.sendStatus(204).end();
+
+        const TTSProvider = getTTSProvider();
+        const buffer = await TTSProvider.ttsBuffer(text);
+        if (buffer === null) return response.sendStatus(204).end();
+
+        responseCache.set(cacheKey, { buffer, mime: "audio/mpeg" });
+        response.writeHead(200, {
+          "Content-Type": "audio/mpeg",
+        });
+        response.end(buffer);
+        return;
+      } catch (error) {
+        console.error("Error processing the TTS request:", error);
+        response.status(500).json({ message: "TTS could not be completed" });
+      }
+    }
+  );
+
   app.get(
     "/workspace/:slug/pfp",
     [validatedRequest, flexUserRoleValid([ROLES.all])],
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index 904c448d55bd7492dec3b91974f5b63b60f3c074..248ca8cd79b1de7b9f26fa583ead72fa4d1c348c 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -131,6 +131,17 @@
     // --------------------------------------------------------
     WhisperProvider: process.env.WHISPER_PROVIDER || "local",
 
+    // --------------------------------------------------------
+    // TTS/STT Selection Settings & Configs
+    // - Supported 3rd parties are OpenAI and ElevenLabs; the default is the browser's built-in synthesis.
+    // --------------------------------------------------------
+    TextToSpeechProvider: process.env.TTS_PROVIDER || "native",
+    TTSOpenAIKey: !!process.env.TTS_OPEN_AI_KEY,
+    TTSOpenAIVoiceModel: process.env.TTS_OPEN_AI_VOICE_MODEL,
+    // Eleven Labs TTS
+    TTSElevenLabsKey: !!process.env.TTS_ELEVEN_LABS_KEY,
+    TTSElevenLabsVoiceModel: process.env.TTS_ELEVEN_LABS_VOICE_MODEL,
+
     // --------------------------------------------------------
     // Agent Settings & Configs
     // --------------------------------------------------------
diff --git a/server/package.json b/server/package.json
index edee71b023c0532361f71077562e193d1aa662e6..73b947c461e5fb2139b03dafa3a87fc41fba0e34 100644
--- a/server/package.json
+++ b/server/package.json
@@ -44,6 +44,7 @@
     "cohere-ai": "^7.9.5",
     "cors": "^2.8.5",
     "dotenv": "^16.0.3",
+    "elevenlabs": "^0.5.0",
     "express": "^4.18.2",
     "express-ws": "^5.0.2",
     "extract-json-from-string": "^1.0.1",
diff --git a/server/utils/TextToSpeech/elevenLabs/index.js b/server/utils/TextToSpeech/elevenLabs/index.js
new file mode 100644
index 0000000000000000000000000000000000000000..e3d25f3ae9be5deb936435ae16eb71b6b1c7bfbc
--- /dev/null
+++ b/server/utils/TextToSpeech/elevenLabs/index.js
@@ -0,0 +1,54 @@
+const { ElevenLabsClient } = require("elevenlabs");
+
+class ElevenLabsTTS {
+  constructor() {
+    if (!process.env.TTS_ELEVEN_LABS_KEY)
+      throw new Error("No ElevenLabs API key was set.");
+    this.elevenLabs = new ElevenLabsClient({
+      apiKey: process.env.TTS_ELEVEN_LABS_KEY,
+    });
+
+    // Rachel as default voice
+    // https://api.elevenlabs.io/v1/voices
+    this.voiceId =
+      process.env.TTS_ELEVEN_LABS_VOICE_MODEL ?? "21m00Tcm4TlvDq8ikWAM";
+    this.modelId = "eleven_multilingual_v2";
+  }
+
+  static async voices(apiKey = null) {
+    try {
+      const client = new ElevenLabsClient({
+        apiKey: apiKey ?? process.env.TTS_ELEVEN_LABS_KEY ?? null,
+      });
+      return (await client.voices.getAll())?.voices ??
[]; + } catch {} + return []; + } + + #stream2buffer(stream) { + return new Promise((resolve, reject) => { + const _buf = []; + stream.on("data", (chunk) => _buf.push(chunk)); + stream.on("end", () => resolve(Buffer.concat(_buf))); + stream.on("error", (err) => reject(err)); + }); + } + + async ttsBuffer(textInput) { + try { + const audio = await this.elevenLabs.generate({ + voice: this.voiceId, + text: textInput, + model_id: "eleven_multilingual_v2", + }); + return Buffer.from(await this.#stream2buffer(audio)); + } catch (e) { + console.error(e); + } + return null; + } +} + +module.exports = { + ElevenLabsTTS, +}; diff --git a/server/utils/TextToSpeech/index.js b/server/utils/TextToSpeech/index.js new file mode 100644 index 0000000000000000000000000000000000000000..155fc95405af72e109019b5eed0561b86677db1a --- /dev/null +++ b/server/utils/TextToSpeech/index.js @@ -0,0 +1,15 @@ +function getTTSProvider() { + const provider = process.env.TTS_PROVIDER || "openai"; + switch (provider) { + case "openai": + const { OpenAiTTS } = require("./openAi"); + return new OpenAiTTS(); + case "elevenlabs": + const { ElevenLabsTTS } = require("./elevenLabs"); + return new ElevenLabsTTS(); + default: + throw new Error("ENV: No TTS_PROVIDER value found in environment!"); + } +} + +module.exports = { getTTSProvider }; diff --git a/server/utils/TextToSpeech/openAi/index.js b/server/utils/TextToSpeech/openAi/index.js new file mode 100644 index 0000000000000000000000000000000000000000..3c5b4840d2816afe0fd8c4b64e4d7ae2d353db82 --- /dev/null +++ b/server/utils/TextToSpeech/openAi/index.js @@ -0,0 +1,29 @@ +class OpenAiTTS { + constructor() { + if (!process.env.TTS_OPEN_AI_KEY) + throw new Error("No OpenAI API key was set."); + const { OpenAI: OpenAIApi } = require("openai"); + this.openai = new OpenAIApi({ + apiKey: process.env.TTS_OPEN_AI_KEY, + }); + this.voice = process.env.TTS_OPEN_AI_VOICE_MODEL ?? 
"alloy"; + } + + async ttsBuffer(textInput) { + try { + const result = await this.openai.audio.speech.create({ + model: "tts-1", + voice: this.voice, + input: textInput, + }); + return Buffer.from(await result.arrayBuffer()); + } catch (e) { + console.error(e); + } + return null; + } +} + +module.exports = { + OpenAiTTS, +}; diff --git a/server/utils/helpers/customModels.js b/server/utils/helpers/customModels.js index b7aae93bef517e39ec509fcc12d6827474c947d1..caf5a77c7555f981e94495bb6fc19dab54357e94 100644 --- a/server/utils/helpers/customModels.js +++ b/server/utils/helpers/customModels.js @@ -4,6 +4,7 @@ const { } = require("../AiProviders/openRouter"); const { perplexityModels } = require("../AiProviders/perplexity"); const { togetherAiModels } = require("../AiProviders/togetherAi"); +const { ElevenLabsTTS } = require("../TextToSpeech/elevenLabs"); const SUPPORT_CUSTOM_MODELS = [ "openai", "localai", @@ -15,6 +16,7 @@ const SUPPORT_CUSTOM_MODELS = [ "openrouter", "lmstudio", "koboldcpp", + "elevenlabs-tts", ]; async function getCustomModels(provider = "", apiKey = null, basePath = null) { @@ -42,6 +44,8 @@ async function getCustomModels(provider = "", apiKey = null, basePath = null) { return await getLMStudioModels(basePath); case "koboldcpp": return await getKoboldCPPModels(basePath); + case "elevenlabs-tts": + return await getElevenLabsModels(apiKey); default: return { models: [], error: "Invalid provider for custom models" }; } @@ -321,6 +325,32 @@ function nativeLLMModels() { return { models: files, error: null }; } +async function getElevenLabsModels(apiKey = null) { + const models = (await ElevenLabsTTS.voices(apiKey)).map((model) => { + return { + id: model.voice_id, + organization: model.category, + name: model.name, + }; + }); + + if (models.length === 0) { + return { + models: [ + { + id: "21m00Tcm4TlvDq8ikWAM", + organization: "premade", + name: "Rachel (default)", + }, + ], + error: null, + }; + } + + if (models.length > 0 && !!apiKey) process.env.TTS_ELEVEN_LABS_KEY = apiKey; + return { models, error: null }; +} + module.exports = { getCustomModels, }; diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index 947fbc62492e51de25bf21e02bd8fd7b60b49021..e2b1d2e1c97b33e80d8c5df58a7200a118f5dedf 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -366,6 +366,32 @@ const KEY_MAPPING = { envKey: "AGENT_SERPER_DEV_KEY", checks: [], }, + + // TTS/STT Integration ENVS + TextToSpeechProvider: { + envKey: "TTS_PROVIDER", + checks: [supportedTTSProvider], + }, + + // TTS OpenAI + TTSOpenAIKey: { + envKey: "TTS_OPEN_AI_KEY", + checks: [validOpenAIKey], + }, + TTSOpenAIVoiceModel: { + envKey: "TTS_OPEN_AI_VOICE_MODEL", + checks: [], + }, + + // TTS ElevenLabs + TTSElevenLabsKey: { + envKey: "TTS_ELEVEN_LABS_KEY", + checks: [isNotEmpty], + }, + TTSElevenLabsVoiceModel: { + envKey: "TTS_ELEVEN_LABS_VOICE_MODEL", + checks: [], + }, }; function isNotEmpty(input = "") { @@ -419,6 +445,11 @@ function validOllamaLLMBasePath(input = "") { } } +function supportedTTSProvider(input = "") { + const validSelection = ["native", "openai", "elevenlabs"].includes(input); + return validSelection ? 
null : `${input} is not a valid TTS provider.`; +} + function supportedLLM(input = "") { const validSelection = [ "openai", diff --git a/server/yarn.lock b/server/yarn.lock index 5edd09a351fbc0b1aebfa9c35db1d60574c4205b..9e4f184d5f0d908e70e7b361504ee98867b1409d 100644 --- a/server/yarn.lock +++ b/server/yarn.lock @@ -1901,6 +1901,11 @@ combined-stream@^1.0.8: dependencies: delayed-stream "~1.0.0" +command-exists@^1.2.9: + version "1.2.9" + resolved "https://registry.yarnpkg.com/command-exists/-/command-exists-1.2.9.tgz#c50725af3808c8ab0260fd60b01fbfa25b954f69" + integrity sha512-LTQ/SGc+s0Xc0Fu5WaKnR0YiygZkm9eKFvyS+fRsU7/ZWFF8ykFM6Pc9aCVf1+xasOOZpO3BAVgVrKvsqKHV7w== + command-line-args@5.2.1, command-line-args@^5.2.1: version "5.2.1" resolved "https://registry.yarnpkg.com/command-line-args/-/command-line-args-5.2.1.tgz#c44c32e437a57d7c51157696893c5909e9cec42e" @@ -2255,6 +2260,18 @@ ee-first@1.1.1: resolved "https://registry.yarnpkg.com/ee-first/-/ee-first-1.1.1.tgz#590c61156b0ae2f4f0255732a158b266bc56b21d" integrity sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow== +elevenlabs@^0.5.0: + version "0.5.0" + resolved "https://registry.yarnpkg.com/elevenlabs/-/elevenlabs-0.5.0.tgz#07eb1a943b0ab99b925875bd5c57833a3a024e58" + integrity sha512-jfex4ecuWIlyAUuMrMJAJNa5MLziqYQOCDw4ZYuoc9PCYLxtHwaYBWpZoDhnYMcceLI7rRRvmbLMcT9HlVMfHA== + dependencies: + command-exists "^1.2.9" + execa "^5.1.1" + form-data "4.0.0" + node-fetch "2.7.0" + qs "6.11.2" + url-join "4.0.1" + emoji-regex@^10.2.1: version "10.3.0" resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-10.3.0.tgz#76998b9268409eb3dae3de989254d456e70cfe23" @@ -2605,6 +2622,21 @@ eventemitter3@^4.0.4: resolved "https://registry.yarnpkg.com/eventemitter3/-/eventemitter3-4.0.7.tgz#2de9b68f6528d5644ef5c59526a1b4a07306169f" integrity sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw== +execa@^5.1.1: + version "5.1.1" + resolved "https://registry.yarnpkg.com/execa/-/execa-5.1.1.tgz#f80ad9cbf4298f7bd1d4c9555c21e93741c411dd" + integrity sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg== + dependencies: + cross-spawn "^7.0.3" + get-stream "^6.0.0" + human-signals "^2.1.0" + is-stream "^2.0.0" + merge-stream "^2.0.0" + npm-run-path "^4.0.1" + onetime "^5.1.2" + signal-exit "^3.0.3" + strip-final-newline "^2.0.0" + expand-template@^2.0.3: version "2.0.3" resolved "https://registry.yarnpkg.com/expand-template/-/expand-template-2.0.3.tgz#6e14b3fcee0f3a6340ecb57d2e8918692052a47c" @@ -3024,6 +3056,11 @@ get-stream@^5.1.0: dependencies: pump "^3.0.0" +get-stream@^6.0.0: + version "6.0.1" + resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-6.0.1.tgz#a262d8eef67aced57c2852ad6167526a43cbf7b7" + integrity sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg== + get-symbol-description@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/get-symbol-description/-/get-symbol-description-1.0.2.tgz#533744d5aa20aca4e079c8e5daf7fd44202821f5" @@ -3297,6 +3334,11 @@ https-proxy-agent@^7.0.0: agent-base "^7.0.2" debug "4" +human-signals@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" + integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== + humanize-ms@^1.2.1: version "1.2.1" resolved 
"https://registry.yarnpkg.com/humanize-ms/-/humanize-ms-1.2.1.tgz#c46e3159a293f6b896da29316d8b6fe8bb79bbed" @@ -4092,6 +4134,11 @@ merge-descriptors@1.0.1: resolved "https://registry.yarnpkg.com/merge-descriptors/-/merge-descriptors-1.0.1.tgz#b00aaa556dd8b44568150ec9d1b953f3f90cbb61" integrity sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w== +merge-stream@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/merge-stream/-/merge-stream-2.0.0.tgz#52823629a14dd00c9770fb6ad47dc6310f2c1f60" + integrity sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w== + methods@~1.1.2: version "1.1.2" resolved "https://registry.yarnpkg.com/methods/-/methods-1.1.2.tgz#5529a4d67654134edcc5266656835b0f851afcee" @@ -4455,6 +4502,13 @@ normalize-path@^3.0.0, normalize-path@~3.0.0: resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65" integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA== +npm-run-path@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea" + integrity sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw== + dependencies: + path-key "^3.0.0" + npmlog@^5.0.1: version "5.0.1" resolved "https://registry.yarnpkg.com/npmlog/-/npmlog-5.0.1.tgz#f06678e80e29419ad67ab964e0fa69959c1eb8b0" @@ -4593,7 +4647,7 @@ one-time@^1.0.0: dependencies: fn.name "1.x.x" -onetime@^5.1.0: +onetime@^5.1.0, onetime@^5.1.2: version "5.1.2" resolved "https://registry.yarnpkg.com/onetime/-/onetime-5.1.2.tgz#d0e96ebb56b07476df1dd9c4806e5237985ca45e" integrity sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg== @@ -4774,7 +4828,7 @@ path-is-absolute@^1.0.0: resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f" integrity sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg== -path-key@^3.1.0: +path-key@^3.0.0, path-key@^3.1.0: version "3.1.1" resolved "https://registry.yarnpkg.com/path-key/-/path-key-3.1.1.tgz#581f6ade658cbba65a0d3380de7753295054f375" integrity sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q== @@ -5322,7 +5376,7 @@ side-channel@^1.0.4, side-channel@^1.0.6: get-intrinsic "^1.2.4" object-inspect "^1.13.1" -signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.7: +signal-exit@^3.0.0, signal-exit@^3.0.2, signal-exit@^3.0.3, signal-exit@^3.0.7: version "3.0.7" resolved "https://registry.yarnpkg.com/signal-exit/-/signal-exit-3.0.7.tgz#a9a1767f8af84155114eaabd73f99273c8f59ad9" integrity sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ== @@ -5559,6 +5613,11 @@ strip-ansi@^7.0.1, strip-ansi@^7.1.0: dependencies: ansi-regex "^6.0.1" +strip-final-newline@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/strip-final-newline/-/strip-final-newline-2.0.0.tgz#89b852fb2fcbe936f6f4b3187afb0a12c1ab58ad" + integrity sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA== + strip-json-comments@^3.1.1: version "3.1.1" resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-3.1.1.tgz#31f1281b3832630434831c310c01cccda8cbe006"