Text to speech (#5062)

* Add tts UI

* Add tts backend

* Add description to eleven labs credentials

* Fix issue with fetching eleven labs voices

* Fix issue with text to speech tab not showing correct saved voice

* Add option to autoplay tts audio after prediction completes

* Fix crash issue when first changing tts provider

* Set up streaming response for text to speech audio

* Update controllers - fix issue with sse client getting removed before tts events are sent

* Use existing sse streamer to stream tts audio before sse client is removed

* Add tts sse to redis publisher

* Fix issues with TTS - openai voices, streaming audio, rate limiting, speed of speech

* Refactor

* Refactor TTS - fix issues with tts loading and stop audio buttons

* Abort TTS SSE when clicking the stop button

* Update SSE handling for TTS

* Fix issue with test voice feature

* Fix issue with tts voices not loading

* Update generate tts endpoint and its usage in internal chat

* Whitelist tts generate endpoint

* Refactor Text-to-Speech Provider Selection and Enhance UI Components

- Updated the text-to-speech controller to select the active provider based on status instead of the first available provider
- Added audio waveform controls and test audio functionality in the TextToSpeech component, allowing users to play and pause test audio
- Integrated Autocomplete for voice selection in the TextToSpeech component
- Implemented TTS action management in ChatMessage to prevent auto-scrolling during TTS actions

* Implemented stopAllTTS function calls to halt existing TTS audio before playing new audio or starting a new TTS stream

* Updated the condition for enabling TTS providers to exclude the 'none' provider, ensuring only valid providers are considered for text-to-speech functionality.

* Remove unnecessary code

* Add ability to abort audio streaming in TTS and release lock on chat input

* Remove logger

* Fix tts audio not playing when clicking speaker button

* update

* TTS abort controller

* Fix abort not working for TTS autoplay

* Send metadata event when aborting autoplay TTS

* Fix UI issue

* Remove elevenlabs sdk from root package.json

* Remove redundant condition for tts autoplay in chatflow

---------

Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
Ilango
2025-10-02 16:49:06 +05:30
committed by GitHub
parent 8d0a198e2f
commit 9b8fee3d8f
34 changed files with 41358 additions and 39056 deletions
+3
View File
@@ -441,6 +441,9 @@ export interface IServerSideEventStreamer {
streamAbortEvent(chatId: string): void
streamEndEvent(chatId: string): void
streamUsageMetadataEvent(chatId: string, data: any): void
streamTTSStartEvent(chatId: string, chatMessageId: string, format: string): void
streamTTSDataEvent(chatId: string, chatMessageId: string, audioChunk: string): void
streamTTSEndEvent(chatId: string, chatMessageId: string): void
}
export enum FollowUpPromptProvider {
+1
View File
@@ -7,6 +7,7 @@ dotenv.config({ path: envPath, override: true })
export * from './Interface'
export * from './utils'
export * from './speechToText'
export * from './textToSpeech'
export * from './storageUtils'
export * from './handler'
export * from '../evaluation/EvaluationRunner'
+240
View File
@@ -0,0 +1,240 @@
import { ICommonObject } from './Interface'
import { getCredentialData } from './utils'
import OpenAI from 'openai'
import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js'
import { Readable } from 'node:stream'
import type { ReadableStream } from 'node:stream/web'
/**
 * Identifiers of the text-to-speech providers handled in this module.
 *
 * The values are matched against `textToSpeechConfig.name` when synthesising audio and
 * against the `provider` argument when listing voices. `as const` keeps the literal
 * string types and marks the properties readonly at the type level.
 */
const TextToSpeechType = {
    OPENAI_TTS: 'openai',
    ELEVEN_LABS_TTS: 'elevenlabs'
} as const
/**
 * Converts `text` to speech with the configured provider and streams the audio to the caller.
 *
 * `onStart` is invoked with the audio format ('mp3') before the provider request is issued.
 * The provider's audio stream is then handed to `processStreamWithRateLimit`, which delivers
 * it through `onChunk` and calls `onEnd` once everything has been delivered.
 *
 * @param text - Text to synthesise.
 * @param textToSpeechConfig - Provider settings: `name` selects the provider, `credentialId`
 *   the credential to load and `voice` the voice to use (falls back to 'alloy' for OpenAI and
 *   '21m00Tcm4TlvDq8ikWAM' for ElevenLabs).
 * @param options - Forwarded unchanged to `getCredentialData`.
 * @param abortController - Aborting it rejects the returned promise; its signal is also passed
 *   to the provider request.
 * @param onStart - Called with the audio format before the provider is contacted.
 * @param onChunk - Called for every audio chunk.
 * @param onEnd - Called after the last chunk.
 * @returns A promise that resolves when streaming has finished and rejects when the request is
 *   aborted, no TTS configuration is given, the provider is unknown, or the provider call fails.
 */
export const convertTextToSpeechStream = async (
    text: string,
    textToSpeechConfig: ICommonObject,
    options: ICommonObject,
    abortController: AbortController,
    onStart: (format: string) => void,
    onChunk: (chunk: Buffer) => void,
    onEnd: () => void
): Promise<void> => {
    return new Promise<void>((resolve, reject) => {
        const { signal } = abortController
        let streamDestroyed = false

        // Nothing to start if the caller has already aborted.
        if (signal.aborted) {
            reject(new Error('TTS generation aborted'))
            return
        }

        const onAbort = () => {
            // Once the stream helper has torn the stream down it reports the abort itself.
            if (!streamDestroyed) {
                fail(new Error('TTS generation aborted'))
            }
        }
        // The signal belongs to the caller and may outlive this call, so the listener is
        // removed as soon as the promise settles instead of being left attached.
        const detachAbortListener = () => signal.removeEventListener('abort', onAbort)
        const succeed = () => {
            detachAbortListener()
            resolve()
        }
        const fail = (error: unknown) => {
            detachAbortListener()
            reject(error)
        }
        const markStreamDestroyed = () => {
            streamDestroyed = true
        }

        const processStream = async () => {
            try {
                if (!textToSpeechConfig) {
                    throw new Error('Text to speech is not selected. Please configure TTS in the chatflow.')
                }

                const credentialId = textToSpeechConfig.credentialId as string
                const credentialData = await getCredentialData(credentialId ?? '', options)

                switch (textToSpeechConfig.name) {
                    case TextToSpeechType.OPENAI_TTS: {
                        onStart('mp3')

                        const openai = new OpenAI({
                            apiKey: credentialData.openAIApiKey
                        })

                        const response = await openai.audio.speech.create(
                            {
                                model: 'gpt-4o-mini-tts',
                                voice: (textToSpeechConfig.voice || 'alloy') as
                                    | 'alloy'
                                    | 'ash'
                                    | 'ballad'
                                    | 'coral'
                                    | 'echo'
                                    | 'fable'
                                    | 'nova'
                                    | 'onyx'
                                    | 'sage'
                                    | 'shimmer',
                                input: text,
                                response_format: 'mp3'
                            },
                            {
                                signal
                            }
                        )

                        // NOTE(review): assumes the SDK exposes the body as a Node.js Readable — confirm for the installed SDK version.
                        const stream = response.body as unknown as Readable
                        if (!stream) {
                            throw new Error('Failed to get response stream')
                        }

                        // 640-byte chunks, 20 ms between chunks.
                        await processStreamWithRateLimit(stream, onChunk, onEnd, succeed, fail, 640, 20, abortController, markStreamDestroyed)
                        break
                    }

                    case TextToSpeechType.ELEVEN_LABS_TTS: {
                        onStart('mp3')

                        const client = new ElevenLabsClient({
                            apiKey: credentialData.elevenLabsApiKey
                        })

                        const response = await client.textToSpeech.stream(
                            textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM',
                            {
                                text: text,
                                modelId: 'eleven_multilingual_v2'
                            },
                            { abortSignal: signal }
                        )

                        const stream = Readable.fromWeb(response as unknown as ReadableStream)
                        if (!stream) {
                            throw new Error('Failed to get response stream')
                        }

                        // 640-byte chunks, 40 ms between chunks.
                        await processStreamWithRateLimit(stream, onChunk, onEnd, succeed, fail, 640, 40, abortController, markStreamDestroyed)
                        break
                    }

                    default:
                        // Without this branch an unknown provider would leave the promise pending forever.
                        throw new Error(`Unsupported TTS provider: ${textToSpeechConfig.name}`)
                }
            } catch (error) {
                fail(error)
            }
        }

        signal.addEventListener('abort', onAbort)

        // processStream reports every failure through fail(), so it never rejects itself.
        void processStream()
    })
}
/**
 * Re-chunks an audio stream into fixed-size pieces and delivers them at a limited rate.
 *
 * Incoming data is appended to an in-memory buffer. A polling loop cuts `targetChunkSize`-byte
 * chunks off the front, passes each to `onChunk` and waits `rateLimitMs` between chunks. Once the
 * stream has ended, the remaining bytes are delivered as one final (possibly shorter) chunk,
 * then `onEnd` and `resolve` are called.
 *
 * The function returns as soon as the listeners are attached; the outcome is reported through
 * `resolve` / `reject`, not through the returned promise.
 *
 * @param stream - Source of audio data.
 * @param onChunk - Receives each chunk.
 * @param onEnd - Called once after the final chunk.
 * @param resolve - Called after `onEnd` when streaming completed.
 * @param reject - Called with the error on abort, on a stream error or when a callback throws.
 * @param targetChunkSize - Size in bytes of every chunk except the last.
 * @param rateLimitMs - Delay after each full chunk; also the polling interval while waiting for data.
 * @param abortController - Aborting destroys the stream and rejects with 'TTS generation aborted'.
 * @param onStreamDestroy - Optional notification that the stream was torn down because of an abort.
 */
const processStreamWithRateLimit = async (
    stream: Readable,
    onChunk: (chunk: Buffer) => void,
    onEnd: () => void,
    resolve: () => void,
    reject: (error: any) => void,
    targetChunkSize: number = 640,
    rateLimitMs: number = 20,
    abortController: AbortController,
    onStreamDestroy?: () => void
) => {
    const { signal } = abortController
    let pending: Buffer = Buffer.alloc(0)
    let isEnded = false
    // Set once the outcome is known (completed, failed or aborted) so the polling loop exits.
    let isStopped = false

    const stop = () => {
        isStopped = true
        // The signal belongs to the caller; do not leave our listener attached to it.
        signal.removeEventListener('abort', onAbort)
    }

    const onAbort = () => {
        stop()
        if (!stream.destroyed) {
            stream.destroy()
        }
        onStreamDestroy?.()
        reject(new Error('TTS generation aborted'))
    }

    const processChunks = async () => {
        while (!isStopped && (!isEnded || pending.length > 0)) {
            // Covers a signal that was already aborted before the listener was attached.
            if (signal.aborted) {
                onAbort()
                return
            }

            if (pending.length >= targetChunkSize) {
                const chunk = pending.subarray(0, targetChunkSize)
                pending = pending.subarray(targetChunkSize)
                onChunk(chunk)
                await sleep(rateLimitMs)
            } else if (isEnded) {
                // Stream finished: flush the remainder (non-empty here because of the loop condition).
                onChunk(pending)
                pending = Buffer.alloc(0)
            } else {
                // Not enough data yet; wait for more to arrive.
                await sleep(rateLimitMs)
            }
        }

        // A failure or abort has already been reported; do not signal completion on top of it.
        if (isStopped) {
            return
        }
        stop()
        onEnd()
        resolve()
    }

    stream.on('data', (chunk) => {
        if (!signal.aborted) {
            pending = Buffer.concat([pending, Buffer.from(chunk)])
        }
    })

    stream.on('end', () => {
        isEnded = true
    })

    stream.on('error', (error) => {
        // No 'end' follows an error, so without stopping here the loop would poll forever.
        stop()
        reject(error)
    })

    signal.addEventListener('abort', onAbort)

    processChunks().catch((error) => {
        stop()
        reject(error)
    })
}

/** Resolves after `ms` milliseconds. */
const sleep = (ms: number): Promise<void> => {
    return new Promise((resolve) => setTimeout(resolve, ms))
}
/**
 * Lists the voices available for a text-to-speech provider.
 *
 * For OpenAI a fixed list of `{ id, name }` entries is returned and no credential is needed.
 * For ElevenLabs the credential is loaded and the provider is queried for up to 100 default,
 * premade voices, returned as `{ id, name, category }`.
 *
 * @param provider - Provider identifier ('openai' or 'elevenlabs').
 * @param credentialId - Credential to load for providers that need an API key.
 * @param options - Forwarded unchanged to `getCredentialData`.
 * @returns The list of voices for the provider.
 * @throws Error when the provider is not supported.
 */
export const getVoices = async (provider: string, credentialId: string, options: ICommonObject) => {
    switch (provider) {
        case TextToSpeechType.OPENAI_TTS:
            return [
                { id: 'alloy', name: 'Alloy' },
                { id: 'ash', name: 'Ash' },
                { id: 'ballad', name: 'Ballad' },
                { id: 'coral', name: 'Coral' },
                { id: 'echo', name: 'Echo' },
                { id: 'fable', name: 'Fable' },
                { id: 'nova', name: 'Nova' },
                { id: 'onyx', name: 'Onyx' },
                { id: 'sage', name: 'Sage' },
                { id: 'shimmer', name: 'Shimmer' }
            ]

        case TextToSpeechType.ELEVEN_LABS_TTS: {
            // Only this branch uses the credential, so it is loaded here rather than up front;
            // the static OpenAI list and the unsupported-provider error no longer depend on it.
            const credentialData = await getCredentialData(credentialId ?? '', options)

            const client = new ElevenLabsClient({
                apiKey: credentialData.elevenLabsApiKey
            })

            const voices = await client.voices.search({
                pageSize: 100,
                voiceType: 'default',
                category: 'premade'
            })

            return voices.voices.map((voice) => ({
                id: voice.voiceId,
                name: voice.name,
                category: voice.category
            }))
        }

        default:
            throw new Error(`Unsupported TTS provider: ${provider}`)
    }
}