Text to speech (#5062)

* Add tts UI * Add tts backend * Add description to eleven labs credentials * Fix issue with fetching eleven labs voices * Fix issue with text to speech tab not showing correct saved voice * Add option to autoplay tts audio after prediction completes * Fix crash issue when first changing tts provider * Set up streaming response for text to speech audio * Update controllers - fix issue with sse client getting removed before tts events are sent * Use existing sse streamer to stream tts audio before sse client is removed * Add tts sse to redis publisher * Fix issues with TTS - openai voices, streaming audio, rate limiting, speed of speech * Refactor * Refactor TTS - fix issues with tts loading and stop audio buttons * Abort TTS SSE when clicking the stop button * Update SSE handling for TTS * Fix issue with test voice feature * Fix issue with tts voices not loading * Update generate tts endpoint and its usage in internal chat * Whitelist tts generate endpoint * Refactor Text-to-Speech Provider Selection and Enhance UI Components - Updated the text-to-speech controller to select the active provider based on status instead of the first available provider - Added audio waveform controls and test audio functionality in the TextToSpeech component, allowing users to play and pause test audio - Integrated Autocomplete for voice selection in the TextToSpeech component - Implemented TTS action management in ChatMessage to prevent auto-scrolling during TTS actions * - Implemented stopAllTTS function calls to halt existing TTS audio before playing new audio or starting a new TTS stream * Updated the condition for enabling TTS providers to exclude the 'none' provider, ensuring only valid providers are considered for text-to-speech functionality. 
* Remove unnecessary code * Add ability to abort audio streaming in TTS and release lock on chat input * Remove logger * Fix tts audio not playing when clicking speaker button * update * TTS abort controller * Fix abort not working for TTS autoplay * Send metadata event when aborting autoplay TTS * Fix UI issue * Remove elevenlabs sdk from root package.json * Remove redundant condition for tts autoplay in chatflow --------- Co-authored-by: Henry <hzj94@hotmail.com>
2026-06-28 13:00:56 +03:00 · 2025-10-02 16:49:06 +05:30
parent 8d0a198e2f
commit 9b8fee3d8f
34 changed files with 41358 additions and 39056 deletions
@@ -441,6 +441,9 @@ export interface IServerSideEventStreamer {
    streamAbortEvent(chatId: string): void
    streamEndEvent(chatId: string): void
    streamUsageMetadataEvent(chatId: string, data: any): void
+    streamTTSStartEvent(chatId: string, chatMessageId: string, format: string): void
+    streamTTSDataEvent(chatId: string, chatMessageId: string, audioChunk: string): void
+    streamTTSEndEvent(chatId: string, chatMessageId: string): void
 }

 export enum FollowUpPromptProvider {
@@ -7,6 +7,7 @@ dotenv.config({ path: envPath, override: true })
 export * from './Interface'
 export * from './utils'
 export * from './speechToText'
+export * from './textToSpeech'
 export * from './storageUtils'
 export * from './handler'
 export * from '../evaluation/EvaluationRunner'
@@ -0,0 +1,240 @@
+import { ICommonObject } from './Interface'
+import { getCredentialData } from './utils'
+import OpenAI from 'openai'
+import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js'
+import { Readable } from 'node:stream'
+import type { ReadableStream } from 'node:stream/web'
+
+/**
+ * Identifiers of the supported text-to-speech providers.
+ *
+ * These values are compared against `textToSpeechConfig.name` and the `provider`
+ * argument of `getVoices`. `as const` keeps them as read-only literal types so they
+ * cannot be reassigned at runtime by accident.
+ */
+const TextToSpeechType = {
+    OPENAI_TTS: 'openai',
+    ELEVEN_LABS_TTS: 'elevenlabs'
+} as const
+
+/**
+ * Synthesize `text` with the configured provider and deliver the audio through callbacks.
+ *
+ * `onStart` is invoked with the audio format ('mp3') before the provider request is made.
+ * The provider's response stream is then handed to `processStreamWithRateLimit`, which
+ * forwards the audio through `onChunk` / `onEnd` and settles the returned promise.
+ *
+ * @param text - Text to synthesize.
+ * @param textToSpeechConfig - Provider settings; `name`, `credentialId` and `voice` are read here.
+ * @param options - Passed through to `getCredentialData`.
+ * @param abortController - Its signal is forwarded to the provider request; aborting rejects the promise.
+ * @param onStart - Receives the audio format.
+ * @param onChunk - Receives each audio chunk.
+ * @param onEnd - Called after the last chunk has been delivered.
+ * @returns A promise that resolves once the stream processor reports completion.
+ * @throws Rejects with 'TTS generation aborted' on abort, with a configuration error when no
+ *         TTS config is supplied or the provider name is not supported, or with whatever error
+ *         the credential lookup, the provider SDK or the audio stream raises.
+ */
+export const convertTextToSpeechStream = async (
+    text: string,
+    textToSpeechConfig: ICommonObject,
+    options: ICommonObject,
+    abortController: AbortController,
+    onStart: (format: string) => void,
+    onChunk: (chunk: Buffer) => void,
+    onEnd: () => void
+): Promise<void> => {
+    return new Promise<void>((resolve, reject) => {
+        let streamDestroyed = false
+
+        // Nothing to do if the caller aborted before we started
+        if (abortController.signal.aborted) {
+            reject(new Error('TTS generation aborted'))
+            return
+        }
+
+        const handleAbort = () => {
+            // Once the stream processor has torn the stream down it reports the abort itself
+            if (!streamDestroyed) {
+                reject(new Error('TTS generation aborted'))
+            }
+        }
+
+        // The controller belongs to the caller and may outlive this call, so the
+        // listener is detached as soon as the promise settles.
+        const detachAbortListener = () => {
+            abortController.signal.removeEventListener('abort', handleAbort)
+        }
+        const settleResolved = () => {
+            detachAbortListener()
+            resolve()
+        }
+        const settleRejected = (error: unknown) => {
+            detachAbortListener()
+            reject(error)
+        }
+        const markStreamDestroyed = () => {
+            streamDestroyed = true
+        }
+
+        const processStream = async () => {
+            try {
+                if (!textToSpeechConfig) {
+                    throw new Error('Text to speech is not selected. Please configure TTS in the chatflow.')
+                }
+
+                const credentialId = textToSpeechConfig.credentialId as string
+                const credentialData = await getCredentialData(credentialId ?? '', options)
+
+                // The abort listener has already rejected the promise; do not announce a
+                // stream or contact the provider for a request nobody is waiting for.
+                if (abortController.signal.aborted) {
+                    return
+                }
+
+                switch (textToSpeechConfig.name) {
+                    case TextToSpeechType.OPENAI_TTS: {
+                        onStart('mp3')
+
+                        const openai = new OpenAI({
+                            apiKey: credentialData.openAIApiKey
+                        })
+
+                        const response = await openai.audio.speech.create(
+                            {
+                                model: 'gpt-4o-mini-tts',
+                                voice: (textToSpeechConfig.voice || 'alloy') as
+                                    | 'alloy'
+                                    | 'ash'
+                                    | 'ballad'
+                                    | 'coral'
+                                    | 'echo'
+                                    | 'fable'
+                                    | 'nova'
+                                    | 'onyx'
+                                    | 'sage'
+                                    | 'shimmer',
+                                input: text,
+                                response_format: 'mp3'
+                            },
+                            {
+                                signal: abortController.signal
+                            }
+                        )
+
+                        const stream = response.body as unknown as Readable
+                        if (!stream) {
+                            throw new Error('Failed to get response stream')
+                        }
+
+                        await processStreamWithRateLimit(
+                            stream,
+                            onChunk,
+                            onEnd,
+                            settleResolved,
+                            settleRejected,
+                            640,
+                            20,
+                            abortController,
+                            markStreamDestroyed
+                        )
+                        break
+                    }
+
+                    case TextToSpeechType.ELEVEN_LABS_TTS: {
+                        onStart('mp3')
+
+                        const client = new ElevenLabsClient({
+                            apiKey: credentialData.elevenLabsApiKey
+                        })
+
+                        const response = await client.textToSpeech.stream(
+                            textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM',
+                            {
+                                text: text,
+                                modelId: 'eleven_multilingual_v2'
+                            },
+                            { abortSignal: abortController.signal }
+                        )
+
+                        const stream = Readable.fromWeb(response as unknown as ReadableStream)
+                        if (!stream) {
+                            throw new Error('Failed to get response stream')
+                        }
+
+                        await processStreamWithRateLimit(
+                            stream,
+                            onChunk,
+                            onEnd,
+                            settleResolved,
+                            settleRejected,
+                            640,
+                            40,
+                            abortController,
+                            markStreamDestroyed
+                        )
+                        break
+                    }
+
+                    default:
+                        // Without this branch an unknown provider would leave the promise pending forever
+                        throw new Error(`Unsupported TTS provider: ${textToSpeechConfig.name}`)
+                }
+            } catch (error) {
+                settleRejected(error)
+            }
+        }
+
+        // Registered before any await so an abort during the credential lookup is not missed
+        abortController.signal.addEventListener('abort', handleAbort, { once: true })
+
+        processStream()
+    })
+}
+
+/**
+ * Re-chunk an audio stream into fixed-size pieces and hand them to `onChunk` at a paced rate.
+ *
+ * Incoming data is accumulated in a buffer. A loop emits `targetChunkSize`-byte chunks with a
+ * `rateLimitMs` pause after each one, then flushes the remainder once the stream has ended,
+ * calls `onEnd` and finally `resolve`. The function itself returns as soon as the listeners
+ * are attached; completion and failure are reported only through `resolve` / `reject`.
+ *
+ * @param stream - Source of audio data.
+ * @param onChunk - Receives each chunk.
+ * @param onEnd - Called once after the final chunk, only on successful completion.
+ * @param resolve - Called after `onEnd` when the whole stream has been delivered.
+ * @param reject - Called with the failure: stream error, premature close, abort, or a callback error.
+ * @param targetChunkSize - Chunk size in bytes; must be a positive number.
+ * @param rateLimitMs - Pause after each full chunk, and the polling interval while waiting for data.
+ * @param abortController - Aborting destroys the stream and rejects with 'TTS generation aborted'.
+ * @param onStreamDestroy - Optional hook invoked when the stream is torn down because of an abort.
+ */
+const processStreamWithRateLimit = async (
+    stream: Readable,
+    onChunk: (chunk: Buffer) => void,
+    onEnd: () => void,
+    resolve: () => void,
+    reject: (error: any) => void,
+    targetChunkSize: number = 640,
+    rateLimitMs: number = 20,
+    abortController: AbortController,
+    onStreamDestroy?: () => void
+) => {
+    // A zero or negative size would never drain the buffer and the loop below would spin forever
+    if (!Number.isFinite(targetChunkSize) || targetChunkSize <= 0) {
+        reject(new RangeError('targetChunkSize must be a positive number'))
+        return
+    }
+
+    let buffer: Buffer = Buffer.alloc(0)
+    let isEnded = false
+    let isSettled = false
+
+    // Marks the operation as finished exactly once and detaches from the caller-owned signal.
+    // Returns false when a previous outcome has already been reported.
+    const markSettled = (): boolean => {
+        if (isSettled) {
+            return false
+        }
+        isSettled = true
+        abortController.signal.removeEventListener('abort', handleAbort)
+        return true
+    }
+
+    const fail = (error: unknown) => {
+        if (markSettled()) {
+            reject(error)
+        }
+    }
+
+    const handleAbort = () => {
+        if (!stream.destroyed) {
+            stream.destroy()
+        }
+        onStreamDestroy?.()
+        fail(new Error('TTS generation aborted'))
+    }
+
+    const processChunks = async () => {
+        // `isSettled` stops the loop after an error, a premature close or an abort;
+        // without it a failed stream (which never emits 'end') would keep this loop alive.
+        while (!isSettled && (!isEnded || buffer.length > 0)) {
+            // Covers a signal that was already aborted before the listener was attached
+            if (abortController.signal.aborted) {
+                handleAbort()
+                return
+            }
+
+            if (buffer.length >= targetChunkSize) {
+                const chunk = buffer.subarray(0, targetChunkSize)
+                buffer = buffer.subarray(targetChunkSize)
+                onChunk(chunk)
+                await sleep(rateLimitMs)
+            } else if (isEnded) {
+                // Stream finished: flush the final partial chunk (non-empty by the loop condition)
+                onChunk(buffer)
+                buffer = Buffer.alloc(0)
+            } else {
+                // Not enough data yet; wait for more to arrive
+                await sleep(rateLimitMs)
+            }
+        }
+
+        // A failure or abort was reported while the loop was sleeping: do not signal a normal end
+        if (isSettled) {
+            return
+        }
+
+        onEnd()
+        if (markSettled()) {
+            resolve()
+        }
+    }
+
+    stream.on('data', (chunk) => {
+        if (!isSettled && !abortController.signal.aborted) {
+            buffer = Buffer.concat([buffer, Buffer.from(chunk)])
+        }
+    })
+
+    stream.on('end', () => {
+        isEnded = true
+    })
+
+    // This listener stays attached for the stream's lifetime so that a late 'error'
+    // (for example one caused by destroying the stream) is never left unhandled.
+    stream.on('error', (error) => {
+        fail(error)
+    })
+
+    // 'close' without a preceding 'end' means the source stopped early; report it instead of waiting forever
+    stream.on('close', () => {
+        if (!isEnded) {
+            fail(new Error('TTS audio stream closed before completion'))
+        }
+    })
+
+    abortController.signal.addEventListener('abort', handleAbort)
+
+    processChunks().catch(fail)
+}
+
+/**
+ * Wait for `ms` milliseconds.
+ *
+ * @param ms - Delay passed to `setTimeout`.
+ * @returns A promise that resolves (with no value) when the timer fires.
+ */
+const sleep = (ms: number): Promise<void> =>
+    new Promise<void>((wake) => {
+        setTimeout(wake, ms)
+    })
+
+/**
+ * List the voices available for a text-to-speech provider.
+ *
+ * The credential is looked up first for every provider. OpenAI voices come from a fixed list;
+ * ElevenLabs voices are fetched through the SDK's voice search (page size 100, voice type
+ * 'default', category 'premade').
+ *
+ * @param provider - Provider identifier, compared against `TextToSpeechType`.
+ * @param credentialId - Credential to load; an empty string is used when it is nullish.
+ * @param options - Passed through to `getCredentialData`.
+ * @returns Voice entries with `id` and `name` (plus `category` for ElevenLabs).
+ * @throws Error when `provider` is not one of the supported identifiers.
+ */
+export const getVoices = async (provider: string, credentialId: string, options: ICommonObject) => {
+    const credentials = await getCredentialData(credentialId ?? '', options)
+
+    if (provider === TextToSpeechType.OPENAI_TTS) {
+        // [id, display name] pairs of the built-in voices
+        const builtInVoices: Array<[string, string]> = [
+            ['alloy', 'Alloy'],
+            ['ash', 'Ash'],
+            ['ballad', 'Ballad'],
+            ['coral', 'Coral'],
+            ['echo', 'Echo'],
+            ['fable', 'Fable'],
+            ['nova', 'Nova'],
+            ['onyx', 'Onyx'],
+            ['sage', 'Sage'],
+            ['shimmer', 'Shimmer']
+        ]
+        return builtInVoices.map(([id, name]) => ({ id, name }))
+    }
+
+    if (provider === TextToSpeechType.ELEVEN_LABS_TTS) {
+        const elevenLabs = new ElevenLabsClient({
+            apiKey: credentials.elevenLabsApiKey
+        })
+
+        const searchResult = await elevenLabs.voices.search({
+            pageSize: 100,
+            voiceType: 'default',
+            category: 'premade'
+        })
+
+        return searchResult.voices.map((voice) => ({
+            id: voice.voiceId,
+            name: voice.name,
+            category: voice.category
+        }))
+    }
+
+    throw new Error(`Unsupported TTS provider: ${provider}`)
+}