mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 13:00:56 +03:00
Text to speech (#5062)
* Add tts UI * Add tts backend * Add description to eleven labs credentials * Fix issue with fetching eleven labs voices * Fix issue with text to speech tab not showing correct saved voice * Add option to autoplay tts audio after prediction completes * Fix crash issue when first changing tts provider * Set up streaming response for text to speech audio * Update controllers - fix issue with sse client getting removed before tts events are sent * Use existing sse streamer to stream tts audio before sse client is removed * Add tts sse to redis publisher * Fix issues with TTS - openai voices, streaming audio, rate limiting, speed of speech * Refactor * Refactor TTS - fix issues with tts loading and stop audio buttons * Abort TTS SSE when clicking the stop button * Update SSE handling for TTS * Fix issue with test voice feature * Fix issue with tts voices not loading * Update generate tts endpoint and its usage in internal chat * Whitelist tts generate endpoint * Refactor Text-to-Speech Provider Selection and Enhance UI Components - Updated the text-to-speech controller to select the active provider based on status instead of the first available provider - Added audio waveform controls and test audio functionality in the TextToSpeech component, allowing users to play and pause test audio - Integrated Autocomplete for voice selection in the TextToSpeech component - Implemented TTS action management in ChatMessage to prevent auto-scrolling during TTS actions * - Implemented stopAllTTS function calls to halt existing TTS audio before playing new audio or starting a new TTS stream * Updated the condition for enabling TTS providers to exclude the 'none' provider, ensuring only valid providers are considered for text-to-speech functionality. 
* Remove unnecessary code * Add ability to abort audio streaming in TTS and release lock on chat input * Remove logger * Fix tts audio not playing when clicking speaker button * update * TTS abort controller * Fix abort not working for TTS autoplay * Send metadata event when aborting autoplay TTS * Fix UI issue * Remove elevenlabs sdk from root package.json * Remove redundant condition for tts autoplay in chatflow --------- Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
@@ -441,6 +441,9 @@ export interface IServerSideEventStreamer {
|
||||
streamAbortEvent(chatId: string): void
|
||||
streamEndEvent(chatId: string): void
|
||||
streamUsageMetadataEvent(chatId: string, data: any): void
|
||||
streamTTSStartEvent(chatId: string, chatMessageId: string, format: string): void
|
||||
streamTTSDataEvent(chatId: string, chatMessageId: string, audioChunk: string): void
|
||||
streamTTSEndEvent(chatId: string, chatMessageId: string): void
|
||||
}
|
||||
|
||||
export enum FollowUpPromptProvider {
|
||||
|
||||
@@ -7,6 +7,7 @@ dotenv.config({ path: envPath, override: true })
|
||||
export * from './Interface'
|
||||
export * from './utils'
|
||||
export * from './speechToText'
|
||||
export * from './textToSpeech'
|
||||
export * from './storageUtils'
|
||||
export * from './handler'
|
||||
export * from '../evaluation/EvaluationRunner'
|
||||
|
||||
@@ -0,0 +1,240 @@
|
||||
import { ICommonObject } from './Interface'
|
||||
import { getCredentialData } from './utils'
|
||||
import OpenAI from 'openai'
|
||||
import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js'
|
||||
import { Readable } from 'node:stream'
|
||||
import type { ReadableStream } from 'node:stream/web'
|
||||
|
||||
/**
 * Identifiers of the supported text-to-speech providers.
 *
 * These values are compared against `textToSpeechConfig.name` and the `provider`
 * argument of `getVoices`. `as const` keeps the members readonly and typed as the
 * literals 'openai' / 'elevenlabs' instead of widening them to `string`.
 */
const TextToSpeechType = {
    OPENAI_TTS: 'openai',
    ELEVEN_LABS_TTS: 'elevenlabs'
} as const
|
||||
|
||||
/**
 * Synthesizes `text` with the configured provider and delivers the audio through callbacks.
 *
 * `onStart('mp3')` is invoked before the provider request is issued. The provider's
 * response stream is then handed to `processStreamWithRateLimit`, which calls `onChunk`
 * and `onEnd` and settles the returned promise.
 *
 * @param text - Text to synthesize.
 * @param textToSpeechConfig - Provider selection; this function reads `name`, `credentialId` and `voice`.
 * @param options - Passed through to `getCredentialData`.
 * @param abortController - Its signal is forwarded to the provider SDK and cancels streaming.
 * @param onStart - Called once with the audio format before the provider request.
 * @param onChunk - Called with each audio chunk.
 * @param onEnd - Called after the last chunk.
 * @returns A promise that resolves when streaming finishes.
 * @throws Rejects with `TTS generation aborted` on abort, with
 *   `Text to speech is not selected...` when no config is given, with
 *   `Unsupported TTS provider: ...` for an unknown `name`, or with the provider/stream error.
 */
export const convertTextToSpeechStream = async (
    text: string,
    textToSpeechConfig: ICommonObject,
    options: ICommonObject,
    abortController: AbortController,
    onStart: (format: string) => void,
    onChunk: (chunk: Buffer) => void,
    onEnd: () => void
): Promise<void> => {
    return new Promise<void>((resolve, reject) => {
        const { signal } = abortController
        let streamDestroyed = false

        // Nothing to do if the caller aborted before we even started.
        if (signal.aborted) {
            reject(new Error('TTS generation aborted'))
            return
        }

        // Rejects on abort unless the stream processor already tore the stream down
        // (in which case it has reported the abort itself).
        const onAbort = () => {
            if (!streamDestroyed) {
                fail(new Error('TTS generation aborted'))
            }
        }

        // Both settle paths detach the abort listener so it does not outlive this call.
        const finish = () => {
            signal.removeEventListener('abort', onAbort)
            resolve()
        }
        const fail = (error: unknown) => {
            signal.removeEventListener('abort', onAbort)
            reject(error)
        }

        const markStreamDestroyed = () => {
            streamDestroyed = true
        }

        const processStream = async () => {
            try {
                if (!textToSpeechConfig) {
                    fail(new Error('Text to speech is not selected. Please configure TTS in the chatflow.'))
                    return
                }

                const credentialId = textToSpeechConfig.credentialId as string
                const credentialData = await getCredentialData(credentialId ?? '', options)

                switch (textToSpeechConfig.name) {
                    case TextToSpeechType.OPENAI_TTS: {
                        onStart('mp3')

                        const openai = new OpenAI({
                            apiKey: credentialData.openAIApiKey
                        })

                        const response = await openai.audio.speech.create(
                            {
                                model: 'gpt-4o-mini-tts',
                                voice: (textToSpeechConfig.voice || 'alloy') as
                                    | 'alloy'
                                    | 'ash'
                                    | 'ballad'
                                    | 'coral'
                                    | 'echo'
                                    | 'fable'
                                    | 'nova'
                                    | 'onyx'
                                    | 'sage'
                                    | 'shimmer',
                                input: text,
                                response_format: 'mp3'
                            },
                            {
                                signal
                            }
                        )

                        // NOTE(review): assumes the SDK exposes the body as a Node Readable — confirm for the installed openai version.
                        const stream = response.body as unknown as Readable
                        if (!stream) {
                            throw new Error('Failed to get response stream')
                        }

                        await processStreamWithRateLimit(stream, onChunk, onEnd, finish, fail, 640, 20, abortController, markStreamDestroyed)
                        break
                    }

                    case TextToSpeechType.ELEVEN_LABS_TTS: {
                        onStart('mp3')

                        const client = new ElevenLabsClient({
                            apiKey: credentialData.elevenLabsApiKey
                        })

                        const response = await client.textToSpeech.stream(
                            textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM',
                            {
                                text: text,
                                modelId: 'eleven_multilingual_v2'
                            },
                            { abortSignal: signal }
                        )

                        // Readable.fromWeb returns a stream or throws, so no null check is needed here.
                        const stream = Readable.fromWeb(response as unknown as ReadableStream)

                        await processStreamWithRateLimit(stream, onChunk, onEnd, finish, fail, 640, 40, abortController, markStreamDestroyed)
                        break
                    }

                    default:
                        // Without this branch an unknown provider name would leave the promise pending forever.
                        fail(new Error(`Unsupported TTS provider: ${textToSpeechConfig.name}`))
                }
            } catch (error) {
                fail(error)
            }
        }

        signal.addEventListener('abort', onAbort, { once: true })

        // processStream reports every failure through fail(), so the returned promise never rejects.
        void processStream()
    })
}
|
||||
|
||||
/**
 * Re-chunks a readable stream and emits it at a fixed pace.
 *
 * Incoming data is appended to an in-memory buffer. A polling loop slices
 * `targetChunkSize`-byte chunks off the front, passes each to `onChunk`, and sleeps
 * `rateLimitMs` between chunks. Once the stream has ended, any remainder shorter than
 * `targetChunkSize` is emitted as a final chunk, then `onEnd()` and `resolve()` are called.
 *
 * The outcome is reported through `resolve` / `reject`, and exactly one of them is called
 * exactly once. The returned promise resolves as soon as the handlers are wired up; it does
 * NOT wait for streaming to finish.
 *
 * @param stream - Source of audio bytes.
 * @param onChunk - Receives each emitted chunk.
 * @param onEnd - Called once after the final chunk, before `resolve`.
 * @param resolve - Called when all data has been emitted.
 * @param reject - Called on abort, stream error, premature close, or a throwing callback.
 * @param targetChunkSize - Size in bytes of each emitted chunk (the last one may be smaller).
 * @param rateLimitMs - Delay in milliseconds between chunks, and the polling interval while waiting for data.
 * @param abortController - Aborting destroys the stream and rejects with `TTS generation aborted`.
 * @param onStreamDestroy - Optional hook invoked when the stream is torn down because of an abort.
 */
const processStreamWithRateLimit = async (
    stream: Readable,
    onChunk: (chunk: Buffer) => void,
    onEnd: () => void,
    resolve: () => void,
    reject: (error: any) => void,
    targetChunkSize: number = 640,
    rateLimitMs: number = 20,
    abortController: AbortController,
    onStreamDestroy?: () => void
) => {
    const { signal } = abortController

    let buffer: Buffer = Buffer.alloc(0)
    let isEnded = false
    // Set as soon as resolve/reject has been called; stops the polling loop and dedupes outcomes.
    let settled = false

    const succeed = () => {
        if (settled) return
        settled = true
        signal.removeEventListener('abort', handleAbort)
        resolve()
    }

    const fail = (error: unknown) => {
        if (settled) return
        settled = true
        signal.removeEventListener('abort', handleAbort)
        // Release the underlying source; no further data will be consumed.
        if (!stream.destroyed) {
            stream.destroy()
        }
        reject(error)
    }

    const handleAbort = () => {
        if (settled) return
        if (!stream.destroyed) {
            stream.destroy()
        }
        onStreamDestroy?.()
        fail(new Error('TTS generation aborted'))
    }

    const processChunks = async () => {
        while (!settled && (!isEnded || buffer.length > 0)) {
            if (signal.aborted) {
                handleAbort()
                return
            }

            if (buffer.length >= targetChunkSize) {
                const chunk = buffer.subarray(0, targetChunkSize)
                buffer = buffer.subarray(targetChunkSize)
                onChunk(chunk)
                await sleep(rateLimitMs)
            } else if (isEnded) {
                // Stream is finished: flush the remainder (non-empty by the loop condition).
                onChunk(buffer)
                buffer = Buffer.alloc(0)
            } else {
                // Not enough data yet; wait for more to arrive.
                await sleep(rateLimitMs)
            }
        }

        // A failure or abort during a sleep has already been reported.
        if (settled) return

        onEnd()
        succeed()
    }

    stream.on('data', (chunk) => {
        if (!settled && !signal.aborted) {
            buffer = Buffer.concat([buffer, Buffer.from(chunk)])
        }
    })

    stream.on('end', () => {
        isEnded = true
    })

    stream.on('error', (error) => {
        // Marks the run as settled so the polling loop stops instead of waiting for an 'end' that never comes.
        fail(error)
    })

    stream.on('close', () => {
        // A close without a preceding 'end' means the source went away mid-stream.
        if (!isEnded) {
            fail(new Error('TTS stream closed before completion'))
        }
    })

    signal.addEventListener('abort', handleAbort, { once: true })

    processChunks().catch(fail)
}
|
||||
|
||||
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed.
 */
const sleep = (ms: number): Promise<void> =>
    new Promise<void>((wake) => {
        setTimeout(wake, ms)
    })
|
||||
|
||||
/**
 * Lists the voices available for a text-to-speech provider.
 *
 * Credentials are resolved up front for every provider. OpenAI voices come from a
 * fixed list; ElevenLabs voices are fetched with a single `voices.search` call.
 *
 * @param provider - Provider identifier, matched against `TextToSpeechType`.
 * @param credentialId - Credential to resolve; an empty id is used when nullish.
 * @param options - Passed through to `getCredentialData`.
 * @returns Voice descriptors with `id` and `name` (plus `category` for ElevenLabs).
 * @throws Error `Unsupported TTS provider: <provider>` for any other provider.
 */
export const getVoices = async (provider: string, credentialId: string, options: ICommonObject) => {
    const credentialData = await getCredentialData(credentialId ?? '', options)

    if (provider === TextToSpeechType.OPENAI_TTS) {
        // Display name is the voice id with its first letter capitalised.
        const openAiVoiceIds = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'nova', 'onyx', 'sage', 'shimmer']
        return openAiVoiceIds.map((id) => ({
            id,
            name: id.charAt(0).toUpperCase() + id.slice(1)
        }))
    }

    if (provider === TextToSpeechType.ELEVEN_LABS_TTS) {
        const elevenLabs = new ElevenLabsClient({
            apiKey: credentialData.elevenLabsApiKey
        })

        const searchResult = await elevenLabs.voices.search({
            pageSize: 100,
            voiceType: 'default',
            category: 'premade'
        })

        return searchResult.voices.map((entry) => ({
            id: entry.voiceId,
            name: entry.name,
            category: entry.category
        }))
    }

    throw new Error(`Unsupported TTS provider: ${provider}`)
}
|
||||
Reference in New Issue
Block a user