[FEATURE] Added support for LocalAI Speech To Text configuration (#2376)

* Added support for LocalAI to the Speech To Text configuration. Added a few debug statements around the speech-to-text conversion. Finally, refactored speechToTextProviders a bit to try to remove some magic strings that have undocumented naming rules. * LocalAI STT - PR Feedback - Updated the LocalAI image, changed casing, and updated the default model to whisper-1.
2026-06-22 11:01:22 +03:00 · 2024-05-13 07:21:27 -04:00
parent 823cefb5c5
commit d3f03e380e
4 changed files with 129 additions and 32 deletions
@@ -4,40 +4,69 @@ import { type ClientOptions, OpenAIClient } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'

+const SpeechToTextType = {
+    OPENAI_WHISPER: 'openAIWhisper',
+    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
+    LOCALAI_STT: 'localAISTT'
+}
+
 export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
    if (speechToTextConfig) {
        const credentialId = speechToTextConfig.credentialId as string
        const credentialData = await getCredentialData(credentialId ?? '', options)
        const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)

-        if (speechToTextConfig.name === 'openAIWhisper') {
-            const openAIClientOptions: ClientOptions = {
-                apiKey: credentialData.openAIApiKey
+        switch (speechToTextConfig.name) {
+            case SpeechToTextType.OPENAI_WHISPER: {
+                const openAIClientOptions: ClientOptions = {
+                    apiKey: credentialData.openAIApiKey
+                }
+                const openAIClient = new OpenAIClient(openAIClientOptions)
+                const openAITranscription = await openAIClient.audio.transcriptions.create({
+                    file: new File([new Blob([audio_file])], upload.name),
+                    model: 'whisper-1',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    prompt: speechToTextConfig?.prompt
+                })
+                if (openAITranscription?.text) {
+                    return openAITranscription.text
+                }
+                break
            }
-            const openAIClient = new OpenAIClient(openAIClientOptions)
-            const transcription = await openAIClient.audio.transcriptions.create({
-                file: new File([new Blob([audio_file])], upload.name),
-                model: 'whisper-1',
-                language: speechToTextConfig?.language,
-                temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
-                prompt: speechToTextConfig?.prompt
-            })
-            if (transcription?.text) {
-                return transcription.text
-            }
-        } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
-            const client = new AssemblyAI({
-                apiKey: credentialData.assemblyAIApiKey
-            })
+            case SpeechToTextType.ASSEMBLYAI_TRANSCRIBE: {
+                const assemblyAIClient = new AssemblyAI({
+                    apiKey: credentialData.assemblyAIApiKey
+                })

-            const params = {
-                audio: audio_file,
-                speaker_labels: false
-            }
+                const params = {
+                    audio: audio_file,
+                    speaker_labels: false
+                }

-            const transcription = await client.transcripts.transcribe(params)
-            if (transcription?.text) {
-                return transcription.text
+                const assemblyAITranscription = await assemblyAIClient.transcripts.transcribe(params)
+                if (assemblyAITranscription?.text) {
+                    return assemblyAITranscription.text
+                }
+                break
+            }
+            case SpeechToTextType.LOCALAI_STT: {
+                const LocalAIClientOptions: ClientOptions = {
+                    apiKey: credentialData.localAIApiKey,
+                    baseURL: speechToTextConfig?.baseUrl
+                }
+                const localAIClient = new OpenAIClient(LocalAIClientOptions)
+                const localAITranscription = await localAIClient.audio.transcriptions.create({
+                    file: new File([new Blob([audio_file])], upload.name),
+                    model: speechToTextConfig?.model || 'whisper-1',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    prompt: speechToTextConfig?.prompt
+                })
+                if (localAITranscription?.text) {
+                    return localAITranscription.text
+                }
+                break
            }
        }
    } else {
@@ -78,7 +78,8 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
                }

                // Run Speech to Text conversion
-                if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') {
+                if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4' || upload.mime === 'audio/ogg') {
+                    logger.debug(`Attempting a speech to text conversion...`)
                    let speechToTextConfig: ICommonObject = {}
                    if (chatflow.speechToText) {
                        const speechToTextProviders = JSON.parse(chatflow.speechToText)
@@ -99,6 +100,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
                            databaseEntities: databaseEntities
                        }
                        const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
+                        logger.debug(`Speech to text result: ${speechToTextResult}`)
                        if (speechToTextResult) {
                            incomingInput.question = speechToTextResult
                        }
@@ -16,6 +16,7 @@ import { StyledButton } from '@/ui-component/button/StyledButton'
 import { Dropdown } from '@/ui-component/dropdown/Dropdown'
 import openAISVG from '@/assets/images/openai.svg'
 import assemblyAIPng from '@/assets/images/assemblyai.png'
+import localAiPng from '@/assets/images/localai.png'

 // store
 import useNotifier from '@/utils/useNotifier'
@@ -23,10 +24,19 @@ import useNotifier from '@/utils/useNotifier'
 // API
 import chatflowsApi from '@/api/chatflows'

+// If implementing a new provider, this must be updated in
+// components/src/speechToText.ts as well
+const SpeechToTextType = {
+    OPENAI_WHISPER: 'openAIWhisper',
+    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
+    LOCALAI_STT: 'localAISTT'
+}
+
+// Quirk: each key in this object must be identical to that entry's `name` property value.
 const speechToTextProviders = {
-    openAIWhisper: {
+    [SpeechToTextType.OPENAI_WHISPER]: {
        label: 'OpenAI Whisper',
-        name: 'openAIWhisper',
+        name: SpeechToTextType.OPENAI_WHISPER,
        icon: openAISVG,
        url: 'https://platform.openai.com/docs/guides/speech-to-text',
        inputs: [
@@ -63,9 +73,9 @@ const speechToTextProviders = {
            }
        ]
    },
-    assemblyAiTranscribe: {
+    [SpeechToTextType.ASSEMBLYAI_TRANSCRIBE]: {
        label: 'Assembly AI',
-        name: 'assemblyAiTranscribe',
+        name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
        icon: assemblyAIPng,
        url: 'https://www.assemblyai.com/',
        inputs: [
@@ -76,6 +86,59 @@ const speechToTextProviders = {
                credentialNames: ['assemblyAIApi']
            }
        ]
+    },
+    [SpeechToTextType.LOCALAI_STT]: {
+        label: 'LocalAi STT',
+        name: SpeechToTextType.LOCALAI_STT,
+        icon: localAiPng,
+        url: 'https://localai.io/features/audio-to-text/',
+        inputs: [
+            {
+                label: 'Connect Credential',
+                name: 'credential',
+                type: 'credential',
+                credentialNames: ['localAIApi']
+            },
+            {
+                label: 'Base URL',
+                name: 'baseUrl',
+                type: 'string',
+                description: 'The base URL of the local AI server'
+            },
+            {
+                label: 'Language',
+                name: 'language',
+                type: 'string',
+                description:
+                    'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
+                placeholder: 'en',
+                optional: true
+            },
+            {
+                label: 'Model',
+                name: 'model',
+                type: 'string',
+                description: `The STT model to load. Defaults to whisper-1 if left blank.`,
+                placeholder: 'whisper-1',
+                optional: true
+            },
+            {
+                label: 'Prompt',
+                name: 'prompt',
+                type: 'string',
+                rows: 4,
+                description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
+                optional: true
+            },
+            {
+                label: 'Temperature',
+                name: 'temperature',
+                type: 'number',
+                step: 0.1,
+                description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
+                optional: true
+            }
+        ]
    }
 }

@@ -191,8 +254,11 @@ const SpeechToText = ({ dialogProps }) => {
                <FormControl fullWidth>
                    <Select size='small' value={selectedProvider} onChange={handleProviderChange}>
                        <MenuItem value='none'>None</MenuItem>
-                        <MenuItem value='openAIWhisper'>OpenAI Whisper</MenuItem>
-                        <MenuItem value='assemblyAiTranscribe'>Assembly AI</MenuItem>
+                        {Object.values(speechToTextProviders).map((provider) => (
+                            <MenuItem key={provider.name} value={provider.name}>
+                                {provider.label}
+                            </MenuItem>
+                        ))}
                    </Select>
                </FormControl>
            </Box>