[FEATURE] Added support for LocalAI Speech To Text configuration (#2376)

* Added support for LocalAI to the Speech To Text configuration. Added a few debug statements around speech-to-text conversion. Finally, refactored speechToTextProviders a bit to try to remove some magic strings that have undocumented naming rules.

* LocalAI STT - PR Feedback - Updated LocalAI Image, changed casing, and updated the default model to whisper-1.
This commit is contained in:
clates
2024-05-13 07:21:27 -04:00
committed by GitHub
parent 823cefb5c5
commit d3f03e380e
4 changed files with 129 additions and 32 deletions
+54 -25
View File
@@ -4,40 +4,69 @@ import { type ClientOptions, OpenAIClient } from '@langchain/openai'
import { AssemblyAI } from 'assemblyai' import { AssemblyAI } from 'assemblyai'
import { getFileFromStorage } from './storageUtils' import { getFileFromStorage } from './storageUtils'
/**
 * Identifiers of the supported speech-to-text providers.
 *
 * These values are compared against `speechToTextConfig.name` to pick the
 * provider branch. Declared `as const` so each value keeps its string-literal
 * type and the map is read-only at compile time.
 */
const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT'
} as const
export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => { export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
if (speechToTextConfig) { if (speechToTextConfig) {
const credentialId = speechToTextConfig.credentialId as string const credentialId = speechToTextConfig.credentialId as string
const credentialData = await getCredentialData(credentialId ?? '', options) const credentialData = await getCredentialData(credentialId ?? '', options)
const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId) const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)
if (speechToTextConfig.name === 'openAIWhisper') { switch (speechToTextConfig.name) {
const openAIClientOptions: ClientOptions = { case SpeechToTextType.OPENAI_WHISPER: {
apiKey: credentialData.openAIApiKey const openAIClientOptions: ClientOptions = {
apiKey: credentialData.openAIApiKey
}
const openAIClient = new OpenAIClient(openAIClientOptions)
const openAITranscription = await openAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (openAITranscription?.text) {
return openAITranscription.text
}
break
} }
const openAIClient = new OpenAIClient(openAIClientOptions) case SpeechToTextType.ASSEMBLYAI_TRANSCRIBE: {
const transcription = await openAIClient.audio.transcriptions.create({ const assemblyAIClient = new AssemblyAI({
file: new File([new Blob([audio_file])], upload.name), apiKey: credentialData.assemblyAIApiKey
model: 'whisper-1', })
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (transcription?.text) {
return transcription.text
}
} else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
const client = new AssemblyAI({
apiKey: credentialData.assemblyAIApiKey
})
const params = { const params = {
audio: audio_file, audio: audio_file,
speaker_labels: false speaker_labels: false
} }
const transcription = await client.transcripts.transcribe(params) const assemblyAITranscription = await assemblyAIClient.transcripts.transcribe(params)
if (transcription?.text) { if (assemblyAITranscription?.text) {
return transcription.text return assemblyAITranscription.text
}
break
}
case SpeechToTextType.LOCALAI_STT: {
const LocalAIClientOptions: ClientOptions = {
apiKey: credentialData.localAIApiKey,
baseURL: speechToTextConfig?.baseUrl
}
const localAIClient = new OpenAIClient(LocalAIClientOptions)
const localAITranscription = await localAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: speechToTextConfig?.model || 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (localAITranscription?.text) {
return localAITranscription.text
}
break
} }
} }
} else { } else {
+3 -1
View File
@@ -78,7 +78,8 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
} }
// Run Speech to Text conversion // Run Speech to Text conversion
if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') { if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4' || upload.mime === 'audio/ogg') {
logger.debug(`Attempting a speech to text conversion...`)
let speechToTextConfig: ICommonObject = {} let speechToTextConfig: ICommonObject = {}
if (chatflow.speechToText) { if (chatflow.speechToText) {
const speechToTextProviders = JSON.parse(chatflow.speechToText) const speechToTextProviders = JSON.parse(chatflow.speechToText)
@@ -99,6 +100,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
databaseEntities: databaseEntities databaseEntities: databaseEntities
} }
const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options) const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
logger.debug(`Speech to text result: ${speechToTextResult}`)
if (speechToTextResult) { if (speechToTextResult) {
incomingInput.question = speechToTextResult incomingInput.question = speechToTextResult
} }
Binary file not shown.

After

Width:  |  Height:  |  Size: 141 KiB

@@ -16,6 +16,7 @@ import { StyledButton } from '@/ui-component/button/StyledButton'
import { Dropdown } from '@/ui-component/dropdown/Dropdown' import { Dropdown } from '@/ui-component/dropdown/Dropdown'
import openAISVG from '@/assets/images/openai.svg' import openAISVG from '@/assets/images/openai.svg'
import assemblyAIPng from '@/assets/images/assemblyai.png' import assemblyAIPng from '@/assets/images/assemblyai.png'
import localAiPng from '@/assets/images/localai.png'
// store // store
import useNotifier from '@/utils/useNotifier' import useNotifier from '@/utils/useNotifier'
@@ -23,10 +24,19 @@ import useNotifier from '@/utils/useNotifier'
// API // API
import chatflowsApi from '@/api/chatflows' import chatflowsApi from '@/api/chatflows'
// Identifiers of the supported speech-to-text providers.
// If implementing a new provider, this must be updated in
// components/src/speechToText.ts as well.
// Frozen so the shared identifiers cannot be mutated at runtime.
const SpeechToTextType = Object.freeze({
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT'
})
// Quirk: each key in this object must match that entry's `name` property value.
const speechToTextProviders = { const speechToTextProviders = {
openAIWhisper: { [SpeechToTextType.OPENAI_WHISPER]: {
label: 'OpenAI Whisper', label: 'OpenAI Whisper',
name: 'openAIWhisper', name: SpeechToTextType.OPENAI_WHISPER,
icon: openAISVG, icon: openAISVG,
url: 'https://platform.openai.com/docs/guides/speech-to-text', url: 'https://platform.openai.com/docs/guides/speech-to-text',
inputs: [ inputs: [
@@ -63,9 +73,9 @@ const speechToTextProviders = {
} }
] ]
}, },
assemblyAiTranscribe: { [SpeechToTextType.ASSEMBLYAI_TRANSCRIBE]: {
label: 'Assembly AI', label: 'Assembly AI',
name: 'assemblyAiTranscribe', name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
icon: assemblyAIPng, icon: assemblyAIPng,
url: 'https://www.assemblyai.com/', url: 'https://www.assemblyai.com/',
inputs: [ inputs: [
@@ -76,6 +86,59 @@ const speechToTextProviders = {
credentialNames: ['assemblyAIApi'] credentialNames: ['assemblyAIApi']
} }
] ]
},
// LocalAI provider entry: display label, icon, documentation URL and the
// list of input fields declared for this provider.
// NOTE(review): the input `name` values below (baseUrl, language, model,
// prompt, temperature) appear to correspond to the speechToTextConfig
// properties read by the LocalAI branch of convertSpeechToText — confirm.
[SpeechToTextType.LOCALAI_STT]: {
label: 'LocalAi STT',
name: SpeechToTextType.LOCALAI_STT,
icon: localAiPng,
url: 'https://localai.io/features/audio-to-text/',
inputs: [
// Credential selector limited to the 'localAIApi' credential name.
{
label: 'Connect Credential',
name: 'credential',
type: 'credential',
credentialNames: ['localAIApi']
},
// Base URL of the LocalAI server; no `optional` flag is set on this field.
{
label: 'Base URL',
name: 'baseUrl',
type: 'string',
description: 'The base URL of the local AI server'
},
// Optional language hint for the input audio.
{
label: 'Language',
name: 'language',
type: 'string',
description:
'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
placeholder: 'en',
optional: true
},
// Optional model name; the description states that whisper-1 is used when blank.
{
label: 'Model',
name: 'model',
type: 'string',
description: `The STT model to load. Defaults to whisper-1 if left blank.`,
placeholder: 'whisper-1',
optional: true
},
// Optional multi-line prompt field (rows: 4).
{
label: 'Prompt',
name: 'prompt',
type: 'string',
rows: 4,
description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
optional: true
},
// Optional numeric sampling temperature, adjustable in steps of 0.1.
{
label: 'Temperature',
name: 'temperature',
type: 'number',
step: 0.1,
description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
optional: true
}
]
} }
} }
@@ -191,8 +254,11 @@ const SpeechToText = ({ dialogProps }) => {
<FormControl fullWidth> <FormControl fullWidth>
<Select size='small' value={selectedProvider} onChange={handleProviderChange}> <Select size='small' value={selectedProvider} onChange={handleProviderChange}>
<MenuItem value='none'>None</MenuItem> <MenuItem value='none'>None</MenuItem>
<MenuItem value='openAIWhisper'>OpenAI Whisper</MenuItem> {Object.values(speechToTextProviders).map((provider) => (
<MenuItem value='assemblyAiTranscribe'>Assembly AI</MenuItem> <MenuItem key={provider.name} value={provider.name}>
{provider.label}
</MenuItem>
))}
</Select> </Select>
</FormControl> </FormControl>
</Box> </Box>