mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 23:01:09 +03:00
[FEATURE] Added support for LocalAI Speech To Text configuration (#2376)
* Added support for LocalAI to the Speech To Text configuration. Added a few debug statements around the speech-to-text conversion. Finally, refactored speechToTextProviders to replace some magic strings, which had undocumented naming rules, with named constants. * LocalAI STT - PR feedback: updated the LocalAI image, changed casing, and changed the default model to whisper-1.
This commit is contained in:
@@ -4,40 +4,69 @@ import { type ClientOptions, OpenAIClient } from '@langchain/openai'
|
|||||||
import { AssemblyAI } from 'assemblyai'
|
import { AssemblyAI } from 'assemblyai'
|
||||||
import { getFileFromStorage } from './storageUtils'
|
import { getFileFromStorage } from './storageUtils'
|
||||||
|
|
||||||
|
const SpeechToTextType = {
|
||||||
|
OPENAI_WHISPER: 'openAIWhisper',
|
||||||
|
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
|
||||||
|
LOCALAI_STT: 'localAISTT'
|
||||||
|
}
|
||||||
|
|
||||||
export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
|
export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
|
||||||
if (speechToTextConfig) {
|
if (speechToTextConfig) {
|
||||||
const credentialId = speechToTextConfig.credentialId as string
|
const credentialId = speechToTextConfig.credentialId as string
|
||||||
const credentialData = await getCredentialData(credentialId ?? '', options)
|
const credentialData = await getCredentialData(credentialId ?? '', options)
|
||||||
const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)
|
const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)
|
||||||
|
|
||||||
if (speechToTextConfig.name === 'openAIWhisper') {
|
switch (speechToTextConfig.name) {
|
||||||
const openAIClientOptions: ClientOptions = {
|
case SpeechToTextType.OPENAI_WHISPER: {
|
||||||
apiKey: credentialData.openAIApiKey
|
const openAIClientOptions: ClientOptions = {
|
||||||
|
apiKey: credentialData.openAIApiKey
|
||||||
|
}
|
||||||
|
const openAIClient = new OpenAIClient(openAIClientOptions)
|
||||||
|
const openAITranscription = await openAIClient.audio.transcriptions.create({
|
||||||
|
file: new File([new Blob([audio_file])], upload.name),
|
||||||
|
model: 'whisper-1',
|
||||||
|
language: speechToTextConfig?.language,
|
||||||
|
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
|
||||||
|
prompt: speechToTextConfig?.prompt
|
||||||
|
})
|
||||||
|
if (openAITranscription?.text) {
|
||||||
|
return openAITranscription.text
|
||||||
|
}
|
||||||
|
break
|
||||||
}
|
}
|
||||||
const openAIClient = new OpenAIClient(openAIClientOptions)
|
case SpeechToTextType.ASSEMBLYAI_TRANSCRIBE: {
|
||||||
const transcription = await openAIClient.audio.transcriptions.create({
|
const assemblyAIClient = new AssemblyAI({
|
||||||
file: new File([new Blob([audio_file])], upload.name),
|
apiKey: credentialData.assemblyAIApiKey
|
||||||
model: 'whisper-1',
|
})
|
||||||
language: speechToTextConfig?.language,
|
|
||||||
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
|
|
||||||
prompt: speechToTextConfig?.prompt
|
|
||||||
})
|
|
||||||
if (transcription?.text) {
|
|
||||||
return transcription.text
|
|
||||||
}
|
|
||||||
} else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
|
|
||||||
const client = new AssemblyAI({
|
|
||||||
apiKey: credentialData.assemblyAIApiKey
|
|
||||||
})
|
|
||||||
|
|
||||||
const params = {
|
const params = {
|
||||||
audio: audio_file,
|
audio: audio_file,
|
||||||
speaker_labels: false
|
speaker_labels: false
|
||||||
}
|
}
|
||||||
|
|
||||||
const transcription = await client.transcripts.transcribe(params)
|
const assemblyAITranscription = await assemblyAIClient.transcripts.transcribe(params)
|
||||||
if (transcription?.text) {
|
if (assemblyAITranscription?.text) {
|
||||||
return transcription.text
|
return assemblyAITranscription.text
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
case SpeechToTextType.LOCALAI_STT: {
|
||||||
|
const LocalAIClientOptions: ClientOptions = {
|
||||||
|
apiKey: credentialData.localAIApiKey,
|
||||||
|
baseURL: speechToTextConfig?.baseUrl
|
||||||
|
}
|
||||||
|
const localAIClient = new OpenAIClient(LocalAIClientOptions)
|
||||||
|
const localAITranscription = await localAIClient.audio.transcriptions.create({
|
||||||
|
file: new File([new Blob([audio_file])], upload.name),
|
||||||
|
model: speechToTextConfig?.model || 'whisper-1',
|
||||||
|
language: speechToTextConfig?.language,
|
||||||
|
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
|
||||||
|
prompt: speechToTextConfig?.prompt
|
||||||
|
})
|
||||||
|
if (localAITranscription?.text) {
|
||||||
|
return localAITranscription.text
|
||||||
|
}
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -78,7 +78,8 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Run Speech to Text conversion
|
// Run Speech to Text conversion
|
||||||
if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') {
|
if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4' || upload.mime === 'audio/ogg') {
|
||||||
|
logger.debug(`Attempting a speech to text conversion...`)
|
||||||
let speechToTextConfig: ICommonObject = {}
|
let speechToTextConfig: ICommonObject = {}
|
||||||
if (chatflow.speechToText) {
|
if (chatflow.speechToText) {
|
||||||
const speechToTextProviders = JSON.parse(chatflow.speechToText)
|
const speechToTextProviders = JSON.parse(chatflow.speechToText)
|
||||||
@@ -99,6 +100,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
|
|||||||
databaseEntities: databaseEntities
|
databaseEntities: databaseEntities
|
||||||
}
|
}
|
||||||
const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
|
const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
|
||||||
|
logger.debug(`Speech to text result: ${speechToTextResult}`)
|
||||||
if (speechToTextResult) {
|
if (speechToTextResult) {
|
||||||
incomingInput.question = speechToTextResult
|
incomingInput.question = speechToTextResult
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
|
After Width: | Height: | Size: 141 KiB |
@@ -16,6 +16,7 @@ import { StyledButton } from '@/ui-component/button/StyledButton'
|
|||||||
import { Dropdown } from '@/ui-component/dropdown/Dropdown'
|
import { Dropdown } from '@/ui-component/dropdown/Dropdown'
|
||||||
import openAISVG from '@/assets/images/openai.svg'
|
import openAISVG from '@/assets/images/openai.svg'
|
||||||
import assemblyAIPng from '@/assets/images/assemblyai.png'
|
import assemblyAIPng from '@/assets/images/assemblyai.png'
|
||||||
|
import localAiPng from '@/assets/images/localai.png'
|
||||||
|
|
||||||
// store
|
// store
|
||||||
import useNotifier from '@/utils/useNotifier'
|
import useNotifier from '@/utils/useNotifier'
|
||||||
@@ -23,10 +24,19 @@ import useNotifier from '@/utils/useNotifier'
|
|||||||
// API
|
// API
|
||||||
import chatflowsApi from '@/api/chatflows'
|
import chatflowsApi from '@/api/chatflows'
|
||||||
|
|
||||||
|
// If implementing a new provider, this must be updated in
|
||||||
|
// components/src/speechToText.ts as well
|
||||||
|
const SpeechToTextType = {
|
||||||
|
OPENAI_WHISPER: 'openAIWhisper',
|
||||||
|
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
|
||||||
|
LOCALAI_STT: 'localAISTT'
|
||||||
|
}
|
||||||
|
|
||||||
|
// Weird quirk - the key must match the name property value.
|
||||||
const speechToTextProviders = {
|
const speechToTextProviders = {
|
||||||
openAIWhisper: {
|
[SpeechToTextType.OPENAI_WHISPER]: {
|
||||||
label: 'OpenAI Whisper',
|
label: 'OpenAI Whisper',
|
||||||
name: 'openAIWhisper',
|
name: SpeechToTextType.OPENAI_WHISPER,
|
||||||
icon: openAISVG,
|
icon: openAISVG,
|
||||||
url: 'https://platform.openai.com/docs/guides/speech-to-text',
|
url: 'https://platform.openai.com/docs/guides/speech-to-text',
|
||||||
inputs: [
|
inputs: [
|
||||||
@@ -63,9 +73,9 @@ const speechToTextProviders = {
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
assemblyAiTranscribe: {
|
[SpeechToTextType.ASSEMBLYAI_TRANSCRIBE]: {
|
||||||
label: 'Assembly AI',
|
label: 'Assembly AI',
|
||||||
name: 'assemblyAiTranscribe',
|
name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
|
||||||
icon: assemblyAIPng,
|
icon: assemblyAIPng,
|
||||||
url: 'https://www.assemblyai.com/',
|
url: 'https://www.assemblyai.com/',
|
||||||
inputs: [
|
inputs: [
|
||||||
@@ -76,6 +86,59 @@ const speechToTextProviders = {
|
|||||||
credentialNames: ['assemblyAIApi']
|
credentialNames: ['assemblyAIApi']
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
[SpeechToTextType.LOCALAI_STT]: {
|
||||||
|
label: 'LocalAi STT',
|
||||||
|
name: SpeechToTextType.LOCALAI_STT,
|
||||||
|
icon: localAiPng,
|
||||||
|
url: 'https://localai.io/features/audio-to-text/',
|
||||||
|
inputs: [
|
||||||
|
{
|
||||||
|
label: 'Connect Credential',
|
||||||
|
name: 'credential',
|
||||||
|
type: 'credential',
|
||||||
|
credentialNames: ['localAIApi']
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Base URL',
|
||||||
|
name: 'baseUrl',
|
||||||
|
type: 'string',
|
||||||
|
description: 'The base URL of the local AI server'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Language',
|
||||||
|
name: 'language',
|
||||||
|
type: 'string',
|
||||||
|
description:
|
||||||
|
'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
|
||||||
|
placeholder: 'en',
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Model',
|
||||||
|
name: 'model',
|
||||||
|
type: 'string',
|
||||||
|
description: `The STT model to load. Defaults to whisper-1 if left blank.`,
|
||||||
|
placeholder: 'whisper-1',
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Prompt',
|
||||||
|
name: 'prompt',
|
||||||
|
type: 'string',
|
||||||
|
rows: 4,
|
||||||
|
description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Temperature',
|
||||||
|
name: 'temperature',
|
||||||
|
type: 'number',
|
||||||
|
step: 0.1,
|
||||||
|
description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
|
||||||
|
optional: true
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -191,8 +254,11 @@ const SpeechToText = ({ dialogProps }) => {
|
|||||||
<FormControl fullWidth>
|
<FormControl fullWidth>
|
||||||
<Select size='small' value={selectedProvider} onChange={handleProviderChange}>
|
<Select size='small' value={selectedProvider} onChange={handleProviderChange}>
|
||||||
<MenuItem value='none'>None</MenuItem>
|
<MenuItem value='none'>None</MenuItem>
|
||||||
<MenuItem value='openAIWhisper'>OpenAI Whisper</MenuItem>
|
{Object.values(speechToTextProviders).map((provider) => (
|
||||||
<MenuItem value='assemblyAiTranscribe'>Assembly AI</MenuItem>
|
<MenuItem key={provider.name} value={provider.name}>
|
||||||
|
{provider.label}
|
||||||
|
</MenuItem>
|
||||||
|
))}
|
||||||
</Select>
|
</Select>
|
||||||
</FormControl>
|
</FormControl>
|
||||||
</Box>
|
</Box>
|
||||||
|
|||||||
Reference in New Issue
Block a user