SpeechToText: Adding SpeechToText at the Chatflow level.

2026-06-28 19:00:59 +03:00 · 2024-01-30 21:48:08 -05:00
parent 1d122084b9
commit 4604594c55
10 changed files with 136 additions and 112 deletions
@@ -18,49 +18,6 @@ export const injectChainNodeData = (nodeData: INodeData, options: ICommonObject)
    }
 }

-export const checkSpeechToText = async (nodeData: INodeData, options: ICommonObject) => { // Speech-to-text for a single recorded-audio upload: resolves to the resulting text, or to undefined when no branch below applies; throws if such an upload is present but speech-to-text is not enabled.
-    const MODEL_NAME = 'whisper-1' // model id passed to both audio endpoints below
-    let input = undefined // never reassigned in this function; it is the fall-through return value
-    let model = nodeData.inputs?.model as BaseChatModel
-    if (model instanceof ChatOpenAI && (model as any).multiModal) { // only ChatOpenAI models with a truthy multiModal config are handled
-        const multiModalConfig = (model as any).multiModal
-        if (options?.uploads) {
-            if (options.uploads.length === 1 && options.uploads[0].mime === 'audio/webm') {
-                const upload = options.uploads[0]
-                // special case: exactly one audio/webm upload (recorded audio). Presumably the text input is empty here - that is not checked in this function.
-                if (multiModalConfig.allowSpeechToText) {
-                    const openAIClientOptions: ClientOptions = {
-                        apiKey: model.openAIApiKey,
-                        organization: model.organization
-                    }
-                    const openAIClient = new OpenAIClient(openAIClientOptions)
-                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name) // NOTE(review): upload.data and upload.name are joined into the path unvalidated here - confirm they are sanitized upstream
-
-                    // the recorded audio is read from disk: open it as a read stream to hand to the audio API (no base64 conversion happens here)
-                    const audio_file = fs.createReadStream(filePath) // NOTE(review): opened before the mode check; if neither mode below matches, the stream is not consumed in this function
-
-                    if (multiModalConfig.speechToTextMode === 'transcriptions') {
-                        const transcription = await openAIClient.audio.transcriptions.create({
-                            file: audio_file,
-                            model: MODEL_NAME
-                        })
-                        return transcription.text
-                    } else if (multiModalConfig.speechToTextMode === 'translations') {
-                        const translation = await openAIClient.audio.translations.create({
-                            file: audio_file,
-                            model: MODEL_NAME
-                        })
-                        return translation.text
-                    } // any other speechToTextMode falls through to the final return below
-                } else {
-                    throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
-                }
-            }
-        }
-    }
-    return input
-}
-
 export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
    const imageContent: MessageContent = []
    let model = nodeData.inputs?.model as BaseChatModel