SpeechToText: Adding SpeechToText at the Chatflow level.

2026-06-28 19:00:59 +03:00 · 2024-01-30 21:48:08 -05:00
parent 1d122084b9
commit 4604594c55
10 changed files with 136 additions and 112 deletions
@@ -54,7 +54,7 @@ export class NodesPool {
                            }
                        }

-                        const skipCategories = ['Analytic']
+                        const skipCategories = ['Analytic', 'SpeechToText']
                        if (!skipCategories.includes(newNodeInstance.category)) {
                            this.componentNodes[newNodeInstance.name] = newNodeInstance
                        }
@@ -46,7 +46,8 @@ import {
    getSessionChatHistory,
    getAllConnectedNodes,
    clearSessionMemory,
-    findMemoryNode
+    findMemoryNode,
+    convertedSpeechToText
 } from './utils'
 import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
 import { getDataSource } from './DataSource'
@@ -58,7 +59,7 @@ import { Tool } from './database/entities/Tool'
 import { Assistant } from './database/entities/Assistant'
 import { ChatflowPool } from './ChatflowPool'
 import { CachePool } from './CachePool'
-import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters } from 'flowise-components'
+import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
 import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
 import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
 import { sanitizeMiddleware } from './utils/XSS'
@@ -473,6 +474,17 @@ export class App {
                const flowObj = JSON.parse(chatflow.flowData)
                const allowances: IUploadFileSizeAndTypes[] = []
                let allowSpeechToText = false
+                if (chatflow.speechToText) {
+                    const speechToTextProviders = JSON.parse(chatflow.speechToText)
+                    for (const provider in speechToTextProviders) {
+                        const providerObj = speechToTextProviders[provider]
+                        if (providerObj.status) {
+                            allowSpeechToText = true
+                            break
+                        }
+                    }
+                }
+
                let allowImageUploads = false
                flowObj.nodes.forEach((node: IReactFlowNode) => {
                    if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
@@ -488,9 +500,6 @@ export class App {
                                })
                                allowImageUploads = true
                            }
-                            if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) {
-                                allowSpeechToText = true
-                            }
                        })
                    }
                })
@@ -1602,7 +1611,8 @@ export class App {

            if (incomingInput.uploads) {
                // @ts-ignore
-                ;(incomingInput.uploads as any[]).forEach((upload: any) => {
+                const uploads = incomingInput.uploads as IFileUpload[]
+                for (const upload of uploads) {
                    if (upload.type === 'file' || upload.type === 'audio') {
                        const filename = upload.name
                        const dir = path.join(getUserHome(), '.flowise', 'gptvision', chatId)
@@ -1618,7 +1628,29 @@ export class App {
                        upload.data = chatId
                        upload.type = 'stored-file'
                    }
-                })
+
+                    if (upload.mime === 'audio/webm' && incomingInput.uploads?.length === 1) {
+                        //speechToText
+                        let speechToTextConfig: any = {}
+                        if (chatflow.speechToText) {
+                            const speechToTextProviders = JSON.parse(chatflow.speechToText)
+                            for (const provider in speechToTextProviders) {
+                                const providerObj = speechToTextProviders[provider]
+                                if (providerObj.status) {
+                                    speechToTextConfig = providerObj
+                                    speechToTextConfig['name'] = provider
+                                    break
+                                }
+                            }
+                        }
+                        if (speechToTextConfig) {
+                            const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
+                            if (speechToTextResult) {
+                                incomingInput.question = speechToTextResult
+                            }
+                        }
+                    }
+                }
            }

            let isStreamValid = false
@@ -593,7 +593,6 @@ export const resolveVariables = (
    }

    const paramsObj = flowNodeData[types] ?? {}
-
    getParamValues(paramsObj)

    return flowNodeData
@@ -1079,3 +1078,36 @@ export const getAllValuesFromJson = (obj: any): any[] => {
    extractValues(obj)
    return values
 }
+
+export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => { // Stub speech-to-text hook: performs no transcription yet.
+    // const MODEL_NAME = 'whisper-1'
+    if (speechToTextConfig) { // NOTE(review): `{}` is truthy, so a caller that defaults the config to an empty object always takes this branch — confirm intended.
+        // Special case: the text input is empty, but we have an upload (recorded audio). The draft implementation below is commented out.
+        // const openAIClientOptions: ClientOptions = {
+        //     apiKey: model.openAIApiKey,
+        //     organization: model.organization
+        // }
+        // const openAIClient = new OpenAIClient(openAIClientOptions)
+        // const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+        // NOTE(review): the draft reads upload.data and upload.name, and references `model` / `multiModalConfig`, which are not parameters here — confirm before re-enabling.
+        // // as the image is stored in the server, read the file and convert it to base64
+        // const audio_file = fs.createReadStream(filePath)
+        //
+        // if (multiModalConfig.speechToTextMode === 'transcriptions') {
+        //     const transcription = await openAIClient.audio.transcriptions.create({
+        //         file: audio_file,
+        //         model: MODEL_NAME
+        //     })
+        //     return transcription.text
+        // } else if (multiModalConfig.speechToTextMode === 'translations') {
+        //     const translation = await openAIClient.audio.translations.create({
+        //         file: audio_file,
+        //         model: MODEL_NAME
+        //     })
+        //     return translation.text
+        // }
+    } else { // Reached only when speechToTextConfig is falsy (e.g. undefined or null).
+        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
+    }
+    return undefined // With a truthy config the promise currently always resolves to undefined; `upload` is unused.
+}