SpeechToText: Adding SpeechToText at the Chatflow level.

2026-06-28 23:01:09 +03:00 · 2024-01-31 07:48:38 -05:00
parent 4604594c55
commit e81927ee13
7 changed files with 67 additions and 39 deletions
@@ -46,8 +46,7 @@ import {
    getSessionChatHistory,
    getAllConnectedNodes,
    clearSessionMemory,
-    findMemoryNode,
-    convertedSpeechToText
+    findMemoryNode
 } from './utils'
 import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
 import { getDataSource } from './DataSource'
@@ -59,7 +58,15 @@ import { Tool } from './database/entities/Tool'
 import { Assistant } from './database/entities/Assistant'
 import { ChatflowPool } from './ChatflowPool'
 import { CachePool } from './CachePool'
-import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
+import {
+    ICommonObject,
+    IMessage,
+    INodeOptionsValue,
+    INodeParams,
+    handleEscapeCharacters,
+    convertSpeechToText,
+    IFileUpload
+} from 'flowise-components'
 import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
 import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
 import { sanitizeMiddleware } from './utils/XSS'
@@ -1644,7 +1651,11 @@ export class App {
                            }
                        }
                        if (speechToTextConfig) {
-                            const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
+                            const options: ICommonObject = {
+                                appDataSource: this.AppDataSource,
+                                databaseEntities: databaseEntities
+                            }
+                            const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
                            if (speechToTextResult) {
                                incomingInput.question = speechToTextResult
                            }
@@ -1078,36 +1078,3 @@ export const getAllValuesFromJson = (obj: any): any[] => {
    extractValues(obj)
    return values
 }
-
-export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => {
-    // const MODEL_NAME = 'whisper-1'
-    if (speechToTextConfig) {
-        //special case, text input is empty, but we have an upload (recorded audio)
-        // const openAIClientOptions: ClientOptions = {
-        //     apiKey: model.openAIApiKey,
-        //     organization: model.organization
-        // }
-        // const openAIClient = new OpenAIClient(openAIClientOptions)
-        // const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
-        //
-        // // as the image is stored in the server, read the file and convert it to base64
-        // const audio_file = fs.createReadStream(filePath)
-        //
-        // if (multiModalConfig.speechToTextMode === 'transcriptions') {
-        //     const transcription = await openAIClient.audio.transcriptions.create({
-        //         file: audio_file,
-        //         model: MODEL_NAME
-        //     })
-        //     return transcription.text
-        // } else if (multiModalConfig.speechToTextMode === 'translations') {
-        //     const translation = await openAIClient.audio.translations.create({
-        //         file: audio_file,
-        //         model: MODEL_NAME
-        //     })
-        //     return translation.text
-        // }
-    } else {
-        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
-    }
-    return undefined
-}