SpeechToText: Adding SpeechToText at the Chatflow level.

2026-06-28 23:01:09 +03:00 · 2024-01-30 21:48:08 -05:00
parent 1d122084b9
commit 4604594c55
10 changed files with 136 additions and 112 deletions
@@ -0,0 +1,23 @@
 import { INodeParams, INodeCredential } from '../src/Interface'
 /**
  * Credential definition for AssemblyAI.
  *
  * Exposes a single password-type input that holds the API key. The `name`
  * value is the identifier other definitions use in `credentialNames`.
  */
 class AssemblyAIApi implements INodeCredential {
    /** Human-readable label of the credential. */
    label = 'AssemblyAI API'

    /** Identifier of the credential. */
    name = 'assemblyAIApi'

    /** Version of this credential definition. */
    version = 1.0

    /** Fields collected for this credential: just the API key. */
    inputs: INodeParams[] = [
        {
            label: 'AssemblyAI Api Key',
            name: 'assemblyAIApiKey',
            type: 'password'
        }
    ]
 }
 module.exports = { credClass: AssemblyAIApi }
@@ -162,36 +162,6 @@ class ChatOpenAI_ChatModels implements INode {
                default: false,
                optional: true
            },
            {
                label: 'Allow Speech to Text',
                name: 'allowSpeechToText',
                type: 'boolean',
                default: false,
                optional: true
            },
            // TODO: only show when speechToText is true
            {
                label: 'Speech to Text Method',
                description: 'How to turn audio into text',
                name: 'speechToTextMode',
                type: 'options',
                options: [
                    {
                        label: 'Transcriptions',
                        name: 'transcriptions',
                        description:
                            'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
                    },
                    {
                        label: 'Translations',
                        name: 'translations',
                        description: 'Translate and transcribe the audio into english.'
                    }
                ],
                optional: false,
                default: 'transcriptions',
                additionalParams: true
            },
            {
                label: 'Image Resolution',
                description: 'This parameter controls the resolution in which the model views the image.',
@@ -231,8 +201,6 @@ class ChatOpenAI_ChatModels implements INode {
        const baseOptions = nodeData.inputs?.baseOptions
        const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
        const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
        const speechToTextMode = nodeData.inputs?.speechToTextMode as string
        const imageResolution = nodeData.inputs?.imageResolution as string
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
@@ -270,9 +238,7 @@ class ChatOpenAI_ChatModels implements INode {
        const multiModal = {
            allowImageUploads: allowImageUploads ?? false,
-            allowSpeechToText: allowSpeechToText ?? false,
+            imageResolution
            imageResolution,
            speechToTextMode
        }
        model.multiModal = multiModal
        return model
@@ -7,8 +7,7 @@ import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models'
 import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema'
 import { Callbacks } from '@langchain/core/callbacks/manager'
 import { ICommonObject, INodeData } from '../../../src'
-import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils'
+import { addImagesToMessages } from '../../../src/MultiModalUtils'
 import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts'
 export class FlowiseChatOpenAI extends ChatOpenAI {
    multiModal: {}
@@ -38,24 +37,6 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
    private async injectMultiModalMessages(messages: BaseMessageLike[][]) {
        const nodeData = FlowiseChatOpenAI.chainNodeData
        const optionsData = FlowiseChatOpenAI.chainNodeOptions
        let audioTrans = await checkSpeechToText(nodeData, optionsData)
        if (audioTrans) {
            if (messages.length > 0) {
                const lastMessage = messages[0].pop() as HumanMessage
                if (!nodeData.inputs?.prompt) {
                    lastMessage.content = audioTrans
                } else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) {
                    lastMessage.content = audioTrans
                } else if (nodeData.inputs?.prompt instanceof PromptTemplate) {
                    let prompt = nodeData.inputs?.prompt as PromptTemplate
                    let inputVar = prompt.inputVariables[0]
                    let formattedValues: any = {}
                    formattedValues[inputVar] = audioTrans
                    lastMessage.content = await prompt.format(formattedValues)
                }
                messages[0].push(lastMessage)
            }
        }
        const messageContent = addImagesToMessages(nodeData, optionsData)
        if (messageContent) {
            if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
@@ -0,0 +1,33 @@
 import { INode, INodeParams } from '../../../src/Interface'
 /**
  * Node definition for the AssemblyAI speech-to-text provider.
  *
  * It has no inputs of its own; the only thing it declares is a credential
  * slot that accepts the `assemblyAIApi` credential.
  */
 class AssemblyAI_SpeechToText implements INode {
    label = 'AssemblyAI'
    name = 'assemblyAI'
    version = 1.0
    // Declared but deliberately left unassigned, exactly as in the constructor-based form.
    description: string
    type = 'AssemblyAI'
    icon = 'assemblyai.png'
    category = 'SpeechToText'
    // The node's only base class is its own type.
    baseClasses: string[] = [this.type]
    inputs?: INodeParams[] = []
    credential: INodeParams = {
        label: 'Connect Credential',
        name: 'credential',
        type: 'credential',
        credentialNames: ['assemblyAIApi']
    }
 }
 module.exports = { nodeClass: AssemblyAI_SpeechToText }
@@ -18,49 +18,6 @@ export const injectChainNodeData = (nodeData: INodeData, options: ICommonObject)
    }
 }
 /**
  * Turns a single recorded-audio upload into text using the `whisper-1` model.
  *
  * The function only acts when `nodeData.inputs.model` is a `ChatOpenAI` instance that
  * carries a `multiModal` config and `options.uploads` contains exactly one upload whose
  * mime type is `audio/webm`. In every other case it resolves to `undefined`.
  *
  * @param nodeData node data whose `inputs.model` is inspected
  * @param options request options; `uploads` is read from here
  * @returns the recognised text, or `undefined` when the preconditions are not met or
  *          `speechToTextMode` is neither `transcriptions` nor `translations`
  * @throws Error when a matching audio upload exists but `allowSpeechToText` is not enabled
  */
 export const checkSpeechToText = async (nodeData: INodeData, options: ICommonObject) => {
    const MODEL_NAME = 'whisper-1'

    const chatModel = nodeData.inputs?.model as BaseChatModel
    if (!(chatModel instanceof ChatOpenAI) || !(chatModel as any).multiModal) {
        return undefined
    }
    const config = (chatModel as any).multiModal

    const uploads = options?.uploads
    if (!uploads) {
        return undefined
    }
    if (uploads.length !== 1 || uploads[0].mime !== 'audio/webm') {
        return undefined
    }

    // Special case: the text input is empty, but there is an upload (recorded audio).
    const recording = uploads[0]
    if (!config.allowSpeechToText) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    const clientOptions: ClientOptions = {
        apiKey: chatModel.openAIApiKey,
        organization: chatModel.organization
    }
    const client = new OpenAIClient(clientOptions)

    // The recording is stored on the server; open it as a read stream for the API call.
    const audioPath = path.join(getUserHome(), '.flowise', 'gptvision', recording.data, recording.name)
    const audioStream = fs.createReadStream(audioPath)

    switch (config.speechToTextMode) {
        case 'transcriptions': {
            const transcription = await client.audio.transcriptions.create({
                file: audioStream,
                model: MODEL_NAME
            })
            return transcription.text
        }
        case 'translations': {
            const translation = await client.audio.translations.create({
                file: audioStream,
                model: MODEL_NAME
            })
            return translation.text
        }
        default:
            return undefined
    }
 }
 export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
    const imageContent: MessageContent = []
    let model = nodeData.inputs?.model as BaseChatModel
@@ -54,7 +54,7 @@ export class NodesPool {
                            }
                        }
-                        const skipCategories = ['Analytic']
+                        const skipCategories = ['Analytic', 'SpeechToText']
                        if (!skipCategories.includes(newNodeInstance.category)) {
                            this.componentNodes[newNodeInstance.name] = newNodeInstance
                        }
@@ -46,7 +46,8 @@ import {
    getSessionChatHistory,
    getAllConnectedNodes,
    clearSessionMemory,
-    findMemoryNode
+    findMemoryNode,
    convertedSpeechToText
 } from './utils'
 import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
 import { getDataSource } from './DataSource'
@@ -58,7 +59,7 @@ import { Tool } from './database/entities/Tool'
 import { Assistant } from './database/entities/Assistant'
 import { ChatflowPool } from './ChatflowPool'
 import { CachePool } from './CachePool'
-import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters } from 'flowise-components'
+import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
 import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
 import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
 import { sanitizeMiddleware } from './utils/XSS'
@@ -473,6 +474,17 @@ export class App {
                const flowObj = JSON.parse(chatflow.flowData)
                const allowances: IUploadFileSizeAndTypes[] = []
                let allowSpeechToText = false
                if (chatflow.speechToText) {
                    const speechToTextProviders = JSON.parse(chatflow.speechToText)
                    for (const provider in speechToTextProviders) {
                        const providerObj = speechToTextProviders[provider]
                        if (providerObj.status) {
                            allowSpeechToText = true
                            break
                        }
                    }
                }
                let allowImageUploads = false
                flowObj.nodes.forEach((node: IReactFlowNode) => {
                    if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
@@ -488,9 +500,6 @@ export class App {
                                })
                                allowImageUploads = true
                            }
                            if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) {
                                allowSpeechToText = true
                            }
                        })
                    }
                })
@@ -1602,7 +1611,8 @@ export class App {
            if (incomingInput.uploads) {
                // @ts-ignore
-                ;(incomingInput.uploads as any[]).forEach((upload: any) => {
+                const uploads = incomingInput.uploads as IFileUpload[]
                for (const upload of uploads) {
                    if (upload.type === 'file' || upload.type === 'audio') {
                        const filename = upload.name
                        const dir = path.join(getUserHome(), '.flowise', 'gptvision', chatId)
@@ -1618,7 +1628,29 @@ export class App {
                        upload.data = chatId
                        upload.type = 'stored-file'
                    }
-                })
+
                    if (upload.mime === 'audio/webm' && incomingInput.uploads?.length === 1) {
                        //speechToText
                        let speechToTextConfig: any = {}
                        if (chatflow.speechToText) {
                            const speechToTextProviders = JSON.parse(chatflow.speechToText)
                            for (const provider in speechToTextProviders) {
                                const providerObj = speechToTextProviders[provider]
                                if (providerObj.status) {
                                    speechToTextConfig = providerObj
                                    speechToTextConfig['name'] = provider
                                    break
                                }
                            }
                        }
                        if (speechToTextConfig) {
                            const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
                            if (speechToTextResult) {
                                incomingInput.question = speechToTextResult
                            }
                        }
                    }
                }
            }
            let isStreamValid = false
@@ -593,7 +593,6 @@ export const resolveVariables = (
    }
    const paramsObj = flowNodeData[types] ?? {}
    getParamValues(paramsObj)
    return flowNodeData
@@ -1079,3 +1078,36 @@ export const getAllValuesFromJson = (obj: any): any[] => {
    extractValues(obj)
    return values
 }
 /**
  * Converts a recorded audio upload into text using the enabled speech-to-text provider.
  *
  * NOTE: the provider call itself is not implemented yet (see the commented-out reference
  * code below), so a valid configuration currently resolves to `undefined`.
  *
  * @param upload reference to the stored audio upload; currently unused
  * @param speechToTextConfig settings of the enabled provider; a missing or empty object
  *        means that no provider is enabled
  * @returns the recognised text, or `undefined` while transcription is unimplemented
  * @throws Error when no provider configuration is supplied (missing or empty)
  */
 export const convertedSpeechToText = async (upload: any, speechToTextConfig: any): Promise<string | undefined> => {
    // An empty object is truthy, so a plain truthiness test lets `{}` through. Callers that
    // start from `{}` and find no enabled provider must reach the "not selected" error.
    const hasProvider = Boolean(speechToTextConfig) && Object.keys(speechToTextConfig).length > 0
    if (!hasProvider) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    // TODO: call the configured provider. Reference implementation kept from the previous
    // model-level speech-to-text helper:
    //
    // const MODEL_NAME = 'whisper-1'
    // //special case, text input is empty, but we have an upload (recorded audio)
    // const openAIClientOptions: ClientOptions = {
    //     apiKey: model.openAIApiKey,
    //     organization: model.organization
    // }
    // const openAIClient = new OpenAIClient(openAIClientOptions)
    // const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
    //
    // // the recording is stored on the server; open it as a read stream
    // const audio_file = fs.createReadStream(filePath)
    //
    // if (multiModalConfig.speechToTextMode === 'transcriptions') {
    //     const transcription = await openAIClient.audio.transcriptions.create({
    //         file: audio_file,
    //         model: MODEL_NAME
    //     })
    //     return transcription.text
    // } else if (multiModalConfig.speechToTextMode === 'translations') {
    //     const translation = await openAIClient.audio.translations.create({
    //         file: audio_file,
    //         model: MODEL_NAME
    //     })
    //     return translation.text
    // }
    return undefined
 }
@@ -41,8 +41,8 @@ import chatflowsApi from 'api/chatflows'
 const speechToTextProviders = [
    {
-        label: 'OpenAI Wisper',
+        label: 'OpenAI Whisper',
-        name: 'openAIWisper',
+        name: 'openAIWhisper',
        icon: openAISVG,
        url: 'https://platform.openai.com/docs/guides/speech-to-text',
        inputs: [
@@ -70,7 +70,7 @@ const speechToTextProviders = [
                label: 'Connect Credential',
                name: 'credential',
                type: 'credential',
-                credentialNames: ['assemblyAiApi']
+                credentialNames: ['assemblyAIApi']
            },
            {
                label: 'On/Off',
@@ -101,7 +101,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => {
            })
            if (saveResp.data) {
                enqueueSnackbar({
-                    message: 'Analytic Configuration Saved',
+                    message: 'Speech To Text Configuration Saved',
                    options: {
                        key: new Date().getTime() + Math.random(),
                        variant: 'success',
@@ -118,7 +118,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => {
        } catch (error) {
            const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}`
            enqueueSnackbar({
-                message: `Failed to save Analytic Configuration: ${errorData}`,
+                message: `Failed to save Speech To Text Configuration: ${errorData}`,
                options: {
                    key: new Date().getTime() + Math.random(),
                    variant: 'error',