SpeechToText: Adding SpeechToText at the Chatflow level.

2026-06-28 19:00:59 +03:00 · 2024-01-30 21:48:08 -05:00
parent 1d122084b9
commit 4604594c55
10 changed files with 136 additions and 112 deletions
@@ -0,0 +1,23 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+
+class AssemblyAIApi implements INodeCredential { // Credential definition for AssemblyAI: metadata plus one password-type API-key input.
+    label: string // set to 'AssemblyAI API' in the constructor
+    name: string // set to 'assemblyAIApi' in the constructor
+    version: number
+    inputs: INodeParams[] // descriptors of the fields this credential collects
+
+    constructor() {
+        this.label = 'AssemblyAI API'
+        this.name = 'assemblyAIApi' // same string the AssemblyAI node added in this commit lists in its credentialNames
+        this.version = 1.0
+        this.inputs = [
+            {
+                label: 'AssemblyAI Api Key',
+                name: 'assemblyAIApiKey',
+                type: 'password' // presumably rendered as a masked field by the UI - TODO confirm
+            }
+        ]
+    }
+}
+
+module.exports = { credClass: AssemblyAIApi } // exported under the `credClass` key; presumably what the credential loader looks up - TODO confirm
@@ -162,36 +162,6 @@ class ChatOpenAI_ChatModels implements INode {
                default: false,
                optional: true
            },
-            {
-                label: 'Allow Speech to Text',
-                name: 'allowSpeechToText',
-                type: 'boolean',
-                default: false,
-                optional: true
-            },
-            // TODO: only show when speechToText is true
-            {
-                label: 'Speech to Text Method',
-                description: 'How to turn audio into text',
-                name: 'speechToTextMode',
-                type: 'options',
-                options: [
-                    {
-                        label: 'Transcriptions',
-                        name: 'transcriptions',
-                        description:
-                            'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
-                    },
-                    {
-                        label: 'Translations',
-                        name: 'translations',
-                        description: 'Translate and transcribe the audio into english.'
-                    }
-                ],
-                optional: false,
-                default: 'transcriptions',
-                additionalParams: true
-            },
            {
                label: 'Image Resolution',
                description: 'This parameter controls the resolution in which the model views the image.',
@@ -231,8 +201,6 @@ class ChatOpenAI_ChatModels implements INode {
        const baseOptions = nodeData.inputs?.baseOptions

        const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
-        const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
-        const speechToTextMode = nodeData.inputs?.speechToTextMode as string
        const imageResolution = nodeData.inputs?.imageResolution as string

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
@@ -270,9 +238,7 @@ class ChatOpenAI_ChatModels implements INode {

        const multiModal = {
            allowImageUploads: allowImageUploads ?? false,
-            allowSpeechToText: allowSpeechToText ?? false,
-            imageResolution,
-            speechToTextMode
+            imageResolution
        }
        model.multiModal = multiModal
        return model
@@ -7,8 +7,7 @@ import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models'
 import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema'
 import { Callbacks } from '@langchain/core/callbacks/manager'
 import { ICommonObject, INodeData } from '../../../src'
-import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils'
-import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts'
+import { addImagesToMessages } from '../../../src/MultiModalUtils'

 export class FlowiseChatOpenAI extends ChatOpenAI {
    multiModal: {}
@@ -38,24 +37,6 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
    private async injectMultiModalMessages(messages: BaseMessageLike[][]) {
        const nodeData = FlowiseChatOpenAI.chainNodeData
        const optionsData = FlowiseChatOpenAI.chainNodeOptions
-        let audioTrans = await checkSpeechToText(nodeData, optionsData)
-        if (audioTrans) {
-            if (messages.length > 0) {
-                const lastMessage = messages[0].pop() as HumanMessage
-                if (!nodeData.inputs?.prompt) {
-                    lastMessage.content = audioTrans
-                } else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) {
-                    lastMessage.content = audioTrans
-                } else if (nodeData.inputs?.prompt instanceof PromptTemplate) {
-                    let prompt = nodeData.inputs?.prompt as PromptTemplate
-                    let inputVar = prompt.inputVariables[0]
-                    let formattedValues: any = {}
-                    formattedValues[inputVar] = audioTrans
-                    lastMessage.content = await prompt.format(formattedValues)
-                }
-                messages[0].push(lastMessage)
-            }
-        }
        const messageContent = addImagesToMessages(nodeData, optionsData)
        if (messageContent) {
            if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
@@ -0,0 +1,33 @@
+import { INode, INodeParams } from '../../../src/Interface'
+
+class AssemblyAI_SpeechToText implements INode { // Node definition for AssemblyAI in the 'SpeechToText' category; it only declares metadata and a credential, no methods.
+    label: string
+    name: string
+    version: number
+    description: string // NOTE(review): declared but never assigned in the constructor below - confirm this is intended
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs?: INodeParams[]
+    credential: INodeParams
+
+    constructor() {
+        this.label = 'AssemblyAI'
+        this.name = 'assemblyAI'
+        this.version = 1.0
+        this.type = 'AssemblyAI'
+        this.icon = 'assemblyai.png'
+        this.category = 'SpeechToText'
+        this.baseClasses = [this.type] // evaluates to ['AssemblyAI']
+        this.inputs = [] // no configurable inputs; the credential below is the only setting
+        this.credential = {
+            label: 'Connect Credential',
+            name: 'credential',
+            type: 'credential',
+            credentialNames: ['assemblyAIApi'] // matches the `name` of the AssemblyAIApi credential class added in this commit
+        }
+    }
+}
+
+module.exports = { nodeClass: AssemblyAI_SpeechToText } // exported under the `nodeClass` key
@@ -18,49 +18,6 @@ export const injectChainNodeData = (nodeData: INodeData, options: ICommonObject)
    }
 }

-export const checkSpeechToText = async (nodeData: INodeData, options: ICommonObject) => { // Removed helper: sends a single recorded audio/webm upload to OpenAI and resolves to the returned text, or undefined when no branch applies.
-    const MODEL_NAME = 'whisper-1'
-    let input = undefined // never reassigned; this is the fall-through return value
-    let model = nodeData.inputs?.model as BaseChatModel
-    if (model instanceof ChatOpenAI && (model as any).multiModal) {
-        const multiModalConfig = (model as any).multiModal
-        if (options?.uploads) {
-            if (options.uploads.length === 1 && options.uploads[0].mime === 'audio/webm') { // handles exactly one upload of this MIME type; anything else is ignored
-                const upload = options.uploads[0]
-                // special case: text input is empty, but we have an upload (recorded audio)
-                if (multiModalConfig.allowSpeechToText) {
-                    const openAIClientOptions: ClientOptions = {
-                        apiKey: model.openAIApiKey, // reuses the chat model's own API key and organization
-                        organization: model.organization
-                    }
-                    const openAIClient = new OpenAIClient(openAIClientOptions)
-                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
-
-                    // the recorded audio is read from the path above; open it as a read stream to pass to the API call
-                    const audio_file = fs.createReadStream(filePath)
-
-                    if (multiModalConfig.speechToTextMode === 'transcriptions') {
-                        const transcription = await openAIClient.audio.transcriptions.create({
-                            file: audio_file,
-                            model: MODEL_NAME
-                        })
-                        return transcription.text
-                    } else if (multiModalConfig.speechToTextMode === 'translations') {
-                        const translation = await openAIClient.audio.translations.create({
-                            file: audio_file,
-                            model: MODEL_NAME
-                        })
-                        return translation.text
-                    } // any other speechToTextMode value falls through to `return input` (undefined)
-                } else {
-                    throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
-                }
-            }
-        }
-    }
-    return input
-}
-
 export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
    const imageContent: MessageContent = []
    let model = nodeData.inputs?.model as BaseChatModel