SpeechToText: Adding SpeechToText at the Chatflow level.

2026-06-28 15:00:57 +03:00 · 2024-01-31 07:48:38 -05:00
parent 4604594c55
commit e81927ee13
7 changed files with 67 additions and 39 deletions
@@ -38,7 +38,7 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
        const nodeData = FlowiseChatOpenAI.chainNodeData
        const optionsData = FlowiseChatOpenAI.chainNodeOptions
        const messageContent = addImagesToMessages(nodeData, optionsData)
-        if (messageContent) {
+        if (messageContent?.length) {
            if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
                const lastMessage = messages[0].pop()
                if (lastMessage instanceof HumanMessage) {
@@ -40,6 +40,7 @@
        "@upstash/redis": "^1.22.1",
        "@zilliz/milvus2-sdk-node": "^2.2.24",
        "apify-client": "^2.7.1",
        "assemblyai": "^4.2.2",
        "axios": "1.6.2",
        "cheerio": "^1.0.0-rc.12",
        "chromadb": "^1.5.11",
@@ -1,6 +1,5 @@
 import { ICommonObject, INodeData } from './Interface'
 import { BaseChatModel } from 'langchain/chat_models/base'
 import { type ClientOptions, OpenAIClient } from '@langchain/openai'
 import { ChatOpenAI } from 'langchain/chat_models/openai'
 import path from 'path'
 import { getUserHome } from './utils'
@@ -6,3 +6,4 @@ dotenv.config({ path: envPath, override: true })
 export * from './Interface'
 export * from './utils'
 export * from './speechToText'
@@ -0,0 +1,49 @@
 import { ICommonObject } from './Interface'
 import { getCredentialData, getUserHome } from './utils'
 import { type ClientOptions, OpenAIClient } from '@langchain/openai'
 import fs from 'fs'
 import path from 'path'
 import { AssemblyAI } from 'assemblyai'
 /**
  * Transcribes a recorded audio upload with the speech-to-text provider configured on the chatflow.
  *
  * The audio file is read from `<user home>/.flowise/gptvision/<upload.data>/<upload.name>` and
  * streamed to the provider selected by `speechToTextConfig.name`
  * (`openAIWhisper` or `assemblyAiTranscribe`).
  *
  * @param upload - Upload descriptor; `data` and `name` are used as the sub-directory and file name
  * of the stored audio file.
  * @param speechToTextConfig - Provider configuration; `name` selects the provider and
  * `credentialId` is resolved through `getCredentialData`.
  * @param options - Passed through unchanged to `getCredentialData`.
  * @returns The transcribed text, or `undefined` when the provider name is not recognised or the
  * provider returned no text.
  * @throws Error when `speechToTextConfig` is falsy, or when the upload resolves to a path outside
  * the upload storage directory.
  */
 export const convertSpeechToText = async (
    upload: any,
    speechToTextConfig: any,
    options: ICommonObject
 ): Promise<string | undefined> => {
    if (!speechToTextConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    // `upload.data` and `upload.name` originate from the request, so make sure the joined path
    // cannot escape the upload storage directory (e.g. via `..` segments).
    const storageRoot = path.join(getUserHome(), '.flowise', 'gptvision')
    const filePath = path.join(storageRoot, upload.data, upload.name)
    const relativePath = path.relative(storageRoot, filePath)
    if (relativePath === '..' || relativePath.startsWith(`..${path.sep}`) || path.isAbsolute(relativePath)) {
        throw new Error('Invalid audio upload path.')
    }

    const credentialId = speechToTextConfig.credentialId as string
    const credentialData = await getCredentialData(credentialId ?? '', options)

    // The read stream is only opened inside a provider branch: opening it up front would leave an
    // unconsumed (never closed) stream behind whenever the provider name is not recognised.
    if (speechToTextConfig.name === 'openAIWhisper') {
        const openAIClientOptions: ClientOptions = {
            apiKey: credentialData.openAIApiKey
        }
        const openAIClient = new OpenAIClient(openAIClientOptions)
        const audioStream = fs.createReadStream(filePath)
        try {
            const transcription = await openAIClient.audio.transcriptions.create({
                file: audioStream,
                model: 'whisper-1'
            })
            const text = transcription?.text
            if (text) {
                return text
            }
        } finally {
            // No-op once the stream has been fully read; releases the file if the request failed.
            audioStream.destroy()
        }
    } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
        const client = new AssemblyAI({
            apiKey: credentialData.assemblyAIApiKey
        })
        const audioStream = fs.createReadStream(filePath)
        try {
            const transcription = await client.transcripts.transcribe({
                audio: audioStream,
                speaker_labels: false
            })
            const text = transcription?.text
            if (text) {
                return text
            }
        } finally {
            // No-op once the stream has been fully read; releases the file if the request failed.
            audioStream.destroy()
        }
    }

    return undefined
 }
@@ -46,8 +46,7 @@ import {
    getSessionChatHistory,
    getAllConnectedNodes,
    clearSessionMemory,
-    findMemoryNode,
+    findMemoryNode
    convertedSpeechToText
 } from './utils'
 import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
 import { getDataSource } from './DataSource'
@@ -59,7 +58,15 @@ import { Tool } from './database/entities/Tool'
 import { Assistant } from './database/entities/Assistant'
 import { ChatflowPool } from './ChatflowPool'
 import { CachePool } from './CachePool'
-import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
+import {
    ICommonObject,
    IMessage,
    INodeOptionsValue,
    INodeParams,
    handleEscapeCharacters,
    convertSpeechToText,
    IFileUpload
 } from 'flowise-components'
 import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
 import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
 import { sanitizeMiddleware } from './utils/XSS'
@@ -1644,7 +1651,11 @@ export class App {
                            }
                        }
                        if (speechToTextConfig) {
-                            const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
+                            const options: ICommonObject = {
                                appDataSource: this.AppDataSource,
                                databaseEntities: databaseEntities
                            }
                            const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
                            if (speechToTextResult) {
                                incomingInput.question = speechToTextResult
                            }
@@ -1078,36 +1078,3 @@ export const getAllValuesFromJson = (obj: any): any[] => {
    extractValues(obj)
    return values
 }
 /**
  * Placeholder speech-to-text hook.
  *
  * No transcription is performed here: when a speech-to-text configuration is present the
  * function simply resolves to `undefined`. Its only active behaviour is rejecting the case
  * where recorded audio arrives without any speech-to-text configuration.
  *
  * @param upload - The recorded audio upload (currently unused).
  * @param speechToTextConfig - The chatflow's speech-to-text configuration, if any.
  * @returns Always `undefined` when a configuration is supplied.
  * @throws Error when `speechToTextConfig` is falsy.
  */
 export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => {
    const hasConfig = Boolean(speechToTextConfig)
    if (!hasConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }
    // Special case handled by the caller: text input is empty, but a recorded audio upload exists.
    return undefined
 }