From e81927ee132985a51aafb96dd67f174b371a7b08 Mon Sep 17 00:00:00 2001
From: vinodkiran <vinodkiran@usa.net>
Date: Wed, 31 Jan 2024 07:48:38 -0500
Subject: [PATCH] SpeechToText: Adding SpeechToText at the Chatflow level.

---
 .../ChatOpenAI/FlowiseChatOpenAI.ts           |  2 +-
 packages/components/package.json              |  1 +
 packages/components/src/MultiModalUtils.ts    |  1 -
 packages/components/src/index.ts              |  1 +
 packages/components/src/speechToText.ts       | 49 +++++++++++++++++++
 packages/server/src/index.ts                  | 19 +++++--
 packages/server/src/utils/index.ts            | 33 -------------
 7 files changed, 67 insertions(+), 39 deletions(-)
 create mode 100644 packages/components/src/speechToText.ts

diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts
index 1bf4a286..b25ec0c3 100644
--- a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts
+++ b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts
@@ -38,7 +38,7 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
         const nodeData = FlowiseChatOpenAI.chainNodeData
         const optionsData = FlowiseChatOpenAI.chainNodeOptions
         const messageContent = addImagesToMessages(nodeData, optionsData)
-        if (messageContent) {
+        if (messageContent?.length) {
             if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
                 const lastMessage = messages[0].pop()
                 if (lastMessage instanceof HumanMessage) {
diff --git a/packages/components/package.json b/packages/components/package.json
index c90ea5cc..953a6c4c 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -40,6 +40,7 @@
         "@upstash/redis": "^1.22.1",
         "@zilliz/milvus2-sdk-node": "^2.2.24",
         "apify-client": "^2.7.1",
+        "assemblyai": "^4.2.2",
         "axios": "1.6.2",
         "cheerio": "^1.0.0-rc.12",
         "chromadb": "^1.5.11",
diff --git a/packages/components/src/MultiModalUtils.ts b/packages/components/src/MultiModalUtils.ts
index 62e3513c..337cc105 100644
--- a/packages/components/src/MultiModalUtils.ts
+++ b/packages/components/src/MultiModalUtils.ts
@@ -1,6 +1,5 @@
 import { ICommonObject, INodeData } from './Interface'
 import { BaseChatModel } from 'langchain/chat_models/base'
-import { type ClientOptions, OpenAIClient } from '@langchain/openai'
 import { ChatOpenAI } from 'langchain/chat_models/openai'
 import path from 'path'
 import { getUserHome } from './utils'
diff --git a/packages/components/src/index.ts b/packages/components/src/index.ts
index ae2e380e..10cd1036 100644
--- a/packages/components/src/index.ts
+++ b/packages/components/src/index.ts
@@ -6,3 +6,4 @@ dotenv.config({ path: envPath, override: true })
 
 export * from './Interface'
 export * from './utils'
+export * from './speechToText'
diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
new file mode 100644
index 00000000..cc40cf21
--- /dev/null
+++ b/packages/components/src/speechToText.ts
@@ -0,0 +1,49 @@
+import { ICommonObject } from './Interface'
+import { getCredentialData, getUserHome } from './utils'
+import { type ClientOptions, OpenAIClient } from '@langchain/openai'
+import fs from 'fs'
+import path from 'path'
+import { AssemblyAI } from 'assemblyai'
+
+/**
+ * Transcribes a recorded audio upload with the chatflow's configured speech-to-text provider.
+ *
+ * @param upload - stored upload; `upload.data` and `upload.name` locate the file under `.flowise/gptvision` in `getUserHome()`
+ * @param speechToTextConfig - selected provider (`name`) together with its `credentialId`
+ * @param options - passed through to `getCredentialData` to resolve the credential
+ * @returns the transcript text, or `undefined` when the provider is unrecognized or returned no text
+ * @throws Error when no speech-to-text configuration is supplied
+ */
+export const convertSpeechToText = async (upload: any, speechToTextConfig: any, options: ICommonObject) => {
+    if (!speechToTextConfig) {
+        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
+    }
+    const credentialId = speechToTextConfig.credentialId as string
+    const credentialData = await getCredentialData(credentialId ?? '', options)
+    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+    // Open the stream only inside a provider branch: a stream that is never consumed keeps a file
+    // handle open, and its unhandled 'error' event would crash the process if the file is missing.
+    const openAudio = () => fs.createReadStream(filePath)
+
+    if (speechToTextConfig.name === 'openAIWhisper') {
+        const openAIClientOptions: ClientOptions = { apiKey: credentialData.openAIApiKey }
+        const openAIClient = new OpenAIClient(openAIClientOptions)
+        const transcription = await openAIClient.audio.transcriptions.create({
+            file: openAudio(),
+            model: 'whisper-1'
+        })
+        if (transcription?.text) {
+            return transcription.text
+        }
+    } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
+        const client = new AssemblyAI({ apiKey: credentialData.assemblyAIApiKey })
+        const transcription = await client.transcripts.transcribe({
+            audio: openAudio(),
+            speaker_labels: false
+        })
+        if (transcription?.text) {
+            return transcription.text
+        }
+    }
+    return undefined
+}
diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts
index 7558c689..17689bcb 100644
--- a/packages/server/src/index.ts
+++ b/packages/server/src/index.ts
@@ -46,8 +46,7 @@ import {
     getSessionChatHistory,
     getAllConnectedNodes,
     clearSessionMemory,
-    findMemoryNode,
-    convertedSpeechToText
+    findMemoryNode
 } from './utils'
 import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
 import { getDataSource } from './DataSource'
@@ -59,7 +58,15 @@ import { Tool } from './database/entities/Tool'
 import { Assistant } from './database/entities/Assistant'
 import { ChatflowPool } from './ChatflowPool'
 import { CachePool } from './CachePool'
-import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
+import {
+    ICommonObject,
+    IMessage,
+    INodeOptionsValue,
+    INodeParams,
+    handleEscapeCharacters,
+    convertSpeechToText,
+    IFileUpload
+} from 'flowise-components'
 import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
 import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
 import { sanitizeMiddleware } from './utils/XSS'
@@ -1644,7 +1651,11 @@ export class App {
                             }
                         }
                         if (speechToTextConfig) {
-                            const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
+                            const options: ICommonObject = {
+                                appDataSource: this.AppDataSource,
+                                databaseEntities: databaseEntities
+                            }
+                            const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
                             if (speechToTextResult) {
                                 incomingInput.question = speechToTextResult
                             }
diff --git a/packages/server/src/utils/index.ts b/packages/server/src/utils/index.ts
index 92f4d450..3ed00785 100644
--- a/packages/server/src/utils/index.ts
+++ b/packages/server/src/utils/index.ts
@@ -1078,36 +1078,3 @@ export const getAllValuesFromJson = (obj: any): any[] => {
     extractValues(obj)
     return values
 }
-
-export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => {
-    // const MODEL_NAME = 'whisper-1'
-    if (speechToTextConfig) {
-        //special case, text input is empty, but we have an upload (recorded audio)
-        // const openAIClientOptions: ClientOptions = {
-        //     apiKey: model.openAIApiKey,
-        //     organization: model.organization
-        // }
-        // const openAIClient = new OpenAIClient(openAIClientOptions)
-        // const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
-        //
-        // // as the image is stored in the server, read the file and convert it to base64
-        // const audio_file = fs.createReadStream(filePath)
-        //
-        // if (multiModalConfig.speechToTextMode === 'transcriptions') {
-        //     const transcription = await openAIClient.audio.transcriptions.create({
-        //         file: audio_file,
-        //         model: MODEL_NAME
-        //     })
-        //     return transcription.text
-        // } else if (multiModalConfig.speechToTextMode === 'translations') {
-        //     const translation = await openAIClient.audio.translations.create({
-        //         file: audio_file,
-        //         model: MODEL_NAME
-        //     })
-        //     return translation.text
-        // }
-    } else {
-        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
-    }
-    return undefined
-}