From e81927ee132985a51aafb96dd67f174b371a7b08 Mon Sep 17 00:00:00 2001 From: vinodkiran Date: Wed, 31 Jan 2024 07:48:38 -0500 Subject: [PATCH] SpeechToText: Adding SpeechToText at the Chatflow level. --- .../ChatOpenAI/FlowiseChatOpenAI.ts | 2 +- packages/components/package.json | 1 + packages/components/src/MultiModalUtils.ts | 1 - packages/components/src/index.ts | 1 + packages/components/src/speechToText.ts | 49 +++++++++++++++++++ packages/server/src/index.ts | 19 +++++-- packages/server/src/utils/index.ts | 33 ------------- 7 files changed, 67 insertions(+), 39 deletions(-) create mode 100644 packages/components/src/speechToText.ts diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts index 1bf4a286..b25ec0c3 100644 --- a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts +++ b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts @@ -38,7 +38,7 @@ export class FlowiseChatOpenAI extends ChatOpenAI { const nodeData = FlowiseChatOpenAI.chainNodeData const optionsData = FlowiseChatOpenAI.chainNodeOptions const messageContent = addImagesToMessages(nodeData, optionsData) - if (messageContent) { + if (messageContent?.length) { if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) { const lastMessage = messages[0].pop() if (lastMessage instanceof HumanMessage) { diff --git a/packages/components/package.json b/packages/components/package.json index c90ea5cc..953a6c4c 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -40,6 +40,7 @@ "@upstash/redis": "^1.22.1", "@zilliz/milvus2-sdk-node": "^2.2.24", "apify-client": "^2.7.1", + "assemblyai": "^4.2.2", "axios": "1.6.2", "cheerio": "^1.0.0-rc.12", "chromadb": "^1.5.11", diff --git a/packages/components/src/MultiModalUtils.ts b/packages/components/src/MultiModalUtils.ts index 62e3513c..337cc105 100644 --- a/packages/components/src/MultiModalUtils.ts +++ b/packages/components/src/MultiModalUtils.ts @@ -1,6 +1,5 @@ import { ICommonObject, INodeData } from './Interface' import { BaseChatModel } from 'langchain/chat_models/base' -import { type ClientOptions, OpenAIClient } from '@langchain/openai' import { ChatOpenAI } from 'langchain/chat_models/openai' import path from 'path' import { getUserHome } from './utils' diff --git a/packages/components/src/index.ts b/packages/components/src/index.ts index ae2e380e..10cd1036 100644 --- a/packages/components/src/index.ts +++ b/packages/components/src/index.ts @@ -6,3 +6,4 @@ dotenv.config({ path: envPath, override: true }) export * from './Interface' export * from './utils' +export * from './speechToText' diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts new file mode 100644 index 00000000..cc40cf21 --- /dev/null +++ b/packages/components/src/speechToText.ts @@ -0,0 +1,49 @@ +import { ICommonObject } from './Interface' +import { getCredentialData, getUserHome } from './utils' +import { type ClientOptions, OpenAIClient } from '@langchain/openai' +import fs from 'fs' +import path from 'path' +import { AssemblyAI } from 'assemblyai' + +export const convertSpeechToText = async (upload: any, speechToTextConfig: any, options: ICommonObject) => { + if (speechToTextConfig) { + const credentialId = speechToTextConfig.credentialId as string + const credentialData = await getCredentialData(credentialId ?? '', options) + const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name) + + // as the image is stored in the server, read the file and convert it to base64 + const audio_file = fs.createReadStream(filePath) + + if (speechToTextConfig.name === 'openAIWhisper') { + const openAIClientOptions: ClientOptions = { + apiKey: credentialData.openAIApiKey + } + const openAIClient = new OpenAIClient(openAIClientOptions) + + const transcription = await openAIClient.audio.transcriptions.create({ + file: audio_file, + model: 'whisper-1' + }) + if (transcription?.text) { + return transcription.text + } + } else if (speechToTextConfig.name === 'assemblyAiTranscribe') { + const client = new AssemblyAI({ + apiKey: credentialData.assemblyAIApiKey + }) + + const params = { + audio: audio_file, + speaker_labels: false + } + + const transcription = await client.transcripts.transcribe(params) + if (transcription?.text) { + return transcription.text + } + } + } else { + throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.') + } + return undefined +} diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 7558c689..17689bcb 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -46,8 +46,7 @@ import { getSessionChatHistory, getAllConnectedNodes, clearSessionMemory, - findMemoryNode, - convertedSpeechToText + findMemoryNode } from './utils' import { cloneDeep, omit, uniqWith, isEqual } from 'lodash' import { getDataSource } from './DataSource' @@ -59,7 +58,15 @@ import { Tool } from './database/entities/Tool' import { Assistant } from './database/entities/Assistant' import { ChatflowPool } from './ChatflowPool' import { CachePool } from './CachePool' -import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components' +import { + ICommonObject, + IMessage, + INodeOptionsValue, + INodeParams, + handleEscapeCharacters, + convertSpeechToText, + IFileUpload +} from 'flowise-components' import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit' import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey' import { sanitizeMiddleware } from './utils/XSS' @@ -1644,7 +1651,11 @@ export class App { } } if (speechToTextConfig) { - const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig) + const options: ICommonObject = { + appDataSource: this.AppDataSource, + databaseEntities: databaseEntities + } + const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options) if (speechToTextResult) { incomingInput.question = speechToTextResult } diff --git a/packages/server/src/utils/index.ts b/packages/server/src/utils/index.ts index 92f4d450..3ed00785 100644 --- a/packages/server/src/utils/index.ts +++ b/packages/server/src/utils/index.ts @@ -1078,36 +1078,3 @@ export const getAllValuesFromJson = (obj: any): any[] => { extractValues(obj) return values } - -export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => { - // const MODEL_NAME = 'whisper-1' - if (speechToTextConfig) { - //special case, text input is empty, but we have an upload (recorded audio) - // const openAIClientOptions: ClientOptions = { - // apiKey: model.openAIApiKey, - // organization: model.organization - // } - // const openAIClient = new OpenAIClient(openAIClientOptions) - // const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name) - // - // // as the image is stored in the server, read the file and convert it to base64 - // const audio_file = fs.createReadStream(filePath) - // - // if (multiModalConfig.speechToTextMode === 'transcriptions') { - // const transcription = await openAIClient.audio.transcriptions.create({ - // file: audio_file, - // model: MODEL_NAME - // }) - // return transcription.text - // } else if (multiModalConfig.speechToTextMode === 'translations') { - // const translation = await openAIClient.audio.translations.create({ - // file: audio_file, - // model: MODEL_NAME - // }) - // return translation.text - // } - } else { - throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.') - } - return undefined -}