diff --git a/packages/components/credentials/AssemblyAI.credential.ts b/packages/components/credentials/AssemblyAI.credential.ts
new file mode 100644
index 00000000..019cd7aa
--- /dev/null
+++ b/packages/components/credentials/AssemblyAI.credential.ts
@@ -0,0 +1,26 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+
+/**
+ * Credential definition exposing a single password input for the AssemblyAI API key.
+ */
+class AssemblyAIApi implements INodeCredential {
+    label: string
+    name: string
+    version: number
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'AssemblyAI API'
+        this.name = 'assemblyAIApi'
+        this.version = 1.0
+        this.inputs = [
+            {
+                label: 'AssemblyAI Api Key',
+                name: 'assemblyAIApiKey',
+                type: 'password'
+            }
+        ]
+    }
+}
+
+module.exports = { credClass: AssemblyAIApi }
diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts
index 9543f1ee..1cb09f3f 100644
--- a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts
+++ b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts
@@ -162,36 +162,6 @@ class ChatOpenAI_ChatModels implements INode {
                 default: false,
                 optional: true
             },
-            {
-                label: 'Allow Speech to Text',
-                name: 'allowSpeechToText',
-                type: 'boolean',
-                default: false,
-                optional: true
-            },
-            // TODO: only show when speechToText is true
-            {
-                label: 'Speech to Text Method',
-                description: 'How to turn audio into text',
-                name: 'speechToTextMode',
-                type: 'options',
-                options: [
-                    {
-                        label: 'Transcriptions',
-                        name: 'transcriptions',
-                        description:
-                            'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
-                    },
-                    {
-                        label: 'Translations',
-                        name: 'translations',
-                        description: 'Translate and transcribe the audio into english.'
-                    }
-                ],
-                optional: false,
-                default: 'transcriptions',
-                additionalParams: true
-            },
             {
                 label: 'Image Resolution',
                 description: 'This parameter controls the resolution in which the model views the image.',
@@ -231,8 +201,6 @@ class ChatOpenAI_ChatModels implements INode {
         const baseOptions = nodeData.inputs?.baseOptions
 
         const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
-        const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
-        const speechToTextMode = nodeData.inputs?.speechToTextMode as string
         const imageResolution = nodeData.inputs?.imageResolution as string
 
         const credentialData = await getCredentialData(nodeData.credential ?? '', options)
@@ -270,9 +238,7 @@ class ChatOpenAI_ChatModels implements INode {
 
         const multiModal = {
             allowImageUploads: allowImageUploads ?? false,
-            allowSpeechToText: allowSpeechToText ?? false,
-            imageResolution,
-            speechToTextMode
+            imageResolution
         }
         model.multiModal = multiModal
         return model
diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts
index 8af9c4df..1bf4a286 100644
--- a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts
+++ b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts
@@ -7,8 +7,7 @@ import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models'
 import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema'
 import { Callbacks } from '@langchain/core/callbacks/manager'
 import { ICommonObject, INodeData } from '../../../src'
-import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils'
-import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts'
+import { addImagesToMessages } from '../../../src/MultiModalUtils'
 
 export class FlowiseChatOpenAI extends ChatOpenAI {
     multiModal: {}
@@ -38,24 +37,6 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
     private async injectMultiModalMessages(messages: BaseMessageLike[][]) {
         const nodeData = FlowiseChatOpenAI.chainNodeData
         const optionsData = FlowiseChatOpenAI.chainNodeOptions
-        let audioTrans = await checkSpeechToText(nodeData, optionsData)
-        if (audioTrans) {
-            if (messages.length > 0) {
-                const lastMessage = messages[0].pop() as HumanMessage
-                if (!nodeData.inputs?.prompt) {
-                    lastMessage.content = audioTrans
-                } else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) {
-                    lastMessage.content = audioTrans
-                } else if (nodeData.inputs?.prompt instanceof PromptTemplate) {
-                    let prompt = nodeData.inputs?.prompt as PromptTemplate
-                    let inputVar = prompt.inputVariables[0]
-                    let formattedValues: any = {}
-                    formattedValues[inputVar] = audioTrans
-                    lastMessage.content = await prompt.format(formattedValues)
-                }
-                messages[0].push(lastMessage)
-            }
-        }
         const messageContent = addImagesToMessages(nodeData, optionsData)
         if (messageContent) {
             if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
diff --git a/packages/components/nodes/speechtotext/assemblyai/AssemblyAI.ts b/packages/components/nodes/speechtotext/assemblyai/AssemblyAI.ts
new file mode 100644
index 00000000..c5db6619
--- /dev/null
+++ b/packages/components/nodes/speechtotext/assemblyai/AssemblyAI.ts
@@ -0,0 +1,36 @@
+import { INode, INodeParams } from '../../../src/Interface'
+
+/**
+ * Node in the 'SpeechToText' category for AssemblyAI; it declares no inputs and accepts only the `assemblyAIApi` credential.
+ */
+class AssemblyAI_SpeechToText implements INode {
+    label: string
+    name: string
+    version: number
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs?: INodeParams[]
+    credential: INodeParams
+
+    constructor() {
+        this.label = 'AssemblyAI'
+        this.name = 'assemblyAI'
+        this.version = 1.0
+        this.type = 'AssemblyAI'
+        this.icon = 'assemblyai.png'
+        this.category = 'SpeechToText'
+        this.baseClasses = [this.type]
+        this.inputs = []
+        this.credential = {
+            label: 'Connect Credential',
+            name: 'credential',
+            type: 'credential',
+            credentialNames: ['assemblyAIApi']
+        }
+    }
+}
+
+module.exports = { nodeClass: AssemblyAI_SpeechToText }
diff --git a/packages/components/nodes/speechtotext/assemblyai/assemblyai.png b/packages/components/nodes/speechtotext/assemblyai/assemblyai.png
new file mode 100644
index 00000000..8919cb18
Binary files /dev/null and b/packages/components/nodes/speechtotext/assemblyai/assemblyai.png differ
diff --git a/packages/components/src/MultiModalUtils.ts b/packages/components/src/MultiModalUtils.ts
index 58865a8a..62e3513c 100644
--- a/packages/components/src/MultiModalUtils.ts
+++ b/packages/components/src/MultiModalUtils.ts
@@ -18,49 +18,6 @@ export const injectChainNodeData = (nodeData: INodeData, options: ICommonObject)
     }
 }
 
-export const checkSpeechToText = async (nodeData: INodeData, options: ICommonObject) => {
-    const MODEL_NAME = 'whisper-1'
-    let input = undefined
-    let model = nodeData.inputs?.model as BaseChatModel
-    if (model instanceof ChatOpenAI && (model as any).multiModal) {
-        const multiModalConfig = (model as any).multiModal
-        if (options?.uploads) {
-            if (options.uploads.length === 1 && options.uploads[0].mime === 'audio/webm') {
-                const upload = options.uploads[0]
-                //special case, text input is empty, but we have an upload (recorded audio)
-                if (multiModalConfig.allowSpeechToText) {
-                    const openAIClientOptions: ClientOptions = {
-                        apiKey: model.openAIApiKey,
-                        organization: model.organization
-                    }
-                    const openAIClient = new OpenAIClient(openAIClientOptions)
-                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
-
-                    // as the image is stored in the server, read the file and convert it to base64
-                    const audio_file = fs.createReadStream(filePath)
-
-                    if (multiModalConfig.speechToTextMode === 'transcriptions') {
-                        const transcription = await openAIClient.audio.transcriptions.create({
-                            file: audio_file,
-                            model: MODEL_NAME
-                        })
-                        return transcription.text
-                    } else if (multiModalConfig.speechToTextMode === 'translations') {
-                        const translation = await openAIClient.audio.translations.create({
-                            file: audio_file,
-                            model: MODEL_NAME
-                        })
-                        return translation.text
-                    }
-                } else {
-                    throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
-                }
-            }
-        }
-    }
-    return input
-}
-
 export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
     const imageContent: MessageContent = []
     let model = nodeData.inputs?.model as BaseChatModel
diff --git a/packages/server/src/NodesPool.ts b/packages/server/src/NodesPool.ts
index f4681d4a..8b01e63a 100644
--- a/packages/server/src/NodesPool.ts
+++ b/packages/server/src/NodesPool.ts
@@ -54,7 +54,7 @@ export class NodesPool {
                     }
                 }
 
-                const skipCategories = ['Analytic']
+                const skipCategories = ['Analytic', 'SpeechToText']
                 if (!skipCategories.includes(newNodeInstance.category)) {
                     this.componentNodes[newNodeInstance.name] = newNodeInstance
                 }
diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts
index e7816311..7558c689 100644
--- a/packages/server/src/index.ts
+++ b/packages/server/src/index.ts
@@ -46,7 +46,8 @@ import {
     getSessionChatHistory,
     getAllConnectedNodes,
     clearSessionMemory,
-    findMemoryNode
+    findMemoryNode,
+    convertedSpeechToText
 } from './utils'
 import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
 import { getDataSource } from './DataSource'
@@ -58,7 +59,7 @@ import { Tool } from './database/entities/Tool'
 import { Assistant } from './database/entities/Assistant'
 import { ChatflowPool } from './ChatflowPool'
 import { CachePool } from './CachePool'
-import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters } from 'flowise-components'
+import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
 import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
 import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
 import { sanitizeMiddleware } from './utils/XSS'
@@ -473,6 +474,17 @@ export class App {
                 const flowObj = JSON.parse(chatflow.flowData)
                 const allowances: IUploadFileSizeAndTypes[] = []
                 let allowSpeechToText = false
+                if (chatflow.speechToText) {
+                    const speechToTextProviders = JSON.parse(chatflow.speechToText)
+                    for (const provider in speechToTextProviders) {
+                        const providerObj = speechToTextProviders[provider]
+                        if (providerObj.status) {
+                            allowSpeechToText = true
+                            break
+                        }
+                    }
+                }
+
                 let allowImageUploads = false
                 flowObj.nodes.forEach((node: IReactFlowNode) => {
                     if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
@@ -488,9 +500,6 @@ export class App {
                                 })
                                 allowImageUploads = true
                             }
-                            if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) {
-                                allowSpeechToText = true
-                            }
                         })
                     }
                 })
@@ -1602,7 +1611,8 @@ export class App {
 
             if (incomingInput.uploads) {
                 // @ts-ignore
-                ;(incomingInput.uploads as any[]).forEach((upload: any) => {
+                const uploads = incomingInput.uploads as IFileUpload[]
+                for (const upload of uploads) {
                     if (upload.type === 'file' || upload.type === 'audio') {
                         const filename = upload.name
                         const dir = path.join(getUserHome(), '.flowise', 'gptvision', chatId)
@@ -1618,7 +1628,30 @@ export class App {
                         upload.data = chatId
                         upload.type = 'stored-file'
                     }
-                })
+
+                    // A lone 'audio/webm' upload replaces the question with its transcription when a provider is enabled.
+                    if (upload.mime === 'audio/webm' && uploads.length === 1) {
+                        // Stays undefined unless a provider has `status` set, so the guard below is meaningful.
+                        let speechToTextConfig: ICommonObject | undefined
+                        if (chatflow.speechToText) {
+                            const speechToTextProviders = JSON.parse(chatflow.speechToText)
+                            for (const provider in speechToTextProviders) {
+                                const providerObj = speechToTextProviders[provider]
+                                if (providerObj.status) {
+                                    speechToTextConfig = { ...providerObj, name: provider }
+                                    break
+                                }
+                            }
+                        }
+                        if (speechToTextConfig) {
+                            // Pass the upload itself: the converter's first parameter is the upload, not its `data` field.
+                            const speechToTextResult = await convertedSpeechToText(upload, speechToTextConfig)
+                            if (speechToTextResult) {
+                                incomingInput.question = speechToTextResult
+                            }
+                        }
+                    }
+                }
             }
 
             let isStreamValid = false
diff --git a/packages/server/src/utils/index.ts b/packages/server/src/utils/index.ts
index dafe612c..92f4d450 100644
--- a/packages/server/src/utils/index.ts
+++ b/packages/server/src/utils/index.ts
@@ -593,7 +593,6 @@ export const resolveVariables = (
     }
 
     const paramsObj = flowNodeData[types] ?? {}
-
     getParamValues(paramsObj)
 
     return flowNodeData
@@ -1079,3 +1078,21 @@ export const getAllValuesFromJson = (obj: any): any[] => {
     extractValues(obj)
     return values
 }
+
+/**
+ * Convert a recorded audio upload to text using the enabled speech-to-text provider.
+ *
+ * Not implemented yet: no provider is called and the result is always `undefined`.
+ *
+ * @param upload - the recorded audio upload (currently unused)
+ * @param speechToTextConfig - configuration of the enabled provider
+ * @returns the transcribed text; currently always `undefined`
+ * @throws Error when `speechToTextConfig` is falsy
+ */
+export const convertedSpeechToText = async (upload: any, speechToTextConfig: any): Promise<string | undefined> => {
+    if (!speechToTextConfig) {
+        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
+    }
+    // TODO: transcribe the stored audio file with the provider described by `speechToTextConfig`.
+    return undefined
+}
diff --git a/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js b/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js
index fa2b7a78..10b6f076 100644
--- a/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js
+++ b/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js
@@ -41,8 +41,8 @@ import chatflowsApi from 'api/chatflows'
 
 const speechToTextProviders = [
     {
-        label: 'OpenAI Wisper',
-        name: 'openAIWisper',
+        label: 'OpenAI Whisper',
+        name: 'openAIWhisper',
         icon: openAISVG,
         url: 'https://platform.openai.com/docs/guides/speech-to-text',
         inputs: [
@@ -70,7 +70,7 @@ const speechToTextProviders = [
                 label: 'Connect Credential',
                 name: 'credential',
                 type: 'credential',
-                credentialNames: ['assemblyAiApi']
+                credentialNames: ['assemblyAIApi']
             },
             {
                 label: 'On/Off',
@@ -101,7 +101,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => {
             })
             if (saveResp.data) {
                 enqueueSnackbar({
-                    message: 'Analytic Configuration Saved',
+                    message: 'Speech To Text Configuration Saved',
                     options: {
                         key: new Date().getTime() + Math.random(),
                         variant: 'success',
@@ -118,7 +118,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => {
         } catch (error) {
             const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}`
             enqueueSnackbar({
-                message: `Failed to save Analytic Configuration: ${errorData}`,
+                message: `Failed to save Speech To Text Configuration: ${errorData}`,
                 options: {
                     key: new Date().getTime() + Math.random(),
                     variant: 'error',