SpeechToText: Adding SpeechToText at the Chatflow level.

This commit is contained in:
vinodkiran
2024-01-30 21:48:08 -05:00
parent 1d122084b9
commit 4604594c55
10 changed files with 136 additions and 112 deletions
@@ -162,36 +162,6 @@ class ChatOpenAI_ChatModels implements INode {
default: false,
optional: true
},
{
label: 'Allow Speech to Text',
name: 'allowSpeechToText',
type: 'boolean',
default: false,
optional: true
},
// TODO: only show when speechToText is true
{
label: 'Speech to Text Method',
description: 'How to turn audio into text',
name: 'speechToTextMode',
type: 'options',
options: [
{
label: 'Transcriptions',
name: 'transcriptions',
description:
'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
},
{
label: 'Translations',
name: 'translations',
description: 'Translate and transcribe the audio into english.'
}
],
optional: false,
default: 'transcriptions',
additionalParams: true
},
{
label: 'Image Resolution',
description: 'This parameter controls the resolution in which the model views the image.',
@@ -231,8 +201,6 @@ class ChatOpenAI_ChatModels implements INode {
const baseOptions = nodeData.inputs?.baseOptions
const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
const speechToTextMode = nodeData.inputs?.speechToTextMode as string
const imageResolution = nodeData.inputs?.imageResolution as string
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
@@ -270,9 +238,7 @@ class ChatOpenAI_ChatModels implements INode {
const multiModal = {
allowImageUploads: allowImageUploads ?? false,
allowSpeechToText: allowSpeechToText ?? false,
imageResolution,
speechToTextMode
imageResolution
}
model.multiModal = multiModal
return model
@@ -7,8 +7,7 @@ import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models'
import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema'
import { Callbacks } from '@langchain/core/callbacks/manager'
import { ICommonObject, INodeData } from '../../../src'
import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils'
import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts'
import { addImagesToMessages } from '../../../src/MultiModalUtils'
export class FlowiseChatOpenAI extends ChatOpenAI {
multiModal: {}
@@ -38,24 +37,6 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
private async injectMultiModalMessages(messages: BaseMessageLike[][]) {
const nodeData = FlowiseChatOpenAI.chainNodeData
const optionsData = FlowiseChatOpenAI.chainNodeOptions
let audioTrans = await checkSpeechToText(nodeData, optionsData)
if (audioTrans) {
if (messages.length > 0) {
const lastMessage = messages[0].pop() as HumanMessage
if (!nodeData.inputs?.prompt) {
lastMessage.content = audioTrans
} else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) {
lastMessage.content = audioTrans
} else if (nodeData.inputs?.prompt instanceof PromptTemplate) {
let prompt = nodeData.inputs?.prompt as PromptTemplate
let inputVar = prompt.inputVariables[0]
let formattedValues: any = {}
formattedValues[inputVar] = audioTrans
lastMessage.content = await prompt.format(formattedValues)
}
messages[0].push(lastMessage)
}
}
const messageContent = addImagesToMessages(nodeData, optionsData)
if (messageContent) {
if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {