mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 19:00:59 +03:00
SpeechToText: Adding SpeechToText at the Chatflow level.
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
import { INodeParams, INodeCredential } from '../src/Interface'
|
||||
|
||||
/**
 * Credential definition labelled "AssemblyAI API".
 *
 * Declares a single password-type input, `assemblyAIApiKey`. All values are
 * fixed at construction time; the class takes no constructor arguments.
 */
class AssemblyAIApi implements INodeCredential {
    label = 'AssemblyAI API'
    name = 'assemblyAIApi'
    version = 1.0
    // Explicit annotation keeps the literal checked against INodeParams.
    inputs: INodeParams[] = [
        {
            label: 'AssemblyAI Api Key',
            name: 'assemblyAIApiKey',
            type: 'password'
        }
    ]
}
|
||||
|
||||
// CommonJS export: exposes the credential class under the `credClass` key.
module.exports = { credClass: AssemblyAIApi }
|
||||
@@ -162,36 +162,6 @@ class ChatOpenAI_ChatModels implements INode {
|
||||
default: false,
|
||||
optional: true
|
||||
},
|
||||
{
|
||||
label: 'Allow Speech to Text',
|
||||
name: 'allowSpeechToText',
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
optional: true
|
||||
},
|
||||
// TODO: only show this option when allowSpeechToText is true
|
||||
{
|
||||
label: 'Speech to Text Method',
|
||||
description: 'How to turn audio into text',
|
||||
name: 'speechToTextMode',
|
||||
type: 'options',
|
||||
options: [
|
||||
{
|
||||
label: 'Transcriptions',
|
||||
name: 'transcriptions',
|
||||
description:
|
||||
'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
|
||||
},
|
||||
{
|
||||
label: 'Translations',
|
||||
name: 'translations',
|
||||
description: 'Translate and transcribe the audio into english.'
|
||||
}
|
||||
],
|
||||
optional: false,
|
||||
default: 'transcriptions',
|
||||
additionalParams: true
|
||||
},
|
||||
{
|
||||
label: 'Image Resolution',
|
||||
description: 'This parameter controls the resolution in which the model views the image.',
|
||||
@@ -231,8 +201,6 @@ class ChatOpenAI_ChatModels implements INode {
|
||||
const baseOptions = nodeData.inputs?.baseOptions
|
||||
|
||||
const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
|
||||
const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
|
||||
const speechToTextMode = nodeData.inputs?.speechToTextMode as string
|
||||
const imageResolution = nodeData.inputs?.imageResolution as string
|
||||
|
||||
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
|
||||
@@ -270,9 +238,7 @@ class ChatOpenAI_ChatModels implements INode {
|
||||
|
||||
const multiModal = {
|
||||
allowImageUploads: allowImageUploads ?? false,
|
||||
allowSpeechToText: allowSpeechToText ?? false,
|
||||
imageResolution,
|
||||
speechToTextMode
|
||||
imageResolution
|
||||
}
|
||||
model.multiModal = multiModal
|
||||
return model
|
||||
|
||||
@@ -7,8 +7,7 @@ import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models'
|
||||
import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema'
|
||||
import { Callbacks } from '@langchain/core/callbacks/manager'
|
||||
import { ICommonObject, INodeData } from '../../../src'
|
||||
import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils'
|
||||
import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts'
|
||||
import { addImagesToMessages } from '../../../src/MultiModalUtils'
|
||||
|
||||
export class FlowiseChatOpenAI extends ChatOpenAI {
|
||||
multiModal: {}
|
||||
@@ -38,24 +37,6 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
|
||||
private async injectMultiModalMessages(messages: BaseMessageLike[][]) {
|
||||
const nodeData = FlowiseChatOpenAI.chainNodeData
|
||||
const optionsData = FlowiseChatOpenAI.chainNodeOptions
|
||||
let audioTrans = await checkSpeechToText(nodeData, optionsData)
|
||||
if (audioTrans) {
|
||||
if (messages.length > 0) {
|
||||
const lastMessage = messages[0].pop() as HumanMessage
|
||||
if (!nodeData.inputs?.prompt) {
|
||||
lastMessage.content = audioTrans
|
||||
} else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) {
|
||||
lastMessage.content = audioTrans
|
||||
} else if (nodeData.inputs?.prompt instanceof PromptTemplate) {
|
||||
let prompt = nodeData.inputs?.prompt as PromptTemplate
|
||||
let inputVar = prompt.inputVariables[0]
|
||||
let formattedValues: any = {}
|
||||
formattedValues[inputVar] = audioTrans
|
||||
lastMessage.content = await prompt.format(formattedValues)
|
||||
}
|
||||
messages[0].push(lastMessage)
|
||||
}
|
||||
}
|
||||
const messageContent = addImagesToMessages(nodeData, optionsData)
|
||||
if (messageContent) {
|
||||
if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
import { INode, INodeParams } from '../../../src/Interface'
|
||||
|
||||
/**
 * Node definition for AssemblyAI in the `SpeechToText` category.
 *
 * The node exposes no inputs of its own; it only declares a credential slot
 * that accepts the `assemblyAIApi` credential.
 */
class AssemblyAI_SpeechToText implements INode {
    label = 'AssemblyAI'
    name = 'assemblyAI'
    version = 1.0
    // Declared for the INode contract but never given a value by this class.
    description: string
    type = 'AssemblyAI'
    icon = 'assemblyai.png'
    category = 'SpeechToText'
    // The only base class is the node's own type.
    baseClasses: string[] = [this.type]
    inputs?: INodeParams[] = []
    credential: INodeParams = {
        label: 'Connect Credential',
        name: 'credential',
        type: 'credential',
        credentialNames: ['assemblyAIApi']
    }
}
|
||||
|
||||
// CommonJS export: exposes the node class under the `nodeClass` key.
module.exports = { nodeClass: AssemblyAI_SpeechToText }
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 8.5 KiB |
@@ -18,49 +18,6 @@ export const injectChainNodeData = (nodeData: INodeData, options: ICommonObject)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Turns a single recorded-audio upload into text through OpenAI's audio endpoints.
 *
 * `undefined` is returned (and nothing is sent to OpenAI) unless all of these hold:
 *  - `nodeData.inputs.model` is a `ChatOpenAI` instance carrying a truthy `multiModal` config,
 *  - `options.uploads` holds exactly one entry whose mime type is `audio/webm`,
 *  - the configured `speechToTextMode` is `transcriptions` or `translations`
 *    (a missing mode falls back to `transcriptions`, the option's declared default).
 *
 * @param nodeData - node whose `inputs.model` supplies the OpenAI key, organization and multiModal config
 * @param options - runtime options; only `options.uploads` is read
 * @returns the text returned by OpenAI, or `undefined` when there is nothing to convert
 * @throws Error when a recorded audio upload is present but `allowSpeechToText` is off, or when
 *         the upload resolves to a location outside the upload storage directory
 */
export const checkSpeechToText = async (nodeData: INodeData, options: ICommonObject): Promise<string | undefined> => {
    const MODEL_NAME = 'whisper-1'

    const model = nodeData.inputs?.model as BaseChatModel
    if (!(model instanceof ChatOpenAI)) return undefined

    // `multiModal` is attached to the model at runtime and is not part of ChatOpenAI's declared type.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const multiModalConfig = (model as any).multiModal
    if (!multiModalConfig) return undefined

    const uploads = options?.uploads
    if (!uploads || uploads.length !== 1 || uploads[0].mime !== 'audio/webm') return undefined
    const upload = uploads[0]

    // Special case: the text input is empty, but we have an upload (recorded audio).
    if (!multiModalConfig.allowSpeechToText) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    // Decide the mode before touching the file system so an unsupported mode never opens a stream.
    const mode: string = multiModalConfig.speechToTextMode ?? 'transcriptions'
    if (mode !== 'transcriptions' && mode !== 'translations') return undefined

    // The upload fields come from the request; refuse anything that escapes the storage directory.
    // NOTE(review): assumes `upload.data` is a sub-directory name and `upload.name` a file name - confirm against the uploader.
    const storageDir = path.join(getUserHome(), '.flowise', 'gptvision')
    const filePath = path.join(storageDir, upload.data, upload.name)
    const relativePath = path.relative(storageDir, filePath)
    if (relativePath === '..' || relativePath.startsWith('..' + path.sep) || path.isAbsolute(relativePath)) {
        throw new Error('Invalid upload path for recorded audio file.')
    }

    const openAIClientOptions: ClientOptions = {
        apiKey: model.openAIApiKey,
        organization: model.organization
    }
    const openAIClient = new OpenAIClient(openAIClientOptions)

    // The recording is stored on the server; stream it to OpenAI instead of loading it into memory.
    const audioFile = fs.createReadStream(filePath)
    try {
        if (mode === 'translations') {
            const translation = await openAIClient.audio.translations.create({
                file: audioFile,
                model: MODEL_NAME
            })
            return translation.text
        }
        const transcription = await openAIClient.audio.transcriptions.create({
            file: audioFile,
            model: MODEL_NAME
        })
        return transcription.text
    } finally {
        // Release the file descriptor even when the request fails before the stream is fully read.
        audioFile.destroy()
    }
}
|
||||
|
||||
export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
|
||||
const imageContent: MessageContent = []
|
||||
let model = nodeData.inputs?.model as BaseChatModel
|
||||
|
||||
Reference in New Issue
Block a user