mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 23:01:09 +03:00
SpeechToText: Adding SpeechToText at the Chatflow level.
This commit is contained in:
@@ -0,0 +1,23 @@
|
|||||||
|
import { INodeParams, INodeCredential } from '../src/Interface'
|
||||||
|
|
||||||
|
/**
 * Credential definition for the AssemblyAI API.
 *
 * Declares one input, `assemblyAIApiKey`, whose parameter type is `'password'`.
 */
class AssemblyAIApi implements INodeCredential {
    label: string
    name: string
    version: number
    inputs: INodeParams[]

    constructor() {
        // The single secret collected by this credential.
        const apiKeyInput: INodeParams = {
            label: 'AssemblyAI Api Key',
            name: 'assemblyAIApiKey',
            type: 'password'
        }

        // Assignment order is kept so the enumerable key order of the instance is unchanged.
        this.label = 'AssemblyAI API'
        this.name = 'assemblyAIApi'
        this.version = 1.0
        this.inputs = [apiKeyInput]
    }
}
|
||||||
|
|
||||||
|
// Expose the credential class under the `credClass` key.
module.exports = { credClass: AssemblyAIApi }
|
||||||
@@ -162,36 +162,6 @@ class ChatOpenAI_ChatModels implements INode {
|
|||||||
default: false,
|
default: false,
|
||||||
optional: true
|
optional: true
|
||||||
},
|
},
|
||||||
{
|
|
||||||
label: 'Allow Speech to Text',
|
|
||||||
name: 'allowSpeechToText',
|
|
||||||
type: 'boolean',
|
|
||||||
default: false,
|
|
||||||
optional: true
|
|
||||||
},
|
|
||||||
// TODO: only show when speechToText is true
|
|
||||||
{
|
|
||||||
label: 'Speech to Text Method',
|
|
||||||
description: 'How to turn audio into text',
|
|
||||||
name: 'speechToTextMode',
|
|
||||||
type: 'options',
|
|
||||||
options: [
|
|
||||||
{
|
|
||||||
label: 'Transcriptions',
|
|
||||||
name: 'transcriptions',
|
|
||||||
description:
|
|
||||||
'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
|
|
||||||
},
|
|
||||||
{
|
|
||||||
label: 'Translations',
|
|
||||||
name: 'translations',
|
|
||||||
description: 'Translate and transcribe the audio into english.'
|
|
||||||
}
|
|
||||||
],
|
|
||||||
optional: false,
|
|
||||||
default: 'transcriptions',
|
|
||||||
additionalParams: true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
label: 'Image Resolution',
|
label: 'Image Resolution',
|
||||||
description: 'This parameter controls the resolution in which the model views the image.',
|
description: 'This parameter controls the resolution in which the model views the image.',
|
||||||
@@ -231,8 +201,6 @@ class ChatOpenAI_ChatModels implements INode {
|
|||||||
const baseOptions = nodeData.inputs?.baseOptions
|
const baseOptions = nodeData.inputs?.baseOptions
|
||||||
|
|
||||||
const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
|
const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
|
||||||
const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
|
|
||||||
const speechToTextMode = nodeData.inputs?.speechToTextMode as string
|
|
||||||
const imageResolution = nodeData.inputs?.imageResolution as string
|
const imageResolution = nodeData.inputs?.imageResolution as string
|
||||||
|
|
||||||
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
|
const credentialData = await getCredentialData(nodeData.credential ?? '', options)
|
||||||
@@ -270,9 +238,7 @@ class ChatOpenAI_ChatModels implements INode {
|
|||||||
|
|
||||||
const multiModal = {
|
const multiModal = {
|
||||||
allowImageUploads: allowImageUploads ?? false,
|
allowImageUploads: allowImageUploads ?? false,
|
||||||
allowSpeechToText: allowSpeechToText ?? false,
|
imageResolution
|
||||||
imageResolution,
|
|
||||||
speechToTextMode
|
|
||||||
}
|
}
|
||||||
model.multiModal = multiModal
|
model.multiModal = multiModal
|
||||||
return model
|
return model
|
||||||
|
|||||||
@@ -7,8 +7,7 @@ import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models'
|
|||||||
import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema'
|
import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema'
|
||||||
import { Callbacks } from '@langchain/core/callbacks/manager'
|
import { Callbacks } from '@langchain/core/callbacks/manager'
|
||||||
import { ICommonObject, INodeData } from '../../../src'
|
import { ICommonObject, INodeData } from '../../../src'
|
||||||
import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils'
|
import { addImagesToMessages } from '../../../src/MultiModalUtils'
|
||||||
import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts'
|
|
||||||
|
|
||||||
export class FlowiseChatOpenAI extends ChatOpenAI {
|
export class FlowiseChatOpenAI extends ChatOpenAI {
|
||||||
multiModal: {}
|
multiModal: {}
|
||||||
@@ -38,24 +37,6 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
|
|||||||
private async injectMultiModalMessages(messages: BaseMessageLike[][]) {
|
private async injectMultiModalMessages(messages: BaseMessageLike[][]) {
|
||||||
const nodeData = FlowiseChatOpenAI.chainNodeData
|
const nodeData = FlowiseChatOpenAI.chainNodeData
|
||||||
const optionsData = FlowiseChatOpenAI.chainNodeOptions
|
const optionsData = FlowiseChatOpenAI.chainNodeOptions
|
||||||
let audioTrans = await checkSpeechToText(nodeData, optionsData)
|
|
||||||
if (audioTrans) {
|
|
||||||
if (messages.length > 0) {
|
|
||||||
const lastMessage = messages[0].pop() as HumanMessage
|
|
||||||
if (!nodeData.inputs?.prompt) {
|
|
||||||
lastMessage.content = audioTrans
|
|
||||||
} else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) {
|
|
||||||
lastMessage.content = audioTrans
|
|
||||||
} else if (nodeData.inputs?.prompt instanceof PromptTemplate) {
|
|
||||||
let prompt = nodeData.inputs?.prompt as PromptTemplate
|
|
||||||
let inputVar = prompt.inputVariables[0]
|
|
||||||
let formattedValues: any = {}
|
|
||||||
formattedValues[inputVar] = audioTrans
|
|
||||||
lastMessage.content = await prompt.format(formattedValues)
|
|
||||||
}
|
|
||||||
messages[0].push(lastMessage)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const messageContent = addImagesToMessages(nodeData, optionsData)
|
const messageContent = addImagesToMessages(nodeData, optionsData)
|
||||||
if (messageContent) {
|
if (messageContent) {
|
||||||
if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
|
if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
|
||||||
|
|||||||
@@ -0,0 +1,33 @@
|
|||||||
|
import { INode, INodeParams } from '../../../src/Interface'
|
||||||
|
|
||||||
|
/**
 * Node definition for the AssemblyAI speech-to-text provider.
 *
 * The node has no inputs of its own; it only declares a credential slot that
 * accepts the `assemblyAIApi` credential.
 */
class AssemblyAI_SpeechToText implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs?: INodeParams[]
    credential: INodeParams

    constructor() {
        this.label = 'AssemblyAI'
        this.name = 'assemblyAI'
        this.version = 1.0
        this.type = 'AssemblyAI'
        this.icon = 'assemblyai.png'
        this.category = 'SpeechToText'
        // `description` is declared as a required string, so give it a value
        // instead of leaving it undefined at runtime.
        this.description = 'Transcribe audio to text using AssemblyAI'
        this.baseClasses = [this.type]
        this.inputs = []
        this.credential = {
            label: 'Connect Credential',
            name: 'credential',
            type: 'credential',
            // Must match the `name` of the AssemblyAI credential class.
            credentialNames: ['assemblyAIApi']
        }
    }
}
|
||||||
|
|
||||||
|
// Expose the node class under the `nodeClass` key.
module.exports = { nodeClass: AssemblyAI_SpeechToText }
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 8.5 KiB |
@@ -18,49 +18,6 @@ export const injectChainNodeData = (nodeData: INodeData, options: ICommonObject)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Turns a single recorded audio upload into text through the OpenAI audio API.
 *
 * Only acts when the chain's model is a `ChatOpenAI` instance carrying a
 * `multiModal` config and `options.uploads` holds exactly one `audio/webm`
 * upload. In every other case it resolves to `undefined`.
 *
 * @param nodeData - node data whose `inputs.model` is the chat model of the chain
 * @param options - runtime options; `options.uploads` is inspected for the recording
 * @returns the transcribed (or translated) text, or `undefined` when there is
 *          nothing to convert or the configured mode is not recognised
 * @throws Error when a recording is present but `allowSpeechToText` is off
 */
export const checkSpeechToText = async (nodeData: INodeData, options: ICommonObject) => {
    const MODEL_NAME = 'whisper-1'
    // The node parameter declares 'transcriptions' as its default method.
    const DEFAULT_MODE = 'transcriptions'

    const model = nodeData.inputs?.model as BaseChatModel
    if (!(model instanceof ChatOpenAI)) return undefined

    const multiModalConfig = (model as any).multiModal
    if (!multiModalConfig) return undefined

    // Special case: the text input is empty, but there is one upload (the recorded audio).
    const uploads = options?.uploads
    if (!uploads || uploads.length !== 1 || uploads[0].mime !== 'audio/webm') return undefined

    if (!multiModalConfig.allowSpeechToText) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    const upload = uploads[0]
    const openAIClientOptions: ClientOptions = {
        apiKey: model.openAIApiKey,
        organization: model.organization
    }
    const openAIClient = new OpenAIClient(openAIClientOptions)

    // The recording is stored on the server under <home>/.flowise/gptvision/<upload.data>/<upload.name>.
    // NOTE(review): upload.data and upload.name appear to originate from the request payload;
    // confirm they are validated before being joined into a filesystem path.
    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)

    // Fall back to the declared default when the mode was never set, so an enabled
    // speech-to-text flow does not silently drop the recording.
    const mode = multiModalConfig.speechToTextMode || DEFAULT_MODE

    // The read stream is opened only inside the branch that consumes it.
    if (mode === 'transcriptions') {
        const transcription = await openAIClient.audio.transcriptions.create({
            file: fs.createReadStream(filePath),
            model: MODEL_NAME
        })
        return transcription.text
    }

    if (mode === 'translations') {
        const translation = await openAIClient.audio.translations.create({
            file: fs.createReadStream(filePath),
            model: MODEL_NAME
        })
        return translation.text
    }

    // Unrecognised mode: nothing is converted.
    return undefined
}
|
|
||||||
|
|
||||||
export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
|
export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
|
||||||
const imageContent: MessageContent = []
|
const imageContent: MessageContent = []
|
||||||
let model = nodeData.inputs?.model as BaseChatModel
|
let model = nodeData.inputs?.model as BaseChatModel
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ export class NodesPool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const skipCategories = ['Analytic']
|
const skipCategories = ['Analytic', 'SpeechToText']
|
||||||
if (!skipCategories.includes(newNodeInstance.category)) {
|
if (!skipCategories.includes(newNodeInstance.category)) {
|
||||||
this.componentNodes[newNodeInstance.name] = newNodeInstance
|
this.componentNodes[newNodeInstance.name] = newNodeInstance
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ import {
|
|||||||
getSessionChatHistory,
|
getSessionChatHistory,
|
||||||
getAllConnectedNodes,
|
getAllConnectedNodes,
|
||||||
clearSessionMemory,
|
clearSessionMemory,
|
||||||
findMemoryNode
|
findMemoryNode,
|
||||||
|
convertedSpeechToText
|
||||||
} from './utils'
|
} from './utils'
|
||||||
import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
|
import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
|
||||||
import { getDataSource } from './DataSource'
|
import { getDataSource } from './DataSource'
|
||||||
@@ -58,7 +59,7 @@ import { Tool } from './database/entities/Tool'
|
|||||||
import { Assistant } from './database/entities/Assistant'
|
import { Assistant } from './database/entities/Assistant'
|
||||||
import { ChatflowPool } from './ChatflowPool'
|
import { ChatflowPool } from './ChatflowPool'
|
||||||
import { CachePool } from './CachePool'
|
import { CachePool } from './CachePool'
|
||||||
import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters } from 'flowise-components'
|
import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
|
||||||
import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
|
import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
|
||||||
import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
|
import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
|
||||||
import { sanitizeMiddleware } from './utils/XSS'
|
import { sanitizeMiddleware } from './utils/XSS'
|
||||||
@@ -473,6 +474,17 @@ export class App {
|
|||||||
const flowObj = JSON.parse(chatflow.flowData)
|
const flowObj = JSON.parse(chatflow.flowData)
|
||||||
const allowances: IUploadFileSizeAndTypes[] = []
|
const allowances: IUploadFileSizeAndTypes[] = []
|
||||||
let allowSpeechToText = false
|
let allowSpeechToText = false
|
||||||
|
if (chatflow.speechToText) {
|
||||||
|
const speechToTextProviders = JSON.parse(chatflow.speechToText)
|
||||||
|
for (const provider in speechToTextProviders) {
|
||||||
|
const providerObj = speechToTextProviders[provider]
|
||||||
|
if (providerObj.status) {
|
||||||
|
allowSpeechToText = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let allowImageUploads = false
|
let allowImageUploads = false
|
||||||
flowObj.nodes.forEach((node: IReactFlowNode) => {
|
flowObj.nodes.forEach((node: IReactFlowNode) => {
|
||||||
if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
|
if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
|
||||||
@@ -488,9 +500,6 @@ export class App {
|
|||||||
})
|
})
|
||||||
allowImageUploads = true
|
allowImageUploads = true
|
||||||
}
|
}
|
||||||
if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) {
|
|
||||||
allowSpeechToText = true
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@@ -1602,7 +1611,8 @@ export class App {
|
|||||||
|
|
||||||
if (incomingInput.uploads) {
|
if (incomingInput.uploads) {
|
||||||
// @ts-ignore
|
// @ts-ignore
|
||||||
;(incomingInput.uploads as any[]).forEach((upload: any) => {
|
const uploads = incomingInput.uploads as IFileUpload[]
|
||||||
|
for (const upload of uploads) {
|
||||||
if (upload.type === 'file' || upload.type === 'audio') {
|
if (upload.type === 'file' || upload.type === 'audio') {
|
||||||
const filename = upload.name
|
const filename = upload.name
|
||||||
const dir = path.join(getUserHome(), '.flowise', 'gptvision', chatId)
|
const dir = path.join(getUserHome(), '.flowise', 'gptvision', chatId)
|
||||||
@@ -1618,7 +1628,29 @@ export class App {
|
|||||||
upload.data = chatId
|
upload.data = chatId
|
||||||
upload.type = 'stored-file'
|
upload.type = 'stored-file'
|
||||||
}
|
}
|
||||||
})
|
|
||||||
|
if (upload.mime === 'audio/webm' && incomingInput.uploads?.length === 1) {
|
||||||
|
//speechToText
|
||||||
|
let speechToTextConfig: any = {}
|
||||||
|
if (chatflow.speechToText) {
|
||||||
|
const speechToTextProviders = JSON.parse(chatflow.speechToText)
|
||||||
|
for (const provider in speechToTextProviders) {
|
||||||
|
const providerObj = speechToTextProviders[provider]
|
||||||
|
if (providerObj.status) {
|
||||||
|
speechToTextConfig = providerObj
|
||||||
|
speechToTextConfig['name'] = provider
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (speechToTextConfig) {
|
||||||
|
const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
|
||||||
|
if (speechToTextResult) {
|
||||||
|
incomingInput.question = speechToTextResult
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let isStreamValid = false
|
let isStreamValid = false
|
||||||
|
|||||||
@@ -593,7 +593,6 @@ export const resolveVariables = (
|
|||||||
}
|
}
|
||||||
|
|
||||||
const paramsObj = flowNodeData[types] ?? {}
|
const paramsObj = flowNodeData[types] ?? {}
|
||||||
|
|
||||||
getParamValues(paramsObj)
|
getParamValues(paramsObj)
|
||||||
|
|
||||||
return flowNodeData
|
return flowNodeData
|
||||||
@@ -1079,3 +1078,36 @@ export const getAllValuesFromJson = (obj: any): any[] => {
|
|||||||
extractValues(obj)
|
extractValues(obj)
|
||||||
return values
|
return values
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Placeholder for chatflow-level speech-to-text conversion.
 *
 * No provider call is wired in yet: for any truthy config the function
 * resolves to `undefined`.
 *
 * @param upload - the recorded audio upload (currently unused)
 * @param speechToTextConfig - configuration of the selected speech-to-text provider
 * @returns always `undefined` for now
 * @throws Error when `speechToTextConfig` is falsy
 */
export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => {
    if (!speechToTextConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    // Special case: the text input is empty, but we have an upload (recorded audio).
    // TODO: call the configured provider here. The earlier model-level implementation
    // read the stored file from <home>/.flowise/gptvision/<upload.data>/<upload.name>
    // and sent it to OpenAI ('whisper-1') via audio.transcriptions.create or
    // audio.translations.create, returning the resulting `.text`.
    return undefined
}
|
||||||
|
|||||||
@@ -41,8 +41,8 @@ import chatflowsApi from 'api/chatflows'
|
|||||||
|
|
||||||
const speechToTextProviders = [
|
const speechToTextProviders = [
|
||||||
{
|
{
|
||||||
label: 'OpenAI Wisper',
|
label: 'OpenAI Whisper',
|
||||||
name: 'openAIWisper',
|
name: 'openAIWhisper',
|
||||||
icon: openAISVG,
|
icon: openAISVG,
|
||||||
url: 'https://platform.openai.com/docs/guides/speech-to-text',
|
url: 'https://platform.openai.com/docs/guides/speech-to-text',
|
||||||
inputs: [
|
inputs: [
|
||||||
@@ -70,7 +70,7 @@ const speechToTextProviders = [
|
|||||||
label: 'Connect Credential',
|
label: 'Connect Credential',
|
||||||
name: 'credential',
|
name: 'credential',
|
||||||
type: 'credential',
|
type: 'credential',
|
||||||
credentialNames: ['assemblyAiApi']
|
credentialNames: ['assemblyAIApi']
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
label: 'On/Off',
|
label: 'On/Off',
|
||||||
@@ -101,7 +101,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => {
|
|||||||
})
|
})
|
||||||
if (saveResp.data) {
|
if (saveResp.data) {
|
||||||
enqueueSnackbar({
|
enqueueSnackbar({
|
||||||
message: 'Analytic Configuration Saved',
|
message: 'Speech To Text Configuration Saved',
|
||||||
options: {
|
options: {
|
||||||
key: new Date().getTime() + Math.random(),
|
key: new Date().getTime() + Math.random(),
|
||||||
variant: 'success',
|
variant: 'success',
|
||||||
@@ -118,7 +118,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}`
|
const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}`
|
||||||
enqueueSnackbar({
|
enqueueSnackbar({
|
||||||
message: `Failed to save Analytic Configuration: ${errorData}`,
|
message: `Failed to save Speech To Text Configuration: ${errorData}`,
|
||||||
options: {
|
options: {
|
||||||
key: new Date().getTime() + Math.random(),
|
key: new Date().getTime() + Math.random(),
|
||||||
variant: 'error',
|
variant: 'error',
|
||||||
|
|||||||
Reference in New Issue
Block a user