mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 19:00:59 +03:00
SpeechToText: Adding SpeechToText at the Chatflow level.
This commit is contained in:
@@ -54,7 +54,7 @@ export class NodesPool {
|
||||
}
|
||||
}
|
||||
|
||||
const skipCategories = ['Analytic']
|
||||
const skipCategories = ['Analytic', 'SpeechToText']
|
||||
if (!skipCategories.includes(newNodeInstance.category)) {
|
||||
this.componentNodes[newNodeInstance.name] = newNodeInstance
|
||||
}
|
||||
|
||||
@@ -46,7 +46,8 @@ import {
|
||||
getSessionChatHistory,
|
||||
getAllConnectedNodes,
|
||||
clearSessionMemory,
|
||||
findMemoryNode
|
||||
findMemoryNode,
|
||||
convertedSpeechToText
|
||||
} from './utils'
|
||||
import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
|
||||
import { getDataSource } from './DataSource'
|
||||
@@ -58,7 +59,7 @@ import { Tool } from './database/entities/Tool'
|
||||
import { Assistant } from './database/entities/Assistant'
|
||||
import { ChatflowPool } from './ChatflowPool'
|
||||
import { CachePool } from './CachePool'
|
||||
import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters } from 'flowise-components'
|
||||
import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
|
||||
import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
|
||||
import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
|
||||
import { sanitizeMiddleware } from './utils/XSS'
|
||||
@@ -473,6 +474,17 @@ export class App {
|
||||
const flowObj = JSON.parse(chatflow.flowData)
|
||||
const allowances: IUploadFileSizeAndTypes[] = []
|
||||
let allowSpeechToText = false
|
||||
if (chatflow.speechToText) {
|
||||
const speechToTextProviders = JSON.parse(chatflow.speechToText)
|
||||
for (const provider in speechToTextProviders) {
|
||||
const providerObj = speechToTextProviders[provider]
|
||||
if (providerObj.status) {
|
||||
allowSpeechToText = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let allowImageUploads = false
|
||||
flowObj.nodes.forEach((node: IReactFlowNode) => {
|
||||
if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
|
||||
@@ -488,9 +500,6 @@ export class App {
|
||||
})
|
||||
allowImageUploads = true
|
||||
}
|
||||
if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) {
|
||||
allowSpeechToText = true
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
@@ -1602,7 +1611,8 @@ export class App {
|
||||
|
||||
if (incomingInput.uploads) {
|
||||
// @ts-ignore
|
||||
;(incomingInput.uploads as any[]).forEach((upload: any) => {
|
||||
const uploads = incomingInput.uploads as IFileUpload[]
|
||||
for (const upload of uploads) {
|
||||
if (upload.type === 'file' || upload.type === 'audio') {
|
||||
const filename = upload.name
|
||||
const dir = path.join(getUserHome(), '.flowise', 'gptvision', chatId)
|
||||
@@ -1618,7 +1628,29 @@ export class App {
|
||||
upload.data = chatId
|
||||
upload.type = 'stored-file'
|
||||
}
|
||||
})
|
||||
|
||||
if (upload.mime === 'audio/webm' && incomingInput.uploads?.length === 1) {
|
||||
//speechToText
|
||||
let speechToTextConfig: any = {}
|
||||
if (chatflow.speechToText) {
|
||||
const speechToTextProviders = JSON.parse(chatflow.speechToText)
|
||||
for (const provider in speechToTextProviders) {
|
||||
const providerObj = speechToTextProviders[provider]
|
||||
if (providerObj.status) {
|
||||
speechToTextConfig = providerObj
|
||||
speechToTextConfig['name'] = provider
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if (speechToTextConfig) {
|
||||
const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
|
||||
if (speechToTextResult) {
|
||||
incomingInput.question = speechToTextResult
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let isStreamValid = false
|
||||
|
||||
@@ -593,7 +593,6 @@ export const resolveVariables = (
|
||||
}
|
||||
|
||||
const paramsObj = flowNodeData[types] ?? {}
|
||||
|
||||
getParamValues(paramsObj)
|
||||
|
||||
return flowNodeData
|
||||
@@ -1079,3 +1078,36 @@ export const getAllValuesFromJson = (obj: any): any[] => {
|
||||
extractValues(obj)
|
||||
return values
|
||||
}
|
||||
|
||||
/**
 * Converts a recorded audio upload into text using the chatflow's enabled speech-to-text provider.
 *
 * NOTE: the actual transcription call is not implemented yet (only a commented-out sketch
 * exists below), so for any selected provider this currently resolves to `undefined`.
 *
 * @param upload - The recorded audio upload; unused until transcription is wired in.
 * @param speechToTextConfig - Settings of the enabled provider. A falsy value or an object
 *   without any keys means that no provider is selected.
 * @returns The transcribed text, or `undefined` while transcription is unimplemented.
 * @throws Error when no speech-to-text provider is selected.
 */
export const convertedSpeechToText = async (upload: unknown, speechToTextConfig: unknown): Promise<string | undefined> => {
    // An empty object is truthy, so a plain truthiness check would let a "no provider enabled"
    // default of `{}` slip through and make the error below unreachable.
    const isProviderSelected =
        Boolean(speechToTextConfig) && !(typeof speechToTextConfig === 'object' && Object.keys(speechToTextConfig as object).length === 0)

    if (!isProviderSelected) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    // TODO: implement the provider call. Sketch kept from the original placeholder:
    //
    // special case, text input is empty, but we have an upload (recorded audio)
    // const MODEL_NAME = 'whisper-1'
    // const openAIClientOptions: ClientOptions = {
    //     apiKey: model.openAIApiKey,
    //     organization: model.organization
    // }
    // const openAIClient = new OpenAIClient(openAIClientOptions)
    // const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
    //
    // // as the image is stored in the server, read the file and convert it to base64
    // const audio_file = fs.createReadStream(filePath)
    //
    // if (multiModalConfig.speechToTextMode === 'transcriptions') {
    //     const transcription = await openAIClient.audio.transcriptions.create({
    //         file: audio_file,
    //         model: MODEL_NAME
    //     })
    //     return transcription.text
    // } else if (multiModalConfig.speechToTextMode === 'translations') {
    //     const translation = await openAIClient.audio.translations.create({
    //         file: audio_file,
    //         model: MODEL_NAME
    //     })
    //     return translation.text
    // }

    return undefined
}
|
||||
|
||||
Reference in New Issue
Block a user