mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 15:00:57 +03:00
SpeechToText: Adding SpeechToText at the Chatflow level.
This commit is contained in:
@@ -38,7 +38,7 @@ export class FlowiseChatOpenAI extends ChatOpenAI {
|
|||||||
const nodeData = FlowiseChatOpenAI.chainNodeData
|
const nodeData = FlowiseChatOpenAI.chainNodeData
|
||||||
const optionsData = FlowiseChatOpenAI.chainNodeOptions
|
const optionsData = FlowiseChatOpenAI.chainNodeOptions
|
||||||
const messageContent = addImagesToMessages(nodeData, optionsData)
|
const messageContent = addImagesToMessages(nodeData, optionsData)
|
||||||
if (messageContent) {
|
if (messageContent?.length) {
|
||||||
if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
|
if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) {
|
||||||
const lastMessage = messages[0].pop()
|
const lastMessage = messages[0].pop()
|
||||||
if (lastMessage instanceof HumanMessage) {
|
if (lastMessage instanceof HumanMessage) {
|
||||||
|
|||||||
@@ -40,6 +40,7 @@
|
|||||||
"@upstash/redis": "^1.22.1",
|
"@upstash/redis": "^1.22.1",
|
||||||
"@zilliz/milvus2-sdk-node": "^2.2.24",
|
"@zilliz/milvus2-sdk-node": "^2.2.24",
|
||||||
"apify-client": "^2.7.1",
|
"apify-client": "^2.7.1",
|
||||||
|
"assemblyai": "^4.2.2",
|
||||||
"axios": "1.6.2",
|
"axios": "1.6.2",
|
||||||
"cheerio": "^1.0.0-rc.12",
|
"cheerio": "^1.0.0-rc.12",
|
||||||
"chromadb": "^1.5.11",
|
"chromadb": "^1.5.11",
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import { ICommonObject, INodeData } from './Interface'
|
import { ICommonObject, INodeData } from './Interface'
|
||||||
import { BaseChatModel } from 'langchain/chat_models/base'
|
import { BaseChatModel } from 'langchain/chat_models/base'
|
||||||
import { type ClientOptions, OpenAIClient } from '@langchain/openai'
|
|
||||||
import { ChatOpenAI } from 'langchain/chat_models/openai'
|
import { ChatOpenAI } from 'langchain/chat_models/openai'
|
||||||
import path from 'path'
|
import path from 'path'
|
||||||
import { getUserHome } from './utils'
|
import { getUserHome } from './utils'
|
||||||
|
|||||||
@@ -6,3 +6,4 @@ dotenv.config({ path: envPath, override: true })
|
|||||||
|
|
||||||
export * from './Interface'
|
export * from './Interface'
|
||||||
export * from './utils'
|
export * from './utils'
|
||||||
|
export * from './speechToText'
|
||||||
|
|||||||
@@ -0,0 +1,49 @@
|
|||||||
|
import { ICommonObject } from './Interface'
|
||||||
|
import { getCredentialData, getUserHome } from './utils'
|
||||||
|
import { type ClientOptions, OpenAIClient } from '@langchain/openai'
|
||||||
|
import fs from 'fs'
|
||||||
|
import path from 'path'
|
||||||
|
import { AssemblyAI } from 'assemblyai'
|
||||||
|
|
||||||
|
/**
 * Transcribe a recorded audio upload with the speech-to-text provider configured for the chatflow.
 *
 * @param upload - Upload descriptor. `upload.data` and `upload.name` are joined beneath
 *   `<user home>/.flowise/gptvision` to locate the stored audio file.
 * @param speechToTextConfig - Provider configuration. `name` selects the provider
 *   (`'openAIWhisper'` or `'assemblyAiTranscribe'`) and `credentialId` is handed to `getCredentialData`.
 * @param options - Passed through unchanged to `getCredentialData`.
 * @returns The transcribed text, or `undefined` when the provider name is not recognised or the
 *   provider response carries no text.
 * @throws Error when `speechToTextConfig` is falsy, i.e. audio was recorded but no provider is selected.
 */
export const convertSpeechToText = async (upload: any, speechToTextConfig: any, options: ICommonObject) => {
    if (!speechToTextConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    const credentialId = speechToTextConfig.credentialId as string
    const credentialData = await getCredentialData(credentialId ?? '', options)

    // NOTE(review): upload.data / upload.name are joined into the path without validation here;
    // confirm the caller only passes server-generated values.
    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)

    // The recorded audio is stored on the server and is streamed to the provider.
    // The stream is opened lazily so that an unrecognised provider name never leaves an
    // unconsumed read stream (and its file descriptor) behind.
    const openAudioStream = () => fs.createReadStream(filePath)

    if (speechToTextConfig.name === 'openAIWhisper') {
        const openAIClientOptions: ClientOptions = {
            apiKey: credentialData.openAIApiKey
        }
        const openAIClient = new OpenAIClient(openAIClientOptions)

        const transcription = await openAIClient.audio.transcriptions.create({
            file: openAudioStream(),
            model: 'whisper-1'
        })
        if (transcription?.text) {
            return transcription.text
        }
    } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
        const client = new AssemblyAI({
            apiKey: credentialData.assemblyAIApiKey
        })

        const params = {
            audio: openAudioStream(),
            speaker_labels: false
        }

        const transcription = await client.transcripts.transcribe(params)
        if (transcription?.text) {
            return transcription.text
        }
    }

    return undefined
}
|
||||||
@@ -46,8 +46,7 @@ import {
|
|||||||
getSessionChatHistory,
|
getSessionChatHistory,
|
||||||
getAllConnectedNodes,
|
getAllConnectedNodes,
|
||||||
clearSessionMemory,
|
clearSessionMemory,
|
||||||
findMemoryNode,
|
findMemoryNode
|
||||||
convertedSpeechToText
|
|
||||||
} from './utils'
|
} from './utils'
|
||||||
import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
|
import { cloneDeep, omit, uniqWith, isEqual } from 'lodash'
|
||||||
import { getDataSource } from './DataSource'
|
import { getDataSource } from './DataSource'
|
||||||
@@ -59,7 +58,15 @@ import { Tool } from './database/entities/Tool'
|
|||||||
import { Assistant } from './database/entities/Assistant'
|
import { Assistant } from './database/entities/Assistant'
|
||||||
import { ChatflowPool } from './ChatflowPool'
|
import { ChatflowPool } from './ChatflowPool'
|
||||||
import { CachePool } from './CachePool'
|
import { CachePool } from './CachePool'
|
||||||
import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components'
|
import {
|
||||||
|
ICommonObject,
|
||||||
|
IMessage,
|
||||||
|
INodeOptionsValue,
|
||||||
|
INodeParams,
|
||||||
|
handleEscapeCharacters,
|
||||||
|
convertSpeechToText,
|
||||||
|
IFileUpload
|
||||||
|
} from 'flowise-components'
|
||||||
import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
|
import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
|
||||||
import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
|
import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
|
||||||
import { sanitizeMiddleware } from './utils/XSS'
|
import { sanitizeMiddleware } from './utils/XSS'
|
||||||
@@ -1644,7 +1651,11 @@ export class App {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (speechToTextConfig) {
|
if (speechToTextConfig) {
|
||||||
const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig)
|
const options: ICommonObject = {
|
||||||
|
appDataSource: this.AppDataSource,
|
||||||
|
databaseEntities: databaseEntities
|
||||||
|
}
|
||||||
|
const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
|
||||||
if (speechToTextResult) {
|
if (speechToTextResult) {
|
||||||
incomingInput.question = speechToTextResult
|
incomingInput.question = speechToTextResult
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1078,36 +1078,3 @@ export const getAllValuesFromJson = (obj: any): any[] => {
|
|||||||
extractValues(obj)
|
extractValues(obj)
|
||||||
return values
|
return values
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Placeholder speech-to-text hook: it validates that a provider is configured but performs
 * no transcription itself.
 *
 * @param upload - Recorded audio upload; not read by this function.
 * @param speechToTextConfig - Speech-to-text configuration; only checked for truthiness.
 * @returns Always `undefined` when a configuration is present.
 * @throws Error when `speechToTextConfig` is falsy, i.e. audio was recorded but no provider is selected.
 */
export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => {
    if (!speechToTextConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }
    // Special case handled by the caller: the text input is empty but a recorded audio upload exists.
    // No provider call is made here, so there is never a transcription to return.
    return undefined
}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user