mirror of
https://github.com/farcasclaudiu/Flowise.git
synced 2026-06-28 19:00:59 +03:00
Merge pull request #1419 from vinodkiran/FEATURE/Vision
FEATURE: Add Multi Modal Capabilities to Flowise
This commit is contained in:
@@ -21,6 +21,8 @@ export type CommonType = string | number | boolean | undefined | null
|
||||
|
||||
/** Tag for a chat message; presumably separates API-produced messages from user-sent ones — confirm against usage. */
export type MessageType = 'apiMessage' | 'userMessage'
|
||||
|
||||
/** Allowed values for the optional `detail` field of an `image_url` message part. */
export type ImageDetail = 'auto' | 'low' | 'high'
|
||||
|
||||
/**
|
||||
* Others
|
||||
*/
|
||||
@@ -146,6 +148,33 @@ export interface IUsedTool {
|
||||
toolOutput: string | object
|
||||
}
|
||||
|
||||
/**
 * Describes one file uploaded alongside a chat request.
 */
export interface IFileUpload {
|
||||
// Optional inline payload; files of type 'stored-file' are read from the storage path instead.
data?: string
|
||||
// Upload kind; the code in this change only special-cases the value 'stored-file'.
type: string
|
||||
// File name, joined onto <storage path>/<chatflowid>/<chatId> to locate a stored file.
name: string
|
||||
// MIME type; its 'image/' or 'audio/' prefix decides how the upload is routed.
mime: string
|
||||
}
|
||||
|
||||
/**
 * Per-modality settings for multi-modal input.
 */
export interface IMultiModalOption {
|
||||
// Image settings; `allowImageUploads` and `imageResolution` are the keys read when images are attached.
image?: Record<string, any>
|
||||
// Audio settings; keys are not visible in this change — TODO confirm against the consumer.
audio?: Record<string, any>
|
||||
}
|
||||
|
||||
/**
 * Text part of a message's content, tagged with the literal type 'text'.
 */
export type MessageContentText = {
|
||||
type: 'text'
|
||||
text: string
|
||||
}
|
||||
|
||||
/**
 * Image part of a message's content, tagged with the literal type 'image_url'.
 * `image_url` is either a bare string or an object holding the `url` and an
 * optional `detail` level.
 */
export type MessageContentImageUrl = {
|
||||
type: 'image_url'
|
||||
image_url:
|
||||
| string
|
||||
| {
|
||||
url: string
|
||||
|
||||
detail?: ImageDetail
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Classes
|
||||
*/
|
||||
|
||||
@@ -6,3 +6,4 @@ dotenv.config({ path: envPath, override: true })
|
||||
|
||||
export * from './Interface'
|
||||
export * from './utils'
|
||||
export * from './speechToText'
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
import { ICommonObject, IFileUpload, IMultiModalOption, INodeData, MessageContentImageUrl } from './Interface'
|
||||
import { ChatOpenAI as LangchainChatOpenAI } from 'langchain/chat_models/openai'
|
||||
import path from 'path'
|
||||
import { getStoragePath } from './utils'
|
||||
import fs from 'fs'
|
||||
|
||||
/**
 * Builds the `image_url` message parts for the images uploaded with the current request.
 *
 * Images are collected only when the node's `model` input is a LangChain `ChatOpenAI`
 * instance and `multiModalOption.image.allowImageUploads` is truthy. Only uploads of
 * type `stored-file` are converted: each is read from
 * `<storage path>/<chatflowid>/<chatId>/<upload.name>` and inlined as a base64 data URL.
 * Uploads of any other type are skipped.
 *
 * @param nodeData - Node whose `inputs.model` decides whether images are attached.
 * @param options - Request options; `uploads`, `chatflowid` and `chatId` are read.
 * @param multiModalOption - Multi-modal settings; `image.imageResolution` becomes the
 * `detail` level and falls back to `'low'`.
 * @returns One `image_url` entry per stored image upload; empty when nothing applies.
 * @throws Error when an upload resolves to a path outside the storage directory.
 */
export const addImagesToMessages = (
    nodeData: INodeData,
    options: ICommonObject,
    multiModalOption?: IMultiModalOption
): MessageContentImageUrl[] => {
    const imageContent: MessageContentImageUrl[] = []
    const model = nodeData.inputs?.model
    const imageOption = multiModalOption?.image

    if (!(model instanceof LangchainChatOpenAI) || !imageOption || !imageOption.allowImageUploads) {
        return imageContent
    }
    if (!options?.uploads || !(options.uploads.length > 0)) {
        return imageContent
    }

    const storageRoot = getStoragePath()
    for (const upload of getImageUploads(options.uploads)) {
        // Only files already persisted on the server are attached.
        if (upload.type !== 'stored-file') continue

        const filePath = path.join(storageRoot, options.chatflowid, options.chatId, upload.name)

        // chatflowid, chatId and the file name come from the request, so refuse any
        // combination whose `..` segments (or absolute parts) escape the storage directory.
        const relativePath = path.relative(storageRoot, filePath)
        if (relativePath === '..' || relativePath.startsWith('..' + path.sep) || path.isAbsolute(relativePath)) {
            throw new Error(`Invalid upload path: ${upload.name}`)
        }

        // as the image is stored in the server, read the file and convert it to base64
        const contents = fs.readFileSync(filePath)
        const dataUrl = 'data:' + upload.mime + ';base64,' + contents.toString('base64')

        imageContent.push({
            type: 'image_url',
            image_url: {
                url: dataUrl,
                detail: imageOption.imageResolution ?? 'low'
            }
        })
    }

    return imageContent
}
|
||||
|
||||
/**
 * Selects the audio files from a list of uploads.
 *
 * @param uploads - Uploads to inspect; the array itself is not modified.
 * @returns A new array holding the uploads whose MIME type starts with `audio/`.
 */
export const getAudioUploads = (uploads: IFileUpload[]) => {
    const isAudio = ({ mime }: IFileUpload): boolean => mime.startsWith('audio/')
    return uploads.filter(isAudio)
}
|
||||
|
||||
/**
 * Selects the image files from a list of uploads.
 *
 * @param uploads - Uploads to inspect; the array itself is not modified.
 * @returns A new array holding the uploads whose MIME type starts with `image/`.
 */
export const getImageUploads = (uploads: IFileUpload[]) => {
    const isImage = ({ mime }: IFileUpload): boolean => mime.startsWith('image/')
    return uploads.filter(isImage)
}
|
||||
@@ -0,0 +1,51 @@
|
||||
import { ICommonObject, IFileUpload } from './Interface'
|
||||
import { getCredentialData, getStoragePath } from './utils'
|
||||
import { type ClientOptions, OpenAIClient } from '@langchain/openai'
|
||||
import fs from 'fs'
|
||||
import path from 'path'
|
||||
import { AssemblyAI } from 'assemblyai'
|
||||
|
||||
/**
 * Transcribes a stored audio upload with the configured speech-to-text provider.
 *
 * The audio is read from `<storage path>/<chatflowid>/<chatId>/<upload.name>` and sent to
 * OpenAI Whisper (`name === 'openAIWhisper'`) or AssemblyAI
 * (`name === 'assemblyAiTranscribe'`). Any other provider name yields `undefined`.
 *
 * @param upload - The audio upload; only `name` is read.
 * @param speechToTextConfig - Provider settings: `name`, `credentialId`, and for Whisper
 * the optional `language`, `temperature` and `prompt`.
 * @param options - Request options; `chatflowid` and `chatId` are read, and the object is
 * forwarded to the credential lookup.
 * @returns The transcribed text, or `undefined` when the provider is not recognised or
 * returned no text.
 * @throws Error when no speech-to-text configuration is supplied, or when the upload
 * resolves to a path outside the storage directory.
 */
export const convertSpeechToText = async (
    upload: IFileUpload,
    speechToTextConfig: ICommonObject,
    options: ICommonObject
): Promise<string | undefined> => {
    if (!speechToTextConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    const credentialId = speechToTextConfig.credentialId as string
    const credentialData = await getCredentialData(credentialId ?? '', options)

    const storageRoot = getStoragePath()
    const filePath = path.join(storageRoot, options.chatflowid, options.chatId, upload.name)

    // chatflowid, chatId and the file name come from the request, so refuse any
    // combination whose `..` segments (or absolute parts) escape the storage directory.
    const relativePath = path.relative(storageRoot, filePath)
    if (relativePath === '..' || relativePath.startsWith('..' + path.sep) || path.isAbsolute(relativePath)) {
        throw new Error(`Invalid upload path: ${upload.name}`)
    }

    const audio_file = fs.createReadStream(filePath)
    try {
        if (speechToTextConfig.name === 'openAIWhisper') {
            const openAIClientOptions: ClientOptions = {
                apiKey: credentialData.openAIApiKey
            }
            const openAIClient = new OpenAIClient(openAIClientOptions)

            const transcription = await openAIClient.audio.transcriptions.create({
                file: audio_file,
                model: 'whisper-1',
                language: speechToTextConfig?.language,
                temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
                prompt: speechToTextConfig?.prompt
            })
            if (transcription?.text) {
                return transcription.text
            }
        } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
            const client = new AssemblyAI({
                apiKey: credentialData.assemblyAIApiKey
            })

            const params = {
                audio: audio_file,
                speaker_labels: false
            }

            const transcription = await client.transcripts.transcribe(params)
            if (transcription?.text) {
                return transcription.text
            }
        }
    } finally {
        // Release the file descriptor even when no provider consumed the stream
        // (unknown provider name) or the provider call failed part-way through.
        audio_file.destroy()
    }

    return undefined
}
|
||||
@@ -770,3 +770,10 @@ export const prepareSandboxVars = (variables: IVariable[]) => {
|
||||
}
|
||||
return vars
|
||||
}
|
||||
|
||||
/**
 * Prepare storage path
 *
 * @returns The normalised `BLOB_STORAGE_PATH` environment variable when it is set to a
 * non-empty value, otherwise `<user home>/.flowise/storage`.
 */
export const getStoragePath = (): string => {
    const configuredPath = process.env.BLOB_STORAGE_PATH
    if (configuredPath) {
        return path.join(configuredPath)
    }
    return path.join(getUserHome(), '.flowise', 'storage')
}
|
||||
|
||||
Reference in New Issue
Block a user