Merge pull request #1419 from vinodkiran/FEATURE/Vision

FEATURE: Add Multi Modal Capabilities to Flowise
This commit is contained in:
Henry Heng
2024-02-27 11:58:47 +08:00
committed by GitHub
91 changed files with 4345 additions and 353 deletions
+29
View File
@@ -21,6 +21,8 @@ export type CommonType = string | number | boolean | undefined | null
export type MessageType = 'apiMessage' | 'userMessage'
/** Allowed values for the optional `detail` field of an image URL message part (see `MessageContentImageUrl`). */
export type ImageDetail = 'auto' | 'low' | 'high'
/**
* Others
*/
@@ -146,6 +148,33 @@ export interface IUsedTool {
toolOutput: string | object
}
/**
 * Describes a single file attached to a chat request.
 */
export interface IFileUpload {
// Optional payload carried with the upload. NOTE(review): presumably a data URL or remote URL — confirm against the sender.
data?: string
// Kind of upload; the value 'stored-file' is used for files that are read back from the storage directory.
type: string
// File name; joined onto the storage path to locate a stored file.
name: string
// MIME type; its 'image/' or 'audio/' prefix is used to classify the upload.
mime: string
}
/**
 * Per-modality settings for multi-modal input. Both entries are optional, loosely typed bags of options.
 */
export interface IMultiModalOption {
// Image settings; `allowImageUploads` and `imageResolution` are the keys read by `addImagesToMessages`.
image?: Record<string, any>
// Audio settings. NOTE(review): no reader of this field is visible here — confirm the expected keys.
audio?: Record<string, any>
}
/** A message content part holding plain text, discriminated by `type: 'text'`. */
export type MessageContentText = {
type: 'text'
text: string
}
/**
 * A message content part referencing an image, discriminated by `type: 'image_url'`.
 * `image_url` is either a bare string or an object with the `url` and an optional `detail` level.
 */
export type MessageContentImageUrl = {
type: 'image_url'
image_url:
| string
| {
url: string
detail?: ImageDetail
}
}
/**
* Classes
*/
+1
View File
@@ -6,3 +6,4 @@ dotenv.config({ path: envPath, override: true })
export * from './Interface'
export * from './utils'
export * from './speechToText'
@@ -0,0 +1,48 @@
import { ICommonObject, IFileUpload, IMultiModalOption, INodeData, MessageContentImageUrl } from './Interface'
import { ChatOpenAI as LangchainChatOpenAI } from 'langchain/chat_models/openai'
import path from 'path'
import { getStoragePath } from './utils'
import fs from 'fs'
/**
 * Build `image_url` message parts from the images uploaded with the current chat request.
 *
 * Images are collected only when `nodeData.inputs.model` is a LangChain `ChatOpenAI` instance,
 * `multiModalOption.image.allowImageUploads` is truthy and `options.uploads` is non-empty.
 * Only image uploads whose `type` is `'stored-file'` are included: each file is read from
 * `<storage path>/<chatflowid>/<chatId>/<name>` and inlined as a base64 data URL.
 *
 * @param nodeData - node whose `inputs.model` decides whether images are supported
 * @param options - request options; `uploads`, `chatflowid` and `chatId` are read
 * @param multiModalOption - image settings (`allowImageUploads`, `imageResolution`)
 * @returns the collected image parts, or an empty array when the preconditions are not met
 * @throws Error when an upload resolves to a path outside the storage directory, or the file cannot be read
 */
export const addImagesToMessages = (
    nodeData: INodeData,
    options: ICommonObject,
    multiModalOption?: IMultiModalOption
): MessageContentImageUrl[] => {
    const imageContent: MessageContentImageUrl[] = []
    const model = nodeData.inputs?.model
    const imageOption = multiModalOption?.image

    if (!(model instanceof LangchainChatOpenAI) || !imageOption?.allowImageUploads || !options?.uploads?.length) {
        return imageContent
    }

    const storageRoot = path.resolve(getStoragePath())

    for (const upload of getImageUploads(options.uploads)) {
        // Only files already stored on the server are inlined.
        if (upload.type !== 'stored-file') continue

        const filePath = path.resolve(path.join(storageRoot, options.chatflowid, options.chatId, upload.name))

        // The upload name comes from the client: refuse anything (e.g. `../` segments) that
        // would make the server read a file outside of the storage directory.
        const relativePath = path.relative(storageRoot, filePath)
        if (relativePath === '..' || relativePath.startsWith('..' + path.sep) || path.isAbsolute(relativePath)) {
            throw new Error(`Invalid upload file name: ${upload.name}`)
        }

        // as the image is stored in the server, read the file and convert it to base64
        const contents = fs.readFileSync(filePath)
        imageContent.push({
            type: 'image_url',
            image_url: {
                url: 'data:' + upload.mime + ';base64,' + contents.toString('base64'),
                detail: imageOption.imageResolution ?? 'low'
            }
        })
    }

    return imageContent
}
/**
 * Select the uploads whose MIME type starts with `audio/`.
 *
 * @param uploads - uploads to inspect; the array is not modified
 * @returns a new array with the audio uploads, in their original order
 */
export const getAudioUploads = (uploads: IFileUpload[]) => {
    const isAudio = ({ mime }: IFileUpload): boolean => mime.startsWith('audio/')
    return uploads.filter(isAudio)
}
/**
 * Select the uploads whose MIME type starts with `image/`.
 *
 * @param uploads - uploads to inspect; the array is not modified
 * @returns a new array with the image uploads, in their original order
 */
export const getImageUploads = (uploads: IFileUpload[]) => {
    const isImage = ({ mime }: IFileUpload): boolean => mime.startsWith('image/')
    return uploads.filter(isImage)
}
+51
View File
@@ -0,0 +1,51 @@
import { ICommonObject, IFileUpload } from './Interface'
import { getCredentialData, getStoragePath } from './utils'
import { type ClientOptions, OpenAIClient } from '@langchain/openai'
import fs from 'fs'
import path from 'path'
import { AssemblyAI } from 'assemblyai'
/**
 * Transcribe a stored audio upload with the configured speech-to-text provider.
 *
 * The audio file is read from `<storage path>/<chatflowid>/<chatId>/<upload.name>`.
 * Supported `speechToTextConfig.name` values are `'openAIWhisper'` and `'assemblyAiTranscribe'`;
 * any other value yields `undefined` without opening the file.
 *
 * @param upload - the audio upload; only `name` is read
 * @param speechToTextConfig - provider configuration (`name`, `credentialId`, and for Whisper `language`, `temperature`, `prompt`)
 * @param options - request options; `chatflowid` and `chatId` are read, and the object is passed to `getCredentialData`
 * @returns the transcribed text, or `undefined` when the provider is unknown or returns no text
 * @throws Error when `speechToTextConfig` is missing or the upload resolves outside the storage directory
 */
export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
    if (!speechToTextConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    const credentialId = speechToTextConfig.credentialId as string
    const credentialData = await getCredentialData(credentialId ?? '', options)

    const storageRoot = path.resolve(getStoragePath())
    const filePath = path.resolve(path.join(storageRoot, options.chatflowid, options.chatId, upload.name))

    // The upload name comes from the client: refuse anything (e.g. `../` segments) that
    // would make the server read a file outside of the storage directory.
    const relativePath = path.relative(storageRoot, filePath)
    if (relativePath === '..' || relativePath.startsWith('..' + path.sep) || path.isAbsolute(relativePath)) {
        throw new Error(`Invalid upload file name: ${upload.name}`)
    }

    // The stream is opened only inside a provider branch and always destroyed afterwards, so an
    // unknown provider or a failing API call cannot leave a file descriptor open.
    let audioFile: ReturnType<typeof fs.createReadStream> | undefined
    try {
        if (speechToTextConfig.name === 'openAIWhisper') {
            audioFile = fs.createReadStream(filePath)
            const openAIClientOptions: ClientOptions = {
                apiKey: credentialData.openAIApiKey
            }
            const openAIClient = new OpenAIClient(openAIClientOptions)
            const transcription = await openAIClient.audio.transcriptions.create({
                file: audioFile,
                model: 'whisper-1',
                language: speechToTextConfig?.language,
                temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
                prompt: speechToTextConfig?.prompt
            })
            if (transcription?.text) {
                return transcription.text
            }
        } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
            audioFile = fs.createReadStream(filePath)
            const client = new AssemblyAI({
                apiKey: credentialData.assemblyAIApiKey
            })
            const params = {
                audio: audioFile,
                speaker_labels: false
            }
            const transcription = await client.transcripts.transcribe(params)
            if (transcription?.text) {
                return transcription.text
            }
        }
    } finally {
        audioFile?.destroy()
    }

    return undefined
}
+7
View File
@@ -770,3 +770,10 @@ export const prepareSandboxVars = (variables: IVariable[]) => {
}
return vars
}
/**
 * Resolve the directory used for stored uploads.
 *
 * Uses the `BLOB_STORAGE_PATH` environment variable (normalized) when it is set to a
 * non-empty value; otherwise falls back to `.flowise/storage` under the user's home directory.
 *
 * @returns the storage directory path
 */
export const getStoragePath = (): string => {
    const configuredPath = process.env.BLOB_STORAGE_PATH
    if (configuredPath) {
        return path.join(configuredPath)
    }
    return path.join(getUserHome(), '.flowise', 'storage')
}