Merge pull request #1419 from vinodkiran/FEATURE/Vision

FEATURE: Add Multi Modal Capabilities to Flowise
2026-06-28 19:00:59 +03:00 · 2024-02-27 11:58:47 +08:00
parent 714f82a234 68ac61c95f
commit a134ea85eb
91 changed files with 4345 additions and 353 deletions
@@ -21,6 +21,8 @@ export type CommonType = string | number | boolean | undefined | null

 export type MessageType = 'apiMessage' | 'userMessage'

+/** Allowed values for the optional `detail` field of an image URL content part (see `MessageContentImageUrl`). */
+export type ImageDetail = 'auto' | 'low' | 'high'
+
 /**
 * Others
 */
@@ -146,6 +148,33 @@ export interface IUsedTool {
    toolOutput: string | object
 }

+/**
+ * Describes one uploaded file, as passed to the multi-modal and speech-to-text helpers.
+ */
+export interface IFileUpload {
+    /** Optional inline payload. NOTE(review): presumably a data URI or URL - confirm against the producer of uploads. */
+    data?: string
+    /** Kind of upload; the value `'stored-file'` is treated by the helpers as a file saved under the storage path. */
+    type: string
+    /** File name; joined onto `<storage path>/<chatflowid>/<chatId>/` to locate a stored file. */
+    name: string
+    /** MIME type; the helpers select uploads by the `image/` and `audio/` prefixes. */
+    mime: string
+}
+
+/**
+ * Per-modality option bags. Both entries are optional and loosely typed.
+ */
+export interface IMultiModalOption {
+    /** Image options; `addImagesToMessages` reads `allowImageUploads` and `imageResolution` from it. */
+    image?: Record<string, any>
+    /** Audio options; the keys it carries are not visible in this part of the code. */
+    audio?: Record<string, any>
+}
+
+/** Message content part holding plain text, discriminated by `type: 'text'`. */
+export type MessageContentText = {
+    type: 'text'
+    text: string
+}
+
+/**
+ * Message content part referencing an image, discriminated by `type: 'image_url'`.
+ * `image_url` is either a bare string or an object carrying the `url` plus an optional `detail` level.
+ */
+export type MessageContentImageUrl = {
+    type: 'image_url'
+    image_url:
+        | string
+        | {
+              url: string
+              detail?: ImageDetail
+          }
+}
+
 /**
 * Classes
 */
@@ -6,3 +6,4 @@ dotenv.config({ path: envPath, override: true })

 export * from './Interface'
 export * from './utils'
+export * from './speechToText'
@@ -0,0 +1,48 @@
+import { ICommonObject, IFileUpload, IMultiModalOption, INodeData, MessageContentImageUrl } from './Interface'
+import { ChatOpenAI as LangchainChatOpenAI } from 'langchain/chat_models/openai'
+import path from 'path'
+import { getStoragePath } from './utils'
+import fs from 'fs'
+
+/**
+ * Build the `image_url` content parts for the image uploads attached to the current request.
+ *
+ * Images are collected only when all of the following hold:
+ *  - `nodeData.inputs.model` is a LangChain `ChatOpenAI` instance,
+ *  - `multiModalOption.image.allowImageUploads` is truthy,
+ *  - `options.uploads` contains at least one upload whose MIME type starts with `image/`.
+ *
+ * Uploads of type `'stored-file'` are read from
+ * `<storage path>/<chatflowid>/<chatId>/<upload.name>` and embedded as a base64 data URI.
+ * Any other image upload that carries a non-empty `data` value is forwarded with that value
+ * used verbatim as the URL (presumably a remote URL or a data URI - confirm against the uploader);
+ * previously such uploads were silently dropped.
+ *
+ * @param nodeData - node whose `inputs.model` decides whether images are supported
+ * @param options - request options; `uploads`, `chatflowid` and `chatId` are read
+ * @param multiModalOption - multi-modal settings; `image.imageResolution` sets the detail level (default `'low'`)
+ * @returns the image content parts, in upload order; empty when nothing qualifies
+ * @throws Error when a stored file resolves to a location outside the storage directory;
+ *         file system errors from reading the stored file propagate unchanged
+ */
+export const addImagesToMessages = (
+    nodeData: INodeData,
+    options: ICommonObject,
+    multiModalOption?: IMultiModalOption
+): MessageContentImageUrl[] => {
+    const imageContent: MessageContentImageUrl[] = []
+    const model = nodeData.inputs?.model
+    const imageOption = multiModalOption?.image
+
+    if (!(model instanceof LangchainChatOpenAI) || !imageOption || !imageOption.allowImageUploads) {
+        return imageContent
+    }
+
+    const uploads = options?.uploads
+    if (!uploads || !(uploads.length > 0)) {
+        return imageContent
+    }
+
+    const detail = imageOption.imageResolution ?? 'low'
+
+    for (const upload of getImageUploads(uploads)) {
+        if (upload.type === 'stored-file') {
+            // The name and ids originate from the request, so make sure the resolved
+            // location cannot escape the storage directory through `..` segments.
+            const storageRoot = path.resolve(getStoragePath())
+            const filePath = path.resolve(path.join(storageRoot, options.chatflowid, options.chatId, upload.name))
+            const relative = path.relative(storageRoot, filePath)
+            if (relative === '' || relative === '..' || relative.startsWith('..' + path.sep) || path.isAbsolute(relative)) {
+                throw new Error(`Invalid upload path: ${upload.name}`)
+            }
+
+            // as the image is stored in the server, read the file and convert it to base64
+            const contents = fs.readFileSync(filePath)
+            imageContent.push({
+                type: 'image_url',
+                image_url: {
+                    url: 'data:' + upload.mime + ';base64,' + contents.toString('base64'),
+                    detail
+                }
+            })
+        } else if (upload.data) {
+            // Upload already carries its own payload; pass it through untouched.
+            imageContent.push({
+                type: 'image_url',
+                image_url: {
+                    url: upload.data,
+                    detail
+                }
+            })
+        }
+    }
+
+    return imageContent
+}
+
+/**
+ * Select the uploads whose MIME type begins with `audio/`.
+ *
+ * @param uploads - uploads to inspect; the array is not modified
+ * @returns a new array with only the audio uploads, in their original order
+ */
+export const getAudioUploads = (uploads: IFileUpload[]) => {
+    const isAudio = ({ mime }: IFileUpload) => mime.startsWith('audio/')
+    return uploads.filter(isAudio)
+}
+
+/**
+ * Select the uploads whose MIME type begins with `image/`.
+ *
+ * @param uploads - uploads to inspect; the array is not modified
+ * @returns a new array with only the image uploads, in their original order
+ */
+export const getImageUploads = (uploads: IFileUpload[]) => {
+    const isImage = ({ mime }: IFileUpload) => mime.startsWith('image/')
+    return uploads.filter(isImage)
+}
@@ -0,0 +1,51 @@
+import { ICommonObject, IFileUpload } from './Interface'
+import { getCredentialData, getStoragePath } from './utils'
+import { type ClientOptions, OpenAIClient } from '@langchain/openai'
+import fs from 'fs'
+import path from 'path'
+import { AssemblyAI } from 'assemblyai'
+
+/**
+ * Transcribe a stored audio upload with the configured speech-to-text provider.
+ *
+ * The audio is read from `<storage path>/<chatflowid>/<chatId>/<upload.name>`. Supported
+ * provider names are `'openAIWhisper'` (model `whisper-1`, with optional `language`,
+ * `temperature` and `prompt` taken from the config) and `'assemblyAiTranscribe'`.
+ *
+ * The read stream is opened only inside the branch that consumes it, so an unrecognised
+ * provider name no longer leaves an unread stream (and its file descriptor) behind.
+ *
+ * @param upload - the audio upload; only `name` is read
+ * @param speechToTextConfig - provider configuration; `name` and `credentialId` are read
+ * @param options - request options; `chatflowid` and `chatId` are read, and the object is passed to `getCredentialData`
+ * @returns the transcribed text, or `undefined` when the provider name is not recognised
+ *          or the provider returned no text
+ * @throws Error when no speech-to-text configuration is supplied, or when the audio file
+ *         resolves to a location outside the storage directory
+ */
+export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
+    if (!speechToTextConfig) {
+        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
+    }
+
+    const credentialId = speechToTextConfig.credentialId as string
+    const credentialData = await getCredentialData(credentialId ?? '', options)
+
+    // The name and ids originate from the request, so make sure the resolved
+    // location cannot escape the storage directory through `..` segments.
+    const storageRoot = path.resolve(getStoragePath())
+    const filePath = path.resolve(path.join(storageRoot, options.chatflowid, options.chatId, upload.name))
+    const relative = path.relative(storageRoot, filePath)
+    if (relative === '' || relative === '..' || relative.startsWith('..' + path.sep) || path.isAbsolute(relative)) {
+        throw new Error(`Invalid upload path: ${upload.name}`)
+    }
+
+    if (speechToTextConfig.name === 'openAIWhisper') {
+        const openAIClientOptions: ClientOptions = {
+            apiKey: credentialData.openAIApiKey
+        }
+        const openAIClient = new OpenAIClient(openAIClientOptions)
+
+        const transcription = await openAIClient.audio.transcriptions.create({
+            file: fs.createReadStream(filePath),
+            model: 'whisper-1',
+            language: speechToTextConfig?.language,
+            temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+            prompt: speechToTextConfig?.prompt
+        })
+        if (transcription?.text) {
+            return transcription.text
+        }
+    } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
+        const client = new AssemblyAI({
+            apiKey: credentialData.assemblyAIApiKey
+        })
+
+        const params = {
+            audio: fs.createReadStream(filePath),
+            speaker_labels: false
+        }
+
+        const transcription = await client.transcripts.transcribe(params)
+        if (transcription?.text) {
+            return transcription.text
+        }
+    }
+
+    return undefined
+}
@@ -770,3 +770,10 @@ export const prepareSandboxVars = (variables: IVariable[]) => {
    }
    return vars
 }
+
+/**
+ * Resolve the directory used for stored files.
+ *
+ * Uses the `BLOB_STORAGE_PATH` environment variable (normalised) when it is set to a
+ * non-empty value; otherwise falls back to `.flowise/storage` under the user's home directory.
+ *
+ * @returns the storage directory path
+ */
+export const getStoragePath = (): string => {
+    const configuredPath = process.env.BLOB_STORAGE_PATH
+    if (configuredPath) {
+        return path.join(configuredPath)
+    }
+    return path.join(getUserHome(), '.flowise', 'storage')
+}