Making the chain multi-modal. It now accepts audio and image uploads and can run inference on them.

2026-06-28 21:00:58 +03:00 · 2023-12-09 22:07:16 +05:30
parent 32575828cd
commit 1b308a8b54
4 changed files with 96 additions and 38 deletions
@@ -7,6 +7,7 @@ class OpenAIAudioWhisper implements INode {
    description: string
    type: string
    icon: string
+    badge: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
@@ -18,6 +19,7 @@ class OpenAIAudioWhisper implements INode {
        this.type = 'OpenAIWhisper'
        this.description = 'Speech to text using OpenAI Whisper API'
        this.icon = 'audio.svg'
+        this.badge = 'BETA'
        this.category = 'MultiModal'
        this.baseClasses = [this.type]
        this.inputs = [
@@ -27,14 +29,15 @@ class OpenAIAudioWhisper implements INode {
                type: 'options',
                options: [
                    {
-                        label: 'transcription',
+                        label: 'Transcription',
                        name: 'transcription'
                    },
                    {
-                        label: 'translation',
+                        label: 'Translation',
                        name: 'translation'
                    }
-                ]
+                ],
+                default: 'transcription'
            },
            {
                label: 'Accepted Upload Types',
@@ -54,7 +57,9 @@ class OpenAIAudioWhisper implements INode {
    }

    async init(nodeData: INodeData): Promise<any> {
-        return {}
+        const purpose = nodeData.inputs?.purpose as string
+
+        return { purpose }
    }
 }

@@ -132,7 +132,7 @@ class OpenAIVisionChain_Chains implements INode {
        this.outputs = [
            {
                label: 'Open AI MultiModal Chain',
-                name: 'OpenAIMultiModalChain',
+                name: 'openAIMultiModalChain',
                baseClasses: [this.type, ...getBaseClasses(VLLMChain)]
            },
            {
@@ -154,6 +154,8 @@ class OpenAIVisionChain_Chains implements INode {
        const modelName = nodeData.inputs?.modelName as string
        const maxTokens = nodeData.inputs?.maxTokens as string
        const topP = nodeData.inputs?.topP as string
+        const whisperConfig = nodeData.inputs?.audioInput
+
        const fields: OpenAIVisionChainInput = {
            openAIApiKey: openAIApiKey,
            imageResolution: imageResolution,
@@ -164,6 +166,8 @@ class OpenAIVisionChain_Chains implements INode {
        if (temperature) fields.temperature = parseFloat(temperature)
        if (maxTokens) fields.maxTokens = parseInt(maxTokens, 10)
        if (topP) fields.topP = parseFloat(topP)
+        if (whisperConfig) fields.whisperConfig = whisperConfig
+
        if (output === this.name) {
            const chain = new VLLMChain({
                ...fields,
@@ -21,6 +21,7 @@ export interface OpenAIVisionChainInput extends ChainInputs {
    modelName?: string
    maxTokens?: number
    topP?: number
+    whisperConfig?: any
 }

 /**
@@ -48,6 +49,8 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
    maxTokens?: number
    topP?: number

+    whisperConfig?: any
+
    constructor(fields: OpenAIVisionChainInput) {
        super(fields)
        this.throwError = fields?.throwError ?? false
@@ -59,6 +62,7 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
        this.maxTokens = fields?.maxTokens
        this.topP = fields?.topP
        this.imageUrls = fields?.imageUrls ?? []
+        this.whisperConfig = fields?.whisperConfig ?? {}
        if (!this.openAIApiKey) {
            throw new Error('OpenAI API key not found')
        }
@@ -92,15 +96,44 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
            type: 'text',
            text: userInput
        })
+        if (this.whisperConfig && this.imageUrls && this.imageUrls.length > 0) {
+            const audioUploads = this.getAudioUploads(this.imageUrls)
+            for (const url of audioUploads) {
+                const filePath = path.join(getUserHome(), '.flowise', 'gptvision', url.data, url.name)
+
+                // the audio file is stored on the server; open it as a read stream to send to the Whisper API
+                const audio_file = fs.createReadStream(filePath)
+                if (this.whisperConfig.purpose === 'transcription') {
+                    const transcription = await this.client.audio.transcriptions.create({
+                        file: audio_file,
+                        model: 'whisper-1'
+                    })
+                    userRole.content.push({
+                        type: 'text',
+                        text: transcription.text
+                    })
+                } else if (this.whisperConfig.purpose === 'translation') {
+                    const translation = await this.client.audio.translations.create({
+                        file: audio_file,
+                        model: 'whisper-1'
+                    })
+                    userRole.content.push({
+                        type: 'text',
+                        text: translation.text
+                    })
+                }
+            }
+        }
        if (this.imageUrls && this.imageUrls.length > 0) {
-            this.imageUrls.forEach((imageUrl: any) => {
-                let bf = imageUrl?.data
-                if (imageUrl.type == 'stored-file') {
-                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', imageUrl.data, imageUrl.name)
+            const imageUploads = this.getImageUploads(this.imageUrls)
+            for (const url of imageUploads) {
+                let bf = url.data
+                if (url.type == 'stored-file') {
+                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', url.data, url.name)

                    // as the image is stored in the server, read the file and convert it to base64
                    const contents = fs.readFileSync(filePath)
-                    bf = 'data:' + imageUrl.mime + ';base64,' + contents.toString('base64')
+                    bf = 'data:' + url.mime + ';base64,' + contents.toString('base64')
                }
                userRole.content.push({
                    type: 'image_url',
@@ -109,7 +142,7 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
                        detail: this.imageResolution
                    }
                })
-            })
+            }
        }
        vRequest.messages.push(userRole)
        if (this.prompt && this.prompt instanceof ChatPromptTemplate) {
@@ -146,6 +179,14 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
        }
    }

+    getAudioUploads = (urls: any[]) => {
+        return urls.filter((url: any) => url.mime.startsWith('audio/'))
+    }
+
+    getImageUploads = (urls: any[]) => {
+        return urls.filter((url: any) => url.mime.startsWith('image/'))
+    }
+
    _chainType() {
        return 'vision_chain'
    }