GPT Vision: Converting vision into Multi Modal. Base Changes.

2026-06-28 23:01:09 +03:00 · 2023-12-08 17:21:53 +05:30
parent 68fbe0ea12
commit 32575828cd
9 changed files with 129 additions and 45 deletions
@@ -0,0 +1,61 @@
+import { INode, INodeData, INodeParams } from '../../../src'
+
+class OpenAIAudioWhisper implements INode { // Declarative node: the constructor only assigns metadata and input definitions; init() builds nothing yet.
+    label: string
+    name: string
+    version: number
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Open AI Whisper'
+        this.name = 'openAIAudioWhisper'
+        this.version = 1.0
+        this.type = 'OpenAIWhisper' // Same string is listed in App.uploadAllowedNodes and is the type of the multi-modal chain's 'Audio Input' param.
+        this.description = 'Speech to text using OpenAI Whisper API'
+        this.icon = 'audio.svg'
+        this.category = 'MultiModal'
+        this.baseClasses = [this.type] // Depends on this.type having been assigned above.
+        this.inputs = [
+            {
+                label: 'Purpose',
+                name: 'purpose',
+                type: 'options', // NOTE(review): no `default` and not marked `optional` - confirm a selection is always made.
+                options: [
+                    {
+                        label: 'transcription',
+                        name: 'transcription'
+                    },
+                    {
+                        label: 'translation',
+                        name: 'translation'
+                    }
+                ]
+            },
+            {
+                label: 'Accepted Upload Types',
+                name: 'allowedUploadTypes', // App.shouldAllowUploads finds this param by name.
+                type: 'string',
+                default: 'audio/mpeg;audio/x-wav;audio/mp4', // ';'-separated list; the server splits it on ';'.
+                hidden: true
+            },
+            {
+                label: 'Maximum Upload Size (MB)',
+                name: 'maxUploadSize', // App.shouldAllowUploads finds this param by name.
+                type: 'number',
+                default: '5', // Stored as a string; the server converts it with parseInt().
+                hidden: true
+            }
+        ]
+    }
+
+    async init(nodeData: INodeData): Promise<any> { // nodeData is not read; always resolves to an empty object.
+        return {}
+    }
+}
+
+module.exports = { nodeClass: OpenAIAudioWhisper }
@@ -19,14 +19,14 @@ class OpenAIVisionChain_Chains implements INode {
    credential: INodeParams

    constructor() {
-        this.label = 'Open AI Vision Chain'
-        this.name = 'openAIVisionChain'
+        this.label = 'Open AI MultiModal Chain'
+        this.name = 'openAIMultiModalChain'
        this.version = 1.0
-        this.type = 'OpenAIVisionChain'
+        this.type = 'OpenAIMultiModalChain'
        this.icon = 'chain.svg'
        this.category = 'Chains'
        this.badge = 'BETA'
-        this.description = 'Chain to run queries against OpenAI (GPT-4) Vision .'
+        this.description = 'Chain to query against Image and Audio Input.'
        this.baseClasses = [this.type, ...getBaseClasses(VLLMChain)]
        this.credential = {
            label: 'Connect Credential',
@@ -36,16 +36,9 @@ class OpenAIVisionChain_Chains implements INode {
        }
        this.inputs = [
            {
-                label: 'Model Name',
-                name: 'modelName',
-                type: 'options',
-                options: [
-                    {
-                        label: 'gpt-4-vision-preview',
-                        name: 'gpt-4-vision-preview'
-                    }
-                ],
-                default: 'gpt-4-vision-preview',
+                label: 'Audio Input',
+                name: 'audioInput',
+                type: 'OpenAIWhisper',
                optional: true
            },
            {
@@ -54,6 +47,22 @@ class OpenAIVisionChain_Chains implements INode {
                type: 'BasePromptTemplate',
                optional: true
            },
+            {
+                label: 'Model Name',
+                name: 'modelName',
+                type: 'options',
+                options: [
+                    {
+                        label: 'gpt-4-vision-preview',
+                        name: 'gpt-4-vision-preview'
+                    },
+                    {
+                        label: 'whisper-1',
+                        name: 'whisper-1'
+                    }
+                ],
+                default: 'gpt-4-vision-preview'
+            },
            {
                label: 'Image Resolution',
                description: 'This parameter controls the resolution in which the model views the image.',
@@ -122,8 +131,8 @@ class OpenAIVisionChain_Chains implements INode {
        ]
        this.outputs = [
            {
-                label: 'Open AI Vision Chain',
-                name: 'openAIVisionChain',
+                label: 'Open AI MultiModal Chain',
+                name: 'OpenAIMultiModalChain',
                baseClasses: [this.type, ...getBaseClasses(VLLMChain)]
            },
            {
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" id="mdi-text-to-speech" width="24" height="24" viewBox="0 0 24 24"><path d="M8,7A2,2 0 0,1 10,9V14A2,2 0 0,1 8,16A2,2 0 0,1 6,14V9A2,2 0 0,1 8,7M14,14C14,16.97 11.84,19.44 9,19.92V22H7V19.92C4.16,19.44 2,16.97 2,14H4A4,4 0 0,0 8,18A4,4 0 0,0 12,14H14M21.41,9.41L17.17,13.66L18.18,10H14A2,2 0 0,1 12,8V4A2,2 0 0,1 14,2H20A2,2 0 0,1 22,4V8C22,8.55 21.78,9.05 21.41,9.41Z" /></svg>
@@ -1212,30 +1212,32 @@ export class App {
        })
    }

-    private uploadAllowedNodes = ['OpenAIVisionChain']
+    private uploadAllowedNodes = ['OpenAIMultiModalChain', 'OpenAIWhisper'] // node.data.type values for which shouldAllowUploads() enables uploads
    private shouldAllowUploads(result: ChatFlow): any {
        const flowObj = JSON.parse(result.flowData)
        let allowUploads = false
-        let allowedTypes: string[] = []
-        let maxUploadSize: number = -1
+        const allowances: any = [] // One { allowedTypes, maxUploadSize } entry per eligible node; replaces the single flat pair kept before this change.
        flowObj.nodes.forEach((node: IReactFlowNode) => {
            if (this.uploadAllowedNodes.indexOf(node.data.type) > -1) {
                logger.debug(`[server]: Found Eligible Node ${node.data.type}, Allowing Uploads.`)
                allowUploads = true
+                const allowance: any = {} // Built per node so each eligible node contributes its own limits.
                node.data.inputParams.map((param: any) => {
                    if (param.name === 'allowedUploadTypes') {
-                        allowedTypes = param.default.split(';')
+                        allowance.allowedTypes = param.default.split(';') // The param default is a ';'-separated list of types.
                    }
                    if (param.name === 'maxUploadSize') {
-                        maxUploadSize = parseInt(param.default ? param.default : '0')
+                        allowance.maxUploadSize = parseInt(param.default ? param.default : '0') // A missing or empty default is parsed as 0.
                    }
                })
+                if (allowance.allowedTypes && allowance.maxUploadSize) { // Truthiness check: a maxUploadSize of 0 or NaN is skipped as well as a missing one.
+                    allowances.push(allowance)
+                }
            }
        })
        return {
            allowUploads,
-            allowedTypes,
-            maxUploadSize
+            allowed: allowances // May be empty while allowUploads is true (eligible node found but nothing was pushed).
        }
    }

@@ -8,6 +8,7 @@ import rehypeRaw from 'rehype-raw'
 import remarkGfm from 'remark-gfm'
 import remarkMath from 'remark-math'
 import axios from 'axios'
+import audioUploadSVG from 'assets/images/wave-sound.jpg'

 import {
    Box,
@@ -85,23 +86,21 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
        e.preventDefault()
    }
    const isFileAllowedForUpload = (file) => {
-        // check if file type is allowed
-        if (getAllowChatFlowUploads.data?.allowedTypes?.length > 0) {
-            const allowedFileTypes = getAllowChatFlowUploads.data?.allowedTypes
-            if (!allowedFileTypes.includes(file.type)) {
-                alert(`File ${file.name} is not allowed.\nAllowed file types are ${allowedFileTypes.join(', ')}.`)
-                return false
-            }
-        }
-        // check if file size is allowed
-        if (getAllowChatFlowUploads.data?.maxUploadSize > 0) {
+        // Returns true when `file` satisfies at least one entry of the `allowed` list for this chatflow,
+        // i.e. its MIME type is in that entry's allowedTypes AND its size is within that entry's maxUploadSize (MB).
+        // Shows an alert and returns false otherwise.
+        // Optional chaining keeps this from throwing when `data` is not set, as the code it replaces did.
+        const constraints = getAllowChatFlowUploads.data
+        let acceptFile = false
+        if (constraints?.allowUploads) {
+            const fileType = file.type
            const sizeInMB = file.size / 1024 / 1024
-            if (sizeInMB > getAllowChatFlowUploads.data?.maxUploadSize) {
-                alert(`File ${file.name} is too large.\nMaximum allowed size is ${getAllowChatFlowUploads.data?.maxUploadSize} MB.`)
-                return false
-            }
+            // some() stops at the first matching entry instead of using map() purely for its side effect.
+            acceptFile = Boolean(
+                constraints.allowed?.some((allowed) => allowed.allowedTypes?.includes(fileType) && sizeInMB <= allowed.maxUploadSize)
+            )
        }
-        return true
+        if (!acceptFile) {
+            alert(`Cannot upload file. Kindly check the allowed file types and maximum allowed size.`)
+        }
+        return acceptFile
    }
    const handleDrop = async (e) => {
        if (!isChatFlowAvailableForUploads) {
@@ -124,9 +123,15 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
                                return
                            }
                            const { result } = evt.target
+                            let previewUrl
+                            if (file.type.startsWith('audio/')) {
+                                previewUrl = audioUploadSVG
+                            } else if (file.type.startsWith('image/')) {
+                                previewUrl = URL.createObjectURL(file)
+                            }
                            resolve({
                                data: result,
-                                preview: URL.createObjectURL(file),
+                                preview: previewUrl,
                                type: 'file',
                                name: name,
                                mime: file.type
@@ -240,7 +245,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
    }

    const previewStyle = {
-        width: '64px',
+        width: '128px',
        height: '64px',
        objectFit: 'cover' // This makes the image cover the area, cropping it if necessary
    }
@@ -514,11 +519,17 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
            onDrop={handleDrop}
            className={`file-drop-field`}
        >
-            {isDragOver && (
+            {isDragOver && getAllowChatFlowUploads.data?.allowUploads && (
                <Box className='drop-overlay'>
                    <Typography variant='h2'>Drop here to upload</Typography>
-                    <Typography variant='subtitle1'>{getAllowChatFlowUploads.data?.allowedTypes?.join(', ')}</Typography>
-                    <Typography variant='subtitle1'>Max Allowed Size: {getAllowChatFlowUploads.data?.maxUploadSize} MB</Typography>
+                    {getAllowChatFlowUploads.data.allowed.map((allowed) => {
+                        return (
+                            <>
+                                <Typography variant='subtitle1'>{allowed.allowedTypes?.join(', ')}</Typography>
+                                <Typography variant='subtitle1'>Max Allowed Size: {allowed.maxUploadSize} MB</Typography>
+                            </>
+                        )
+                    })}
                </Box>
            )}
            <div className={`${isDialog ? 'cloud-dialog' : 'cloud'}`}>
@@ -727,7 +738,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
                    <Grid container spacing={2} sx={{ p: 1, mt: '5px', ml: '1px' }}>
                        {previews.map((item, index) => (
                            <Grid item xs={12} sm={6} md={3} key={index}>
-                                <Card variant='outlined' sx={{ maxWidth: 64 }}>
+                                <Card variant='outlined' sx={{ maxWidth: 128 }}>
                                    <CardMedia
                                        component='img'
                                        image={item.preview}
@@ -735,7 +746,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
                                        alt={`preview ${index}`}
                                        style={previewStyle}
                                    />
-                                    <CardActions className='center' sx={{ padding: 0, margin: 0 }}>
+                                    <CardActions className='center' sx={{ p: 0, m: 0 }}>
                                        <Button
                                            startIcon={<DeleteIcon />}
                                            onClick={() => handleDeletePreview(item)}
				`@@ -0,0 +1 @@`
				<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" id="mdi-text-to-speech" width="24" height="24" viewBox="0 0 24 24"><path d="M8,7A2,2 0 0,1 10,9V14A2,2 0 0,1 8,16A2,2 0 0,1 6,14V9A2,2 0 0,1 8,7M14,14C14,16.97 11.84,19.44 9,19.92V22H7V19.92C4.16,19.44 2,16.97 2,14H4A4,4 0 0,0 8,18A4,4 0 0,0 12,14H14M21.41,9.41L17.17,13.66L18.18,10H14A2,2 0 0,1 12,8V4A2,2 0 0,1 14,2H20A2,2 0 0,1 22,4V8C22,8.55 21.78,9.05 21.41,9.41Z" /></svg>