GPT Vision: Renaming to OpenAIMultiModalChain and merging the functionality of Whisper.

2026-06-29 03:01:10 +03:00 · 2024-01-18 13:03:27 +05:30
parent 398a31f426
commit 8a14a52d90
8 changed files with 118 additions and 132 deletions
@@ -0,0 +1,333 @@
+import {
+    ICommonObject,
+    INode,
+    INodeData,
+    INodeOutputsValue,
+    INodeParams
+} from "../../../src/Interface";
+import { getBaseClasses, getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
+import { OpenAIMultiModalChainInput, VLLMChain } from "./VLLMChain";
+import { ConsoleCallbackHandler, CustomChainHandler, additionalCallbacks } from '../../../src/handler'
+import { formatResponse } from '../../outputparsers/OutputParserHelpers'
+import { checkInputs, Moderation, streamResponse } from "../../moderation/Moderation";
+
+/**
+ * True when an optional node input was actually supplied.
+ * Unlike a plain truthiness check this keeps a legitimate numeric 0.
+ */
+const isProvided = (value: unknown): boolean => value !== undefined && value !== null && value !== ''
+
+/**
+ * Node definition for the "Open AI MultiModal Chain".
+ *
+ * Declares the node's inputs/outputs and builds a VLLMChain from them.
+ * Depending on the selected output it either returns the chain instance
+ * itself or immediately runs a prediction and returns the resulting text.
+ */
+class OpenAIMultiModalChain_Chains implements INode {
+    label: string
+    name: string
+    version: number
+    type: string
+    icon: string
+    badge: string
+    category: string
+    baseClasses: string[]
+    description: string
+    inputs: INodeParams[]
+    outputs: INodeOutputsValue[]
+    credential: INodeParams
+
+    constructor() {
+        this.label = 'Open AI MultiModal Chain'
+        this.name = 'openAIMultiModalChain'
+        this.version = 1.0
+        this.type = 'OpenAIMultiModalChain'
+        this.icon = 'chain.svg'
+        this.category = 'Chains'
+        this.badge = 'BETA'
+        this.description = 'Chain to query against Image and Audio Input.'
+        this.baseClasses = [this.type, ...getBaseClasses(VLLMChain)]
+        this.credential = {
+            label: 'Connect Credential',
+            name: 'credential',
+            type: 'credential',
+            credentialNames: ['openAIApi']
+        }
+        this.inputs = [
+            {
+                label: 'Prompt',
+                name: 'prompt',
+                type: 'BasePromptTemplate',
+                optional: true
+            },
+            {
+                label: 'Input Moderation',
+                description: 'Detect text that could generate harmful output and prevent it from being sent to the language model',
+                name: 'inputModeration',
+                type: 'Moderation',
+                optional: true,
+                list: true
+            },
+            {
+                label: 'Model Name',
+                name: 'modelName',
+                type: 'options',
+                options: [
+                    {
+                        label: 'gpt-4-vision-preview',
+                        name: 'gpt-4-vision-preview'
+                    }
+                ],
+                default: 'gpt-4-vision-preview'
+            },
+            {
+                label: 'Speech to Text',
+                name: 'speechToText',
+                type: 'boolean',
+                optional: true
+            },
+            // TODO: only show when speechToText is true
+            {
+                label: 'Speech to Text Method',
+                description: 'How to turn audio into text',
+                name: 'speechToTextMode',
+                type: 'options',
+                options: [
+                    {
+                        label: 'Transcriptions',
+                        name: 'transcriptions',
+                        description: 'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
+                    },
+                    {
+                        label: 'Translations',
+                        name: 'translations',
+                        description: 'Translate and transcribe the audio into english.'
+                    }
+                ],
+                optional: false,
+                default: 'transcriptions',
+                additionalParams: true
+            },
+            {
+                label: 'Image Resolution',
+                description: 'This parameter controls the resolution in which the model views the image.',
+                name: 'imageResolution',
+                type: 'options',
+                options: [
+                    {
+                        label: 'Low',
+                        name: 'low'
+                    },
+                    {
+                        label: 'High',
+                        name: 'high'
+                    },
+                    {
+                        label: 'Auto',
+                        name: 'auto'
+                    }
+                ],
+                default: 'low',
+                optional: false,
+                additionalParams: true
+            },
+            {
+                label: 'Temperature',
+                name: 'temperature',
+                type: 'number',
+                step: 0.1,
+                default: 0.9,
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Top Probability',
+                name: 'topP',
+                type: 'number',
+                step: 0.1,
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Max Tokens',
+                name: 'maxTokens',
+                type: 'number',
+                step: 1,
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Accepted Upload Types',
+                name: 'allowedUploadTypes',
+                type: 'string',
+                default: 'image/gif;image/jpeg;image/png;image/webp;audio/mpeg;audio/x-wav;audio/mp4',
+                hidden: true
+            },
+            {
+                label: 'Maximum Upload Size (MB)',
+                name: 'maxUploadSize',
+                type: 'number',
+                default: '5',
+                hidden: true
+            }
+        ]
+        this.outputs = [
+            {
+                label: 'Open AI MultiModal Chain',
+                name: 'openAIMultiModalChain',
+                baseClasses: [this.type, ...getBaseClasses(VLLMChain)]
+            },
+            {
+                label: 'Output Prediction',
+                name: 'outputPrediction',
+                baseClasses: ['string', 'json']
+            }
+        ]
+    }
+
+    /**
+     * Builds the chain from the node configuration.
+     *
+     * @param nodeData - node configuration (inputs, selected output, credential)
+     * @param input - the user's question, only used for the 'outputPrediction' output
+     * @param options - runtime options; `uploads` is forwarded to the chain
+     * @returns the VLLMChain when the selected output equals this node's name,
+     *          the escaped prediction text when it is 'outputPrediction',
+     *          and undefined for any other output value
+     */
+    async init(nodeData: INodeData, input: string, options: ICommonObject): Promise<any> {
+        const prompt = nodeData.inputs?.prompt
+        const output = nodeData.outputs?.output as string
+        const imageResolution = nodeData.inputs?.imageResolution
+        // The Prompt input is declared optional, so it must never be dereferenced unguarded.
+        const promptValues = prompt?.promptValues as ICommonObject | undefined
+        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
+        const openAIApiKey = getCredentialParam('openAIApiKey', credentialData, nodeData)
+        const temperature = nodeData.inputs?.temperature as string
+        const modelName = nodeData.inputs?.modelName as string
+        const maxTokens = nodeData.inputs?.maxTokens as string
+        const topP = nodeData.inputs?.topP as string
+        const speechToText = nodeData.inputs?.speechToText as boolean
+
+        const fields: OpenAIMultiModalChainInput = {
+            openAIApiKey: openAIApiKey,
+            imageResolution: imageResolution,
+            verbose: process.env.DEBUG === 'true',
+            uploads: options.uploads,
+            modelName: modelName
+        }
+        // A value of 0 is meaningful for temperature/topP, so test for presence rather than truthiness.
+        if (isProvided(temperature)) fields.temperature = parseFloat(String(temperature))
+        if (maxTokens) fields.maxTokens = parseInt(maxTokens, 10)
+        if (isProvided(topP)) fields.topP = parseFloat(String(topP))
+        if (speechToText) {
+            // 'transcriptions' is the default method when Speech to Text is turned on.
+            fields.speechToTextMode = nodeData.inputs?.speechToTextMode ?? 'transcriptions'
+        }
+
+        if (output === this.name) {
+            const chain = new VLLMChain({
+                ...fields,
+                prompt: prompt
+            })
+            return chain
+        } else if (output === 'outputPrediction') {
+            // NOTE(review): unlike the branch above, this chain is built without `prompt` — confirm that is intended.
+            const chain = new VLLMChain({
+                ...fields
+            })
+            const inputVariables: string[] = (prompt?.inputVariables as string[] | undefined) ?? [] // ["product"]
+            const res = await runPrediction(inputVariables, chain, input, promptValues, options, nodeData)
+            // eslint-disable-next-line no-console
+            console.log('\x1b[92m\x1b[1m\n*****OUTPUT PREDICTION*****\n\x1b[0m\x1b[0m')
+            // eslint-disable-next-line no-console
+            console.log(res)
+            /**
+             * Apply string transformation to convert special chars:
+             * FROM: hello i am ben\n\n\thow are you?
+             * TO: hello i am benFLOWISE_NEWLINEFLOWISE_NEWLINEFLOWISE_TABhow are you?
+             */
+            return handleEscapeCharacters(res, false)
+        }
+    }
+
+    /**
+     * Runs the previously built chain (`nodeData.instance`) against the user's input.
+     *
+     * @param nodeData - node configuration; `instance` holds the VLLMChain
+     * @param input - the user's question
+     * @param options - runtime options forwarded to the prediction helper
+     * @returns the formatted prediction result
+     */
+    async run(nodeData: INodeData, input: string, options: ICommonObject): Promise<string | object> {
+        const prompt = nodeData.inputs?.prompt
+        // Prompt is optional: fall back to "no variables / no fixed values" when it is absent.
+        const inputVariables: string[] = (prompt?.inputVariables as string[] | undefined) ?? [] // ["product"]
+        const chain = nodeData.instance as VLLMChain
+        const promptValues = prompt?.promptValues as ICommonObject | undefined
+        const res = await runPrediction(inputVariables, chain, input, promptValues, options, nodeData)
+        // eslint-disable-next-line no-console
+        console.log('\x1b[93m\x1b[1m\n*****FINAL RESULT*****\n\x1b[0m\x1b[0m')
+        // eslint-disable-next-line no-console
+        console.log(res)
+        return res
+    }
+}
+
+/**
+ * Runs a single prediction on the chain.
+ *
+ * Steps:
+ *  1. Pass the input through the configured moderation checks; if one throws,
+ *     the error message is streamed (when streaming) and returned as the response.
+ *  2. Un-escape the fixed prompt values and copy any uploads onto the chain.
+ *  3. If prompt variables exist, call the chain with the fixed values, using the
+ *     user's input for the single variable that has no fixed value; otherwise
+ *     run the chain directly on the input.
+ *
+ * @param inputVariables - variable names declared by the prompt
+ * @param chain - the chain to execute
+ * @param input - the user's question
+ * @param promptValuesRaw - fixed prompt values in their escaped form, if any
+ * @param options - runtime options (logger, socketIO, socketIOClientId, uploads)
+ * @param nodeData - node configuration; `inputs.inputModeration` is read here
+ * @returns the formatted chain response, or the formatted moderation error message
+ * @throws Error when more than one prompt variable has no value
+ */
+const runPrediction = async (
+    inputVariables: string[],
+    chain: VLLMChain,
+    input: string,
+    promptValuesRaw: ICommonObject | undefined,
+    options: ICommonObject,
+    nodeData: INodeData
+) => {
+    const loggerHandler = new ConsoleCallbackHandler(options.logger)
+    const callbacks = await additionalCallbacks(nodeData, options)
+
+    const isStreaming = options.socketIO && options.socketIOClientId
+    const socketIO = isStreaming ? options.socketIO : undefined
+    const socketIOClientId = isStreaming ? options.socketIOClientId : ''
+
+    // Built lazily so the streaming handler is only created right before the chain is invoked.
+    const buildHandlers = () =>
+        isStreaming
+            ? [loggerHandler, new CustomChainHandler(socketIO, socketIOClientId), ...callbacks]
+            : [loggerHandler, ...callbacks]
+
+    const moderations = nodeData.inputs?.inputModeration as Moderation[]
+    if (moderations && moderations.length > 0) {
+        try {
+            // Use the output of the moderation chain as input for the LLM chain
+            input = await checkInputs(moderations, input)
+        } catch (e) {
+            // Anything can be thrown, so narrow before reading `.message`.
+            const message = e instanceof Error ? e.message : String(e)
+            await new Promise((resolve) => setTimeout(resolve, 500))
+            streamResponse(isStreaming, message, socketIO, socketIOClientId)
+            return formatResponse(message)
+        }
+    }
+
+    /**
+     * Apply string transformation to reverse converted special chars:
+     * FROM: { "value": "hello i am benFLOWISE_NEWLINEFLOWISE_NEWLINEFLOWISE_TABhow are you?" }
+     * TO: { "value": "hello i am ben\n\n\thow are you?" }
+     */
+    const promptValues = handleEscapeCharacters(promptValuesRaw, true)
+    if (options?.uploads) {
+        chain.uploads = options.uploads
+    }
+
+    if (promptValues && inputVariables.length > 0) {
+        // Variables without a fixed value, in declaration order.
+        const missing: string[] = []
+        for (const variable of inputVariables) {
+            if (promptValues[variable]) {
+                chain.inputKey = variable
+            } else {
+                missing.push(variable)
+            }
+        }
+
+        if (missing.length > 1) {
+            throw new Error(`Please provide Prompt Values for: ${missing.join(', ')}`)
+        }
+
+        // All inputVariables have fixed values specified...
+        let chainValues: ICommonObject = { ...promptValues }
+        if (missing.length === 1) {
+            // ...unless exactly one is not specified: use input (user's question) as its value
+            const lastValue = missing[0]
+            if (!lastValue) throw new Error('Please provide Prompt Values')
+            chain.inputKey = lastValue
+            chainValues = {
+                ...promptValues,
+                [lastValue]: input
+            }
+        }
+
+        const res = await chain.call(chainValues, buildHandlers())
+        return formatResponse(res?.text)
+    }
+
+    const res = await chain.run(input, buildHandlers())
+    return formatResponse(res)
+}
+
+// CommonJS export exposing the node implementation under the `nodeClass` key.
+module.exports = { nodeClass: OpenAIMultiModalChain_Chains }
@@ -0,0 +1,204 @@
+import { OpenAI as OpenAIClient, ClientOptions, OpenAI } from 'openai'
+import { BaseChain, ChainInputs } from 'langchain/chains'
+import { ChainValues } from 'langchain/schema'
+import { BasePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate } from 'langchain/prompts'
+import path from 'path'
+import { getUserHome } from '../../../src/utils'
+import fs from 'fs'
+import { ChatCompletionContentPart, ChatCompletionMessageParam } from 'openai/src/resources/chat/completions'
+import ChatCompletionCreateParamsNonStreaming = OpenAI.ChatCompletionCreateParamsNonStreaming
+import { IFileUpload } from '../../../src'
+
+/**
+ * Interface for the input parameters of the VLLMChain class
+ * (the chain behind the OpenAI multi-modal node).
+ */
+export interface OpenAIMultiModalChainInput extends ChainInputs {
+    /** OpenAI API key; the VLLMChain constructor throws when it is missing. */
+    openAIApiKey?: string
+    /** Optional OpenAI organization passed to the client configuration. */
+    openAIOrganization?: string
+    /** Stored on the chain; defaults to false there. */
+    throwError?: boolean
+    /** Optional prompt template; chat prompt system/human messages are added to the request. */
+    prompt?: BasePromptTemplate
+    /** Extra OpenAI client options, merged before apiKey/organization are applied. */
+    configuration?: ClientOptions
+    /** Files uploaded with the request (images and audio, told apart by MIME type). */
+    uploads?: IFileUpload[]
+    /** Level of detail used for image inputs; the chain defaults to 'low'. */
+    imageResolution?: 'auto' | 'low' | 'high'
+    /** Sampling temperature forwarded to the chat completion request. */
+    temperature?: number
+    /** Model name selected on the node. */
+    modelName?: string
+    /** Maximum number of tokens to generate; the chain falls back to 1024 when unset. */
+    maxTokens?: number
+    /** Nucleus sampling value forwarded as `top_p`. */
+    topP?: number
+    /** Speech-to-text method; the node offers 'transcriptions' and 'translations'. */
+    speechToTextMode?: string
+}
+
+/**
+ * Chain that answers a question about uploaded images and, optionally, audio
+ * using the OpenAI API. Audio uploads are first turned into text with the
+ * `whisper-1` model, image uploads are attached as `image_url` parts, and the
+ * combined message is sent to the chat completions endpoint. It extends
+ * BaseChain and implements the OpenAIMultiModalChainInput interface.
+ */
+export class VLLMChain extends BaseChain implements OpenAIMultiModalChainInput {
+    static lc_name() {
+        return 'VLLMChain'
+    }
+    prompt: BasePromptTemplate | undefined
+
+    inputKey = 'input'
+    outputKey = 'text'
+    uploads?: IFileUpload[]
+    imageResolution: 'auto' | 'low' | 'high'
+    openAIApiKey?: string
+    openAIOrganization?: string
+    clientConfig: ClientOptions
+    client: OpenAIClient
+    throwError: boolean
+    temperature?: number
+    modelName?: string
+    maxTokens?: number
+    topP?: number
+
+    // Either the mode name itself ('transcriptions' | 'translations') or an object
+    // carrying it in `purpose`; defaults to an empty object when not supplied.
+    speechToTextMode?: any
+
+    /**
+     * @param fields - chain configuration
+     * @throws Error when no OpenAI API key is supplied
+     */
+    constructor(fields: OpenAIMultiModalChainInput) {
+        super(fields)
+        this.throwError = fields?.throwError ?? false
+        this.imageResolution = fields?.imageResolution ?? 'low'
+        this.openAIApiKey = fields?.openAIApiKey
+        this.prompt = fields?.prompt
+        this.temperature = fields?.temperature
+        this.modelName = fields?.modelName
+        this.maxTokens = fields?.maxTokens
+        this.topP = fields?.topP
+        this.uploads = fields?.uploads ?? []
+        this.speechToTextMode = fields?.speechToTextMode ?? {}
+        if (!this.openAIApiKey) {
+            throw new Error('OpenAI API key not found')
+        }
+
+        this.openAIOrganization = fields?.openAIOrganization
+
+        this.clientConfig = {
+            ...fields?.configuration,
+            apiKey: this.openAIApiKey,
+            organization: this.openAIOrganization
+        }
+
+        this.client = new OpenAIClient(this.clientConfig)
+    }
+
+    /**
+     * Normalizes `speechToTextMode` to one of the two supported methods.
+     * Accepts the plain string supplied by the node as well as an object with
+     * a `purpose` field; returns undefined when speech-to-text is not configured.
+     */
+    private resolveSpeechToTextMode(): 'transcriptions' | 'translations' | undefined {
+        const raw = typeof this.speechToTextMode === 'string' ? this.speechToTextMode : this.speechToTextMode?.purpose
+        return raw === 'transcriptions' || raw === 'translations' ? raw : undefined
+    }
+
+    /**
+     * Builds and sends the chat completion request.
+     *
+     * @param values - chain values; the entry under `inputKey` is the user's text
+     * @returns an object mapping `outputKey` to the content of the first choice
+     * @throws Error when the API call fails or returns no choices
+     */
+    async _call(values: ChainValues): Promise<ChainValues> {
+        const userInput = values[this.inputKey]
+
+        const vRequest: ChatCompletionCreateParamsNonStreaming = {
+            // Honour the configured model; fall back to the vision preview model.
+            model: this.modelName || 'gpt-4-vision-preview',
+            temperature: this.temperature,
+            top_p: this.topP,
+            // An unset (or zero) maxTokens falls back to 1024.
+            max_tokens: this.maxTokens || 1024,
+            messages: []
+        }
+
+        const chatMessages: ChatCompletionContentPart[] = []
+        const userRole: ChatCompletionMessageParam = { role: 'user', content: [] }
+        chatMessages.push({
+            type: 'text',
+            text: userInput
+        })
+
+        const uploads = this.uploads ?? []
+        const speechMode = this.resolveSpeechToTextMode()
+        if (speechMode && uploads.length > 0) {
+            for (const upload of this.getAudioUploads(uploads)) {
+                const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+
+                // the audio is stored on the server; stream the file to the speech-to-text endpoint.
+                // The stream is only opened once a valid mode is known, so it is always consumed.
+                const audioFile = fs.createReadStream(filePath)
+                const result =
+                    speechMode === 'transcriptions'
+                        ? await this.client.audio.transcriptions.create({
+                              file: audioFile,
+                              model: 'whisper-1'
+                          })
+                        : await this.client.audio.translations.create({
+                              file: audioFile,
+                              model: 'whisper-1'
+                          })
+                chatMessages.push({
+                    type: 'text',
+                    text: result.text
+                })
+            }
+        }
+
+        if (uploads.length > 0) {
+            for (const upload of this.getImageUploads(uploads)) {
+                let imageUrl = upload.data
+                if (upload.type === 'stored-file') {
+                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+
+                    // as the image is stored in the server, read the file and convert it to base64
+                    const contents = fs.readFileSync(filePath)
+                    imageUrl = 'data:' + upload.mime + ';base64,' + contents.toString('base64')
+                }
+                chatMessages.push({
+                    type: 'image_url',
+                    image_url: {
+                        url: imageUrl,
+                        detail: this.imageResolution
+                    }
+                })
+            }
+        }
+        userRole.content = chatMessages
+        vRequest.messages.push(userRole)
+
+        // NOTE(review): prompt templates are forwarded verbatim (no variable substitution)
+        // and are appended after the user message — confirm this ordering is intended.
+        if (this.prompt && this.prompt instanceof ChatPromptTemplate) {
+            const chatPrompt = this.prompt as ChatPromptTemplate
+            chatPrompt.promptMessages.forEach((message: any) => {
+                if (message instanceof SystemMessagePromptTemplate) {
+                    vRequest.messages.push({
+                        role: 'system',
+                        content: (message.prompt as any).template
+                    })
+                } else if (message instanceof HumanMessagePromptTemplate) {
+                    vRequest.messages.push({
+                        role: 'user',
+                        content: (message.prompt as any).template
+                    })
+                }
+            })
+        }
+
+        let response
+        try {
+            response = await this.client.chat.completions.create(vRequest)
+        } catch (error) {
+            // Always surface a real Error to the caller.
+            if (error instanceof Error) {
+                throw error
+            } else {
+                throw new Error(String(error))
+            }
+        }
+        const output = response.choices[0]
+        if (!output) {
+            throw new Error('OpenAI returned no choices for the request')
+        }
+        return {
+            [this.outputKey]: output.message.content
+        }
+    }
+
+    /** Returns the uploads whose MIME type starts with `audio/`. */
+    getAudioUploads = (urls: any[]) => {
+        return urls.filter((url: any) => url.mime?.startsWith('audio/'))
+    }
+
+    /** Returns the uploads whose MIME type starts with `image/`. */
+    getImageUploads = (urls: any[]) => {
+        return urls.filter((url: any) => url.mime?.startsWith('image/'))
+    }
+
+    _chainType() {
+        return 'vision_chain'
+    }
+
+    /** Prompt variables when a prompt is set, otherwise just `inputKey`. */
+    get inputKeys() {
+        return this.prompt?.inputVariables ?? [this.inputKey]
+    }
+
+    get outputKeys(): string[] {
+        return [this.outputKey]
+    }
+}
@@ -0,0 +1,6 @@
+<svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-dna" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
+   <path stroke="none" d="M0 0h24v24H0z" fill="none"></path>
+   <path d="M14.828 14.828a4 4 0 1 0 -5.656 -5.656a4 4 0 0 0 5.656 5.656z"></path>
+   <path d="M9.172 20.485a4 4 0 1 0 -5.657 -5.657"></path>
+   <path d="M14.828 3.515a4 4 0 0 0 5.657 5.657"></path>
+</svg>