GPT Vision: Renaming to OpenAIMultiModalChain and merging the functionality of Whisper.

This commit is contained in:
vinodkiran
2024-01-18 13:03:27 +05:30
parent 398a31f426
commit 8a14a52d90
8 changed files with 118 additions and 132 deletions
@@ -0,0 +1,333 @@
import {
ICommonObject,
INode,
INodeData,
INodeOutputsValue,
INodeParams
} from "../../../src/Interface";
import { getBaseClasses, getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
import { OpenAIMultiModalChainInput, VLLMChain } from "./VLLMChain";
import { ConsoleCallbackHandler, CustomChainHandler, additionalCallbacks } from '../../../src/handler'
import { formatResponse } from '../../outputparsers/OutputParserHelpers'
import { checkInputs, Moderation, streamResponse } from "../../moderation/Moderation";
/**
 * Flowise node definition for the "Open AI MultiModal Chain".
 *
 * The node builds a {@link VLLMChain} from the connected credential and the
 * configured inputs. Depending on the selected output it either returns the
 * chain itself or runs a single prediction and returns its text.
 */
class OpenAIMultiModalChain_Chains implements INode {
    label: string
    name: string
    version: number
    type: string
    icon: string
    badge: string
    category: string
    baseClasses: string[]
    description: string
    inputs: INodeParams[]
    outputs: INodeOutputsValue[]
    credential: INodeParams

    constructor() {
        this.label = 'Open AI MultiModal Chain'
        this.name = 'openAIMultiModalChain'
        this.version = 1.0
        this.type = 'OpenAIMultiModalChain'
        this.icon = 'chain.svg'
        this.category = 'Chains'
        this.badge = 'BETA'
        this.description = 'Chain to query against Image and Audio Input.'
        this.baseClasses = [this.type, ...getBaseClasses(VLLMChain)]
        this.credential = {
            label: 'Connect Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['openAIApi']
        }
        this.inputs = [
            {
                label: 'Prompt',
                name: 'prompt',
                type: 'BasePromptTemplate',
                optional: true
            },
            {
                label: 'Input Moderation',
                description: 'Detect text that could generate harmful output and prevent it from being sent to the language model',
                name: 'inputModeration',
                type: 'Moderation',
                optional: true,
                list: true
            },
            {
                label: 'Model Name',
                name: 'modelName',
                type: 'options',
                options: [
                    {
                        label: 'gpt-4-vision-preview',
                        name: 'gpt-4-vision-preview'
                    }
                ],
                default: 'gpt-4-vision-preview'
            },
            {
                label: 'Speech to Text',
                name: 'speechToText',
                type: 'boolean',
                optional: true
            },
            // TODO: only show when speechToText is true
            {
                label: 'Speech to Text Method',
                description: 'How to turn audio into text',
                name: 'speechToTextMode',
                type: 'options',
                options: [
                    {
                        label: 'Transcriptions',
                        name: 'transcriptions',
                        description:
                            'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
                    },
                    {
                        label: 'Translations',
                        name: 'translations',
                        description: 'Translate and transcribe the audio into english.'
                    }
                ],
                optional: false,
                default: 'transcriptions',
                additionalParams: true
            },
            {
                label: 'Image Resolution',
                description: 'This parameter controls the resolution in which the model views the image.',
                name: 'imageResolution',
                type: 'options',
                options: [
                    {
                        label: 'Low',
                        name: 'low'
                    },
                    {
                        label: 'High',
                        name: 'high'
                    },
                    {
                        label: 'Auto',
                        name: 'auto'
                    }
                ],
                default: 'low',
                optional: false,
                additionalParams: true
            },
            {
                label: 'Temperature',
                name: 'temperature',
                type: 'number',
                step: 0.1,
                default: 0.9,
                optional: true,
                additionalParams: true
            },
            {
                label: 'Top Probability',
                name: 'topP',
                type: 'number',
                step: 0.1,
                optional: true,
                additionalParams: true
            },
            {
                label: 'Max Tokens',
                name: 'maxTokens',
                type: 'number',
                step: 1,
                optional: true,
                additionalParams: true
            },
            {
                label: 'Accepted Upload Types',
                name: 'allowedUploadTypes',
                type: 'string',
                default: 'image/gif;image/jpeg;image/png;image/webp;audio/mpeg;audio/x-wav;audio/mp4',
                hidden: true
            },
            {
                label: 'Maximum Upload Size (MB)',
                name: 'maxUploadSize',
                type: 'number',
                default: '5',
                hidden: true
            }
        ]
        this.outputs = [
            {
                label: 'Open AI MultiModal Chain',
                name: 'openAIMultiModalChain',
                // Same list as the node's own base classes; copied so the two arrays stay independent.
                baseClasses: [...this.baseClasses]
            },
            {
                label: 'Output Prediction',
                name: 'outputPrediction',
                baseClasses: ['string', 'json']
            }
        ]
    }

    /**
     * Builds the chain for this node, or runs it once when the
     * "Output Prediction" output is selected.
     *
     * @param nodeData - node configuration (inputs, selected output, credential)
     * @param input - the user's question; only used for the prediction output
     * @param options - runtime options; `uploads` is forwarded to the chain
     * @returns the `VLLMChain` when the chain output is selected, the escaped
     *          prediction when `outputPrediction` is selected, otherwise `undefined`
     */
    async init(nodeData: INodeData, input: string, options: ICommonObject): Promise<any> {
        // The prompt input is optional, so every access below tolerates its absence.
        const prompt = nodeData.inputs?.prompt
        const output = nodeData.outputs?.output as string
        const imageResolution = nodeData.inputs?.imageResolution
        const promptValues = prompt?.promptValues as ICommonObject | undefined

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const openAIApiKey = getCredentialParam('openAIApiKey', credentialData, nodeData)

        const temperature = nodeData.inputs?.temperature as string
        const modelName = nodeData.inputs?.modelName as string
        const maxTokens = nodeData.inputs?.maxTokens as string
        const topP = nodeData.inputs?.topP as string
        const speechToText = nodeData.inputs?.speechToText as boolean

        const fields: OpenAIMultiModalChainInput = {
            openAIApiKey: openAIApiKey,
            imageResolution: imageResolution,
            verbose: process.env.DEBUG === 'true',
            uploads: options.uploads,
            modelName: modelName
        }
        if (temperature) fields.temperature = parseFloat(temperature)
        if (maxTokens) fields.maxTokens = parseInt(maxTokens, 10)
        if (topP) fields.topP = parseFloat(topP)
        if (speechToText) {
            // A blank selection falls back to the documented default method.
            fields.speechToTextMode = nodeData.inputs?.speechToTextMode || 'transcriptions'
        }

        if (output === this.name) {
            return new VLLMChain({
                ...fields,
                prompt: prompt
            })
        } else if (output === 'outputPrediction') {
            const chain = new VLLMChain({
                ...fields
            })
            const inputVariables: string[] = (prompt?.inputVariables as string[] | undefined) ?? [] // ["product"]
            const res = await runPrediction(inputVariables, chain, input, promptValues, options, nodeData)
            // eslint-disable-next-line no-console
            console.log('\x1b[92m\x1b[1m\n*****OUTPUT PREDICTION*****\n\x1b[0m\x1b[0m')
            // eslint-disable-next-line no-console
            console.log(res)
            /**
             * Apply string transformation to convert special chars:
             * FROM: hello i am ben\n\n\thow are you?
             * TO: hello i am benFLOWISE_NEWLINEFLOWISE_NEWLINEFLOWISE_TABhow are you?
             */
            return handleEscapeCharacters(res, false)
        }
    }

    /**
     * Runs the chain stored on `nodeData.instance` against the user's input.
     *
     * @param nodeData - node configuration; `instance` holds the chain built by `init`
     * @param input - the user's question
     * @param options - runtime options forwarded to `runPrediction`
     * @returns the formatted prediction
     */
    async run(nodeData: INodeData, input: string, options: ICommonObject): Promise<string | object> {
        // The prompt input is optional: fall back to "no variables / no values".
        const prompt = nodeData.inputs?.prompt
        const inputVariables: string[] = (prompt?.inputVariables as string[] | undefined) ?? [] // ["product"]
        const chain = nodeData.instance as VLLMChain
        const promptValues = prompt?.promptValues as ICommonObject | undefined
        const res = await runPrediction(inputVariables, chain, input, promptValues, options, nodeData)
        // eslint-disable-next-line no-console
        console.log('\x1b[93m\x1b[1m\n*****FINAL RESULT*****\n\x1b[0m\x1b[0m')
        // eslint-disable-next-line no-console
        console.log(res)
        return res
    }
}
/**
 * Runs one prediction on the chain and returns the formatted result.
 *
 * Steps:
 *  1. Run the optional input moderations; on failure the moderation message is
 *     streamed (when streaming) and returned instead of calling the model.
 *  2. Un-escape the prompt values and hand the current uploads to the chain.
 *  3. Decide how the user's input maps onto the prompt variables:
 *     - no prompt values or no variables: `chain.run(input)`
 *     - every variable has a value: `chain.call(promptValues)`
 *     - exactly one variable has no value: the user's input fills it
 *     - more than one variable has no value: an error is thrown
 *
 * @param inputVariables - prompt variable names; a missing list is treated as empty
 * @param chain - the chain to execute (its `inputKey` and `uploads` are mutated)
 * @param input - the user's question
 * @param promptValuesRaw - escaped prompt values configured on the prompt, if any
 * @param options - runtime options (`logger`, `socketIO`, `socketIOClientId`, `uploads`)
 * @param nodeData - node configuration; `inputs.inputModeration` is read here
 * @throws Error when two or more prompt variables have no value
 */
const runPrediction = async (
    inputVariables: string[],
    chain: VLLMChain,
    input: string,
    promptValuesRaw: ICommonObject | undefined,
    options: ICommonObject,
    nodeData: INodeData
) => {
    const loggerHandler = new ConsoleCallbackHandler(options.logger)
    const callbacks = await additionalCallbacks(nodeData, options)

    const isStreaming = options.socketIO && options.socketIOClientId
    const socketIO = isStreaming ? options.socketIO : undefined
    const socketIOClientId = isStreaming ? options.socketIOClientId : ''

    const moderations = nodeData.inputs?.inputModeration as Moderation[]
    if (moderations && moderations.length > 0) {
        try {
            // Use the output of the moderation chain as input for the LLM chain
            input = await checkInputs(moderations, input)
        } catch (e) {
            // The thrown value is not guaranteed to be an Error; never stream `undefined`.
            const rawMessage = (e as { message?: unknown } | null | undefined)?.message
            const message = typeof rawMessage === 'string' ? rawMessage : String(e)
            await new Promise((resolve) => setTimeout(resolve, 500))
            streamResponse(isStreaming, message, socketIO, socketIOClientId)
            return formatResponse(message)
        }
    }

    /**
     * Apply string transformation to reverse converted special chars:
     * FROM: { "value": "hello i am benFLOWISE_NEWLINEFLOWISE_NEWLINEFLOWISE_TABhow are you?" }
     * TO: { "value": "hello i am ben\n\n\thow are you?" }
     */
    const promptValues = handleEscapeCharacters(promptValuesRaw, true)

    if (options?.uploads) {
        chain.uploads = options.uploads
    }

    // Built lazily so the streaming handler is only created right before a chain call.
    const buildCallbacks = () =>
        isStreaming
            ? [loggerHandler, new CustomChainHandler(socketIO, socketIOClientId), ...callbacks]
            : [loggerHandler, ...callbacks]

    // Callers may have no prompt connected, in which case there are no variables.
    const variables = inputVariables ?? []

    if (!promptValues || variables.length === 0) {
        const res = await chain.run(input, buildCallbacks())
        return formatResponse(res)
    }

    // Split the variables into those with a configured value and those without.
    const missing: string[] = []
    for (const variable of variables) {
        if (promptValues[variable]) {
            chain.inputKey = variable
        } else {
            missing.push(variable)
        }
    }

    if (missing.length > 1) {
        throw new Error(`Please provide Prompt Values for: ${missing.join(', ')}`)
    }

    let chainValues = { ...promptValues }
    if (missing.length === 1) {
        // If one inputVariable is not specified, use input (user's question) as value
        const [lastValue] = missing
        if (!lastValue) throw new Error('Please provide Prompt Values')
        chain.inputKey = lastValue
        chainValues = {
            ...promptValues,
            [lastValue]: input
        }
    }

    const res = await chain.call(chainValues, buildCallbacks())
    return formatResponse(res?.text)
}
// CommonJS export: exposes the node class under the `nodeClass` key.
module.exports = { nodeClass: OpenAIMultiModalChain_Chains }
@@ -0,0 +1,204 @@
import { OpenAI as OpenAIClient, ClientOptions, OpenAI } from 'openai'
import { BaseChain, ChainInputs } from 'langchain/chains'
import { ChainValues } from 'langchain/schema'
import { BasePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate } from 'langchain/prompts'
import path from 'path'
import { getUserHome } from '../../../src/utils'
import fs from 'fs'
import { ChatCompletionContentPart, ChatCompletionMessageParam } from 'openai/src/resources/chat/completions'
import ChatCompletionCreateParamsNonStreaming = OpenAI.ChatCompletionCreateParamsNonStreaming
import { IFileUpload } from '../../../src'
/**
 * Interface for the input parameters of the VLLMChain class.
 */
export interface OpenAIMultiModalChainInput extends ChainInputs {
/** OpenAI API key. The VLLMChain constructor throws when it is missing. */
openAIApiKey?: string
/** Passed to the OpenAI client as its `organization` option. */
openAIOrganization?: string
/** Stored on the chain; defaults to `false` when omitted. */
throwError?: boolean
/** Optional prompt; for a ChatPromptTemplate, its system and human message templates are added to the request. */
prompt?: BasePromptTemplate
/** Extra OpenAI client options, spread into the client configuration. */
configuration?: ClientOptions
/** Uploaded files; the chain selects audio and image entries by their `mime` prefix. */
uploads?: IFileUpload[]
/** Sent as the `detail` of each image part; defaults to `'low'` when omitted. */
imageResolution?: 'auto' | 'low' | 'high'
/** Forwarded as the request `temperature`. */
temperature?: number
/** Name of the OpenAI model selected for the chain. */
modelName?: string
/** Forwarded as the request `max_tokens`; the chain uses 1024 when it is not set. */
maxTokens?: number
/** Forwarded as the request `top_p`. */
topP?: number
/** Speech-to-text method chosen on the node: `'transcriptions'` or `'translations'`. */
speechToTextMode?: string
}
/**
 * Chain that sends the user's question, any uploaded images and (when
 * speech-to-text is enabled) the text of any uploaded audio to the OpenAI
 * chat-completions API and returns the first choice's message content.
 * It extends the BaseChain class and implements the
 * OpenAIMultiModalChainInput interface.
 */
export class VLLMChain extends BaseChain implements OpenAIMultiModalChainInput {
    static lc_name() {
        return 'VLLMChain'
    }

    prompt: BasePromptTemplate | undefined
    inputKey = 'input'
    outputKey = 'text'
    uploads?: IFileUpload[]
    imageResolution: 'auto' | 'low' | 'high'
    openAIApiKey?: string
    openAIOrganization?: string
    clientConfig: ClientOptions
    client: OpenAIClient
    throwError: boolean
    temperature?: number
    modelName?: string
    maxTokens?: number
    topP?: number
    // Either the method name itself ('transcriptions' | 'translations') or an
    // object carrying it in `purpose`; both shapes are accepted.
    speechToTextMode?: any

    /**
     * @param fields - chain configuration
     * @throws Error when no OpenAI API key is provided
     */
    constructor(fields: OpenAIMultiModalChainInput) {
        super(fields)
        this.throwError = fields?.throwError ?? false
        this.imageResolution = fields?.imageResolution ?? 'low'
        this.openAIApiKey = fields?.openAIApiKey
        this.prompt = fields?.prompt
        this.temperature = fields?.temperature
        this.modelName = fields?.modelName
        this.maxTokens = fields?.maxTokens
        this.topP = fields?.topP
        this.uploads = fields?.uploads ?? []
        // Left undefined when speech-to-text is off, so audio uploads are skipped.
        this.speechToTextMode = fields?.speechToTextMode

        if (!this.openAIApiKey) {
            throw new Error('OpenAI API key not found')
        }

        this.openAIOrganization = fields?.openAIOrganization
        this.clientConfig = {
            ...fields?.configuration,
            apiKey: this.openAIApiKey,
            organization: this.openAIOrganization
        }
        this.client = new OpenAIClient(this.clientConfig)
    }

    /**
     * Builds and sends the chat-completion request.
     *
     * @param values - chain inputs; the user's text is read from `inputKey`,
     *                 falling back to the first of `inputKeys`
     * @returns an object mapping `outputKey` to the first choice's message content
     * @throws Error when the API call fails or returns no choices
     */
    async _call(values: ChainValues): Promise<ChainValues> {
        // The input may have been keyed by the prompt's first variable instead of `inputKey`.
        const userInput = values[this.inputKey] ?? values[this.inputKeys[0]]

        const vRequest: ChatCompletionCreateParamsNonStreaming = {
            // Honour the configured model; the previous hard-coded name is the fallback.
            model: this.modelName || 'gpt-4-vision-preview',
            temperature: this.temperature,
            top_p: this.topP,
            max_tokens: this.maxTokens ? this.maxTokens : 1024,
            messages: []
        }

        const chatMessages: ChatCompletionContentPart[] = []
        if (userInput !== undefined && userInput !== null) {
            chatMessages.push({
                type: 'text',
                text: userInput
            })
        }

        const uploads = this.uploads ?? []

        const speechMode = this.resolveSpeechToTextMode()
        if (speechMode) {
            for (const upload of this.getAudioUploads(uploads)) {
                const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
                // The audio file is stored on the server; stream it to the speech-to-text endpoint.
                const audioFile = fs.createReadStream(filePath)
                const result =
                    speechMode === 'transcriptions'
                        ? await this.client.audio.transcriptions.create({ file: audioFile, model: 'whisper-1' })
                        : await this.client.audio.translations.create({ file: audioFile, model: 'whisper-1' })
                chatMessages.push({
                    type: 'text',
                    text: result.text
                })
            }
        }

        for (const upload of this.getImageUploads(uploads)) {
            let imageUrl = upload.data
            if (upload.type === 'stored-file') {
                const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
                // as the image is stored in the server, read the file and convert it to base64
                const contents = await fs.promises.readFile(filePath)
                imageUrl = 'data:' + upload.mime + ';base64,' + contents.toString('base64')
            }
            chatMessages.push({
                type: 'image_url',
                image_url: {
                    url: imageUrl,
                    detail: this.imageResolution
                }
            })
        }

        const userRole: ChatCompletionMessageParam = { role: 'user', content: chatMessages }
        vRequest.messages.push(userRole)

        // NOTE(review): prompt messages are appended after the user message and their
        // templates are sent as-is (no variable substitution) — confirm this is intended.
        if (this.prompt && this.prompt instanceof ChatPromptTemplate) {
            const chatPrompt = this.prompt as ChatPromptTemplate
            for (const message of chatPrompt.promptMessages as any[]) {
                if (message instanceof SystemMessagePromptTemplate) {
                    vRequest.messages.push({
                        role: 'system',
                        content: (message.prompt as any).template
                    })
                } else if (message instanceof HumanMessagePromptTemplate) {
                    vRequest.messages.push({
                        role: 'user',
                        content: (message.prompt as any).template
                    })
                }
            }
        }

        let response
        try {
            response = await this.client.chat.completions.create(vRequest)
        } catch (error) {
            // Always surface a real Error, whatever was thrown.
            throw error instanceof Error ? error : new Error(String(error))
        }

        const output = response.choices[0]
        if (!output) {
            throw new Error('OpenAI returned no completion choices')
        }
        return {
            [this.outputKey]: output.message.content
        }
    }

    /**
     * Normalises `speechToTextMode` to a supported method name.
     * Returns `undefined` when speech-to-text is disabled or the value is unknown.
     */
    private resolveSpeechToTextMode(): 'transcriptions' | 'translations' | undefined {
        const raw = this.speechToTextMode
        const mode = typeof raw === 'string' ? raw : raw?.purpose
        return mode === 'transcriptions' || mode === 'translations' ? mode : undefined
    }

    /** Returns the uploads whose mime type starts with `audio/`. */
    getAudioUploads = (urls: any[]) => {
        return urls.filter((url: any) => typeof url?.mime === 'string' && url.mime.startsWith('audio/'))
    }

    /** Returns the uploads whose mime type starts with `image/`. */
    getImageUploads = (urls: any[]) => {
        return urls.filter((url: any) => typeof url?.mime === 'string' && url.mime.startsWith('image/'))
    }

    _chainType() {
        return 'vision_chain'
    }

    /**
     * The prompt's input variables, or `[inputKey]` when there is no prompt or
     * the prompt declares none.
     * NOTE(review): presumably the base chain's `run()` keys its single input
     * by the first entry — verify against the langchain BaseChain implementation.
     */
    get inputKeys() {
        const promptVariables = this.prompt?.inputVariables
        return promptVariables && promptVariables.length > 0 ? promptVariables : [this.inputKey]
    }

    get outputKeys(): string[] {
        return [this.outputKey]
    }
}
@@ -0,0 +1,6 @@
<svg xmlns="http://www.w3.org/2000/svg" class="icon icon-tabler icon-tabler-dna" width="24" height="24" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
<path stroke="none" d="M0 0h24v24H0z" fill="none"></path>
<path d="M14.828 14.828a4 4 0 1 0 -5.656 -5.656a4 4 0 0 0 5.656 5.656z"></path>
<path d="M9.172 20.485a4 4 0 1 0 -5.657 -5.657"></path>
<path d="M14.828 3.515a4 4 0 0 0 5.657 5.657"></path>
</svg>

After

Width:  |  Height:  |  Size: 489 B