GPT Vision: Renaming to OpenAIMultiModalChain and merging the functionality of Whisper.

This commit is contained in:
vinodkiran
2024-01-18 13:03:27 +05:30
parent 398a31f426
commit 8a14a52d90
8 changed files with 118 additions and 132 deletions
@@ -1,10 +1,17 @@
import { ICommonObject, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface' import {
ICommonObject,
INode,
INodeData,
INodeOutputsValue,
INodeParams
} from "../../../src/Interface";
import { getBaseClasses, getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils' import { getBaseClasses, getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
import { OpenAIVisionChainInput, VLLMChain } from './VLLMChain' import { OpenAIMultiModalChainInput, VLLMChain } from "./VLLMChain";
import { ConsoleCallbackHandler, CustomChainHandler, additionalCallbacks } from '../../../src/handler' import { ConsoleCallbackHandler, CustomChainHandler, additionalCallbacks } from '../../../src/handler'
import { formatResponse } from '../../outputparsers/OutputParserHelpers' import { formatResponse } from '../../outputparsers/OutputParserHelpers'
import { checkInputs, Moderation, streamResponse } from "../../moderation/Moderation";
class OpenAIVisionChain_Chains implements INode { class OpenAIMultiModalChain_Chains implements INode {
label: string label: string
name: string name: string
version: number version: number
@@ -24,7 +31,7 @@ class OpenAIVisionChain_Chains implements INode {
this.version = 1.0 this.version = 1.0
this.type = 'OpenAIMultiModalChain' this.type = 'OpenAIMultiModalChain'
this.icon = 'chain.svg' this.icon = 'chain.svg'
this.category = 'MultiModal' this.category = 'Chains'
this.badge = 'BETA' this.badge = 'BETA'
this.description = 'Chain to query against Image and Audio Input.' this.description = 'Chain to query against Image and Audio Input.'
this.baseClasses = [this.type, ...getBaseClasses(VLLMChain)] this.baseClasses = [this.type, ...getBaseClasses(VLLMChain)]
@@ -35,18 +42,20 @@ class OpenAIVisionChain_Chains implements INode {
credentialNames: ['openAIApi'] credentialNames: ['openAIApi']
} }
this.inputs = [ this.inputs = [
{
label: 'Audio Input',
name: 'audioInput',
type: 'OpenAIWhisper',
optional: true
},
{ {
label: 'Prompt', label: 'Prompt',
name: 'prompt', name: 'prompt',
type: 'BasePromptTemplate', type: 'BasePromptTemplate',
optional: true optional: true
}, },
{
label: 'Input Moderation',
description: 'Detect text that could generate harmful output and prevent it from being sent to the language model',
name: 'inputModeration',
type: 'Moderation',
optional: true,
list: true
},
{ {
label: 'Model Name', label: 'Model Name',
name: 'modelName', name: 'modelName',
@@ -55,14 +64,38 @@ class OpenAIVisionChain_Chains implements INode {
{ {
label: 'gpt-4-vision-preview', label: 'gpt-4-vision-preview',
name: 'gpt-4-vision-preview' name: 'gpt-4-vision-preview'
},
{
label: 'whisper-1',
name: 'whisper-1'
} }
], ],
default: 'gpt-4-vision-preview' default: 'gpt-4-vision-preview'
}, },
{
label: 'Speech to Text',
name: 'speechToText',
type: 'boolean',
optional: true,
},
// TODO: only show when speechToText is true
{
label: 'Speech to Text Method',
description: 'How to turn audio into text',
name: 'speechToTextMode',
type: 'options',
options: [
{
label: 'Transcriptions',
name: 'transcriptions',
description: 'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
},
{
label: 'Translations',
name: 'translations',
description: 'Translate and transcribe the audio into english.'
}
],
optional: false,
default: 'transcriptions',
additionalParams: true
},
{ {
label: 'Image Resolution', label: 'Image Resolution',
description: 'This parameter controls the resolution in which the model views the image.', description: 'This parameter controls the resolution in which the model views the image.',
@@ -76,6 +109,10 @@ class OpenAIVisionChain_Chains implements INode {
{ {
label: 'High', label: 'High',
name: 'high' name: 'high'
},
{
label: 'Auto',
name: 'auto'
} }
], ],
default: 'low', default: 'low',
@@ -107,18 +144,11 @@ class OpenAIVisionChain_Chains implements INode {
optional: true, optional: true,
additionalParams: true additionalParams: true
}, },
{
label: 'Chain Name',
name: 'chainName',
type: 'string',
placeholder: 'Name Your Chain',
optional: true
},
{ {
label: 'Accepted Upload Types', label: 'Accepted Upload Types',
name: 'allowedUploadTypes', name: 'allowedUploadTypes',
type: 'string', type: 'string',
default: 'image/gif;image/jpeg;image/png;image/webp', default: 'image/gif;image/jpeg;image/png;image/webp;audio/mpeg;audio/x-wav;audio/mp4',
hidden: true hidden: true
}, },
{ {
@@ -154,19 +184,23 @@ class OpenAIVisionChain_Chains implements INode {
const modelName = nodeData.inputs?.modelName as string const modelName = nodeData.inputs?.modelName as string
const maxTokens = nodeData.inputs?.maxTokens as string const maxTokens = nodeData.inputs?.maxTokens as string
const topP = nodeData.inputs?.topP as string const topP = nodeData.inputs?.topP as string
const whisperConfig = nodeData.inputs?.audioInput const speechToText = nodeData.inputs?.speechToText as boolean
const fields: OpenAIVisionChainInput = {
const fields: OpenAIMultiModalChainInput = {
openAIApiKey: openAIApiKey, openAIApiKey: openAIApiKey,
imageResolution: imageResolution, imageResolution: imageResolution,
verbose: process.env.DEBUG === 'true', verbose: process.env.DEBUG === 'true',
imageUrls: options.uploads, uploads: options.uploads,
modelName: modelName modelName: modelName
} }
if (temperature) fields.temperature = parseFloat(temperature) if (temperature) fields.temperature = parseFloat(temperature)
if (maxTokens) fields.maxTokens = parseInt(maxTokens, 10) if (maxTokens) fields.maxTokens = parseInt(maxTokens, 10)
if (topP) fields.topP = parseFloat(topP) if (topP) fields.topP = parseFloat(topP)
if (whisperConfig) fields.whisperConfig = whisperConfig if (speechToText) {
const speechToTextMode = nodeData.inputs?.speechToTextMode ?? 'transcriptions'
if (speechToTextMode) fields.speechToTextMode = speechToTextMode
}
if (output === this.name) { if (output === this.name) {
const chain = new VLLMChain({ const chain = new VLLMChain({
@@ -221,6 +255,17 @@ const runPrediction = async (
const isStreaming = options.socketIO && options.socketIOClientId const isStreaming = options.socketIO && options.socketIOClientId
const socketIO = isStreaming ? options.socketIO : undefined const socketIO = isStreaming ? options.socketIO : undefined
const socketIOClientId = isStreaming ? options.socketIOClientId : '' const socketIOClientId = isStreaming ? options.socketIOClientId : ''
const moderations = nodeData.inputs?.inputModeration as Moderation[]
if (moderations && moderations.length > 0) {
try {
// Use the output of the moderation chain as input for the LLM chain
input = await checkInputs(moderations, input)
} catch (e) {
await new Promise((resolve) => setTimeout(resolve, 500))
streamResponse(isStreaming, e.message, socketIO, socketIOClientId)
return formatResponse(e.message)
}
}
/** /**
* Apply string transformation to reverse converted special chars: * Apply string transformation to reverse converted special chars:
@@ -229,7 +274,7 @@ const runPrediction = async (
*/ */
const promptValues = handleEscapeCharacters(promptValuesRaw, true) const promptValues = handleEscapeCharacters(promptValuesRaw, true)
if (options?.uploads) { if (options?.uploads) {
chain.imageUrls = options.uploads chain.uploads = options.uploads
} }
if (promptValues && inputVariables.length > 0) { if (promptValues && inputVariables.length > 0) {
let seen: string[] = [] let seen: string[] = []
@@ -285,4 +330,4 @@ const runPrediction = async (
} }
} }
module.exports = { nodeClass: OpenAIVisionChain_Chains } module.exports = { nodeClass: OpenAIMultiModalChain_Chains }
@@ -1,27 +1,30 @@
import { OpenAI as OpenAIClient, ClientOptions } from 'openai' import { OpenAI as OpenAIClient, ClientOptions, OpenAI } from 'openai'
import { BaseChain, ChainInputs } from 'langchain/chains' import { BaseChain, ChainInputs } from 'langchain/chains'
import { ChainValues } from 'langchain/schema' import { ChainValues } from 'langchain/schema'
import { BasePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate } from 'langchain/prompts' import { BasePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate } from 'langchain/prompts'
import path from 'path' import path from 'path'
import { getUserHome } from '../../../src/utils' import { getUserHome } from '../../../src/utils'
import fs from 'fs' import fs from 'fs'
import { ChatCompletionContentPart, ChatCompletionMessageParam } from 'openai/src/resources/chat/completions'
import ChatCompletionCreateParamsNonStreaming = OpenAI.ChatCompletionCreateParamsNonStreaming
import { IFileUpload } from '../../../src'
/** /**
* Interface for the input parameters of the OpenAIVisionChain class. * Interface for the input parameters of the OpenAIVisionChain class.
*/ */
export interface OpenAIVisionChainInput extends ChainInputs { export interface OpenAIMultiModalChainInput extends ChainInputs {
openAIApiKey?: string openAIApiKey?: string
openAIOrganization?: string openAIOrganization?: string
throwError?: boolean throwError?: boolean
prompt?: BasePromptTemplate prompt?: BasePromptTemplate
configuration?: ClientOptions configuration?: ClientOptions
imageUrls?: [] uploads?: IFileUpload[]
imageResolution?: string imageResolution?: 'auto' | 'low' | 'high'
temperature?: number temperature?: number
modelName?: string modelName?: string
maxTokens?: number maxTokens?: number
topP?: number topP?: number
whisperConfig?: any speechToTextMode?: string
} }
/** /**
@@ -29,7 +32,7 @@ export interface OpenAIVisionChainInput extends ChainInputs {
* Vision API. It extends the BaseChain class and implements the * Vision API. It extends the BaseChain class and implements the
* OpenAIVisionChainInput interface. * OpenAIVisionChainInput interface.
*/ */
export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { export class VLLMChain extends BaseChain implements OpenAIMultiModalChainInput {
static lc_name() { static lc_name() {
return 'VLLMChain' return 'VLLMChain'
} }
@@ -37,8 +40,8 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
inputKey = 'input' inputKey = 'input'
outputKey = 'text' outputKey = 'text'
imageUrls?: [] uploads?: IFileUpload[]
imageResolution: string = 'low' imageResolution: 'auto' | 'low' | 'high'
openAIApiKey?: string openAIApiKey?: string
openAIOrganization?: string openAIOrganization?: string
clientConfig: ClientOptions clientConfig: ClientOptions
@@ -49,9 +52,9 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
maxTokens?: number maxTokens?: number
topP?: number topP?: number
whisperConfig?: any speechToTextMode?: any
constructor(fields: OpenAIVisionChainInput) { constructor(fields: OpenAIMultiModalChainInput) {
super(fields) super(fields)
this.throwError = fields?.throwError ?? false this.throwError = fields?.throwError ?? false
this.imageResolution = fields?.imageResolution ?? 'low' this.imageResolution = fields?.imageResolution ?? 'low'
@@ -61,8 +64,8 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
this.modelName = fields?.modelName this.modelName = fields?.modelName
this.maxTokens = fields?.maxTokens this.maxTokens = fields?.maxTokens
this.topP = fields?.topP this.topP = fields?.topP
this.imageUrls = fields?.imageUrls ?? [] this.uploads = fields?.uploads ?? []
this.whisperConfig = fields?.whisperConfig ?? {} this.speechToTextMode = fields?.speechToTextMode ?? {}
if (!this.openAIApiKey) { if (!this.openAIApiKey) {
throw new Error('OpenAI API key not found') throw new Error('OpenAI API key not found')
} }
@@ -81,8 +84,8 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
async _call(values: ChainValues): Promise<ChainValues> { async _call(values: ChainValues): Promise<ChainValues> {
const userInput = values[this.inputKey] const userInput = values[this.inputKey]
const vRequest: any = { const vRequest: ChatCompletionCreateParamsNonStreaming = {
model: this.modelName, model: 'gpt-4-vision-preview',
temperature: this.temperature, temperature: this.temperature,
top_p: this.topP, top_p: this.topP,
messages: [] messages: []
@@ -90,42 +93,42 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
if (this.maxTokens) vRequest.max_tokens = this.maxTokens if (this.maxTokens) vRequest.max_tokens = this.maxTokens
else vRequest.max_tokens = 1024 else vRequest.max_tokens = 1024
const userRole: any = { role: 'user' } const chatMessages: ChatCompletionContentPart[] = []
userRole.content = [] const userRole: ChatCompletionMessageParam = { role: 'user', content: [] }
userRole.content.push({ chatMessages.push({
type: 'text', type: 'text',
text: userInput text: userInput
}) })
if (this.whisperConfig && this.imageUrls && this.imageUrls.length > 0) { if (this.speechToTextMode && this.uploads && this.uploads.length > 0) {
const audioUploads = this.getAudioUploads(this.imageUrls) const audioUploads = this.getAudioUploads(this.uploads)
for (const url of audioUploads) { for (const url of audioUploads) {
const filePath = path.join(getUserHome(), '.flowise', 'gptvision', url.data, url.name) const filePath = path.join(getUserHome(), '.flowise', 'gptvision', url.data, url.name)
// as the image is stored in the server, read the file and convert it to base64 // as the image is stored in the server, read the file and convert it to base64
const audio_file = fs.createReadStream(filePath) const audio_file = fs.createReadStream(filePath)
if (this.whisperConfig.purpose === 'transcription') { if (this.speechToTextMode.purpose === 'transcriptions') {
const transcription = await this.client.audio.transcriptions.create({ const transcription = await this.client.audio.transcriptions.create({
file: audio_file, file: audio_file,
model: 'whisper-1' model: 'whisper-1'
}) })
userRole.content.push({ chatMessages.push({
type: 'text', type: 'text',
text: transcription.text text: transcription.text
}) })
} else if (this.whisperConfig.purpose === 'translation') { } else if (this.speechToTextMode.purpose === 'translations') {
const translation = await this.client.audio.translations.create({ const translation = await this.client.audio.translations.create({
file: audio_file, file: audio_file,
model: 'whisper-1' model: 'whisper-1'
}) })
userRole.content.push({ chatMessages.push({
type: 'text', type: 'text',
text: translation.text text: translation.text
}) })
} }
} }
} }
if (this.imageUrls && this.imageUrls.length > 0) { if (this.uploads && this.uploads.length > 0) {
const imageUploads = this.getImageUploads(this.imageUrls) const imageUploads = this.getImageUploads(this.uploads)
for (const url of imageUploads) { for (const url of imageUploads) {
let bf = url.data let bf = url.data
if (url.type == 'stored-file') { if (url.type == 'stored-file') {
@@ -135,7 +138,7 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
const contents = fs.readFileSync(filePath) const contents = fs.readFileSync(filePath)
bf = 'data:' + url.mime + ';base64,' + contents.toString('base64') bf = 'data:' + url.mime + ';base64,' + contents.toString('base64')
} }
userRole.content.push({ chatMessages.push({
type: 'image_url', type: 'image_url',
image_url: { image_url: {
url: bf, url: bf,
@@ -144,6 +147,7 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
}) })
} }
} }
userRole.content = chatMessages
vRequest.messages.push(userRole) vRequest.messages.push(userRole)
if (this.prompt && this.prompt instanceof ChatPromptTemplate) { if (this.prompt && this.prompt instanceof ChatPromptTemplate) {
let chatPrompt = this.prompt as ChatPromptTemplate let chatPrompt = this.prompt as ChatPromptTemplate
@@ -151,12 +155,12 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
if (message instanceof SystemMessagePromptTemplate) { if (message instanceof SystemMessagePromptTemplate) {
vRequest.messages.push({ vRequest.messages.push({
role: 'system', role: 'system',
content: [ content: (message.prompt as any).template
{ })
type: 'text', } else if (message instanceof HumanMessagePromptTemplate) {
text: (message.prompt as any).template vRequest.messages.push({
} role: 'user',
] content: (message.prompt as any).template
}) })
} }
}) })
@@ -164,7 +168,6 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput {
let response let response
try { try {
// @ts-ignore
response = await this.client.chat.completions.create(vRequest) response = await this.client.chat.completions.create(vRequest)
} catch (error) { } catch (error) {
if (error instanceof Error) { if (error instanceof Error) {

Before

Width:  |  Height:  |  Size: 489 B

After

Width:  |  Height:  |  Size: 489 B

@@ -1,66 +0,0 @@
import { INode, INodeData, INodeParams } from '../../../src'
/**
 * Node definition for speech-to-text through the OpenAI Whisper API.
 *
 * The constructor only populates static node metadata and the list of
 * configurable inputs; `init` hands the selected purpose back to the caller.
 */
class OpenAIAudioWhisper implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    badge: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]

    constructor() {
        // Selectable mode: transcription (the default) or translation.
        const purposeParam: INodeParams = {
            label: 'Purpose',
            name: 'purpose',
            type: 'options',
            options: [
                { label: 'Transcription', name: 'transcription' },
                { label: 'Translation', name: 'translation' }
            ],
            default: 'transcription'
        }

        // Hidden parameter: semicolon-separated list of accepted audio MIME types.
        const uploadTypesParam: INodeParams = {
            label: 'Accepted Upload Types',
            name: 'allowedUploadTypes',
            type: 'string',
            default: 'audio/mpeg;audio/x-wav;audio/mp4',
            hidden: true
        }

        // Hidden parameter: upload size limit in MB (default is kept as the string '5').
        const uploadSizeParam: INodeParams = {
            label: 'Maximum Upload Size (MB)',
            name: 'maxUploadSize',
            type: 'number',
            default: '5',
            hidden: true
        }

        this.label = 'Open AI Whisper'
        this.name = 'openAIAudioWhisper'
        this.version = 1.0
        this.type = 'OpenAIWhisper'
        this.description = 'Speech to text using OpenAI Whisper API'
        this.icon = 'audio.svg'
        this.badge = 'BETA'
        this.category = 'MultiModal'
        // The node's own type is its only base class.
        this.baseClasses = [this.type]
        this.inputs = [purposeParam, uploadTypesParam, uploadSizeParam]
    }

    /**
     * Resolves the node's runtime configuration.
     *
     * @param nodeData - node data whose `inputs.purpose` holds the selected purpose.
     * @returns an object with the single key `purpose` (undefined when no inputs were supplied).
     */
    async init(nodeData: INodeData): Promise<any> {
        return { purpose: nodeData.inputs?.purpose as string }
    }
}
// CommonJS export: exposes the node class under the `nodeClass` key.
module.exports = { nodeClass: OpenAIAudioWhisper }
@@ -1 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" id="mdi-text-to-speech" width="24" height="24" viewBox="0 0 24 24"><path d="M8,7A2,2 0 0,1 10,9V14A2,2 0 0,1 8,16A2,2 0 0,1 6,14V9A2,2 0 0,1 8,7M14,14C14,16.97 11.84,19.44 9,19.92V22H7V19.92C4.16,19.44 2,16.97 2,14H4A4,4 0 0,0 8,18A4,4 0 0,0 12,14H14M21.41,9.41L17.17,13.66L18.18,10H14A2,2 0 0,1 12,8V4A2,2 0 0,1 14,2H20A2,2 0 0,1 22,4V8C22,8.55 21.78,9.05 21.41,9.41Z" /></svg>

Before

Width:  |  Height:  |  Size: 611 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.9 KiB

+7
View File
@@ -234,3 +234,10 @@ export abstract class FlowiseSummaryMemory extends ConversationSummaryMemory imp
abstract addChatMessages(msgArray: { text: string; type: MessageType }[], overrideSessionId?: string): Promise<void> abstract addChatMessages(msgArray: { text: string; type: MessageType }[], overrideSessionId?: string): Promise<void>
abstract clearChatMessages(overrideSessionId?: string): Promise<void> abstract clearChatMessages(overrideSessionId?: string): Promise<void>
} }
/**
 * Descriptor of a single file uploaded alongside a chat message.
 */
export interface IFileUpload {
    /** File name of the upload. */
    name: string
    /** MIME type of the file, e.g. `image/png` or `audio/mpeg`. */
    mime: string
    /** Kind of upload, e.g. `stored-file` (presumably other kinds exist; confirm against callers). */
    type: string
    /** Payload of the upload; for `stored-file` uploads it appears to be a storage path segment rather than inline content (TODO confirm). */
    data: string
}
+1 -3
View File
@@ -1695,9 +1695,7 @@ export class App {
if (!endingNodeData) return res.status(500).send(`Ending node ${endingNode.id} data not found`) if (!endingNodeData) return res.status(500).send(`Ending node ${endingNode.id} data not found`)
if (endingNodeData && endingNodeData.category !== 'Chains' && endingNodeData.category !== 'Agents') { if (endingNodeData && endingNodeData.category !== 'Chains' && endingNodeData.category !== 'Agents') {
if (endingNodeData.type !== 'OpenAIMultiModalChain') { return res.status(500).send(`Ending node must be either a Chain or Agent`)
return res.status(500).send(`Ending node must be either a Chain or Agent`)
}
} }
if ( if (