diff --git a/packages/components/nodes/multimodal/OpenAI/OpenAIVisionChain.ts b/packages/components/nodes/chains/OpenAIMultiModalChain/OpenAIMultiModalChain.ts similarity index 77% rename from packages/components/nodes/multimodal/OpenAI/OpenAIVisionChain.ts rename to packages/components/nodes/chains/OpenAIMultiModalChain/OpenAIMultiModalChain.ts index 1ff4f4c9..f62d58bc 100644 --- a/packages/components/nodes/multimodal/OpenAI/OpenAIVisionChain.ts +++ b/packages/components/nodes/chains/OpenAIMultiModalChain/OpenAIMultiModalChain.ts @@ -1,10 +1,17 @@ -import { ICommonObject, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface' +import { + ICommonObject, + INode, + INodeData, + INodeOutputsValue, + INodeParams +} from "../../../src/Interface"; import { getBaseClasses, getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils' -import { OpenAIVisionChainInput, VLLMChain } from './VLLMChain' +import { OpenAIMultiModalChainInput, VLLMChain } from "./VLLMChain"; import { ConsoleCallbackHandler, CustomChainHandler, additionalCallbacks } from '../../../src/handler' import { formatResponse } from '../../outputparsers/OutputParserHelpers' +import { checkInputs, Moderation, streamResponse } from "../../moderation/Moderation"; -class OpenAIVisionChain_Chains implements INode { +class OpenAIMultiModalChain_Chains implements INode { label: string name: string version: number @@ -24,7 +31,7 @@ class OpenAIVisionChain_Chains implements INode { this.version = 1.0 this.type = 'OpenAIMultiModalChain' this.icon = 'chain.svg' - this.category = 'MultiModal' + this.category = 'Chains' this.badge = 'BETA' this.description = 'Chain to query against Image and Audio Input.' this.baseClasses = [this.type, ...getBaseClasses(VLLMChain)] @@ -35,18 +42,20 @@ class OpenAIVisionChain_Chains implements INode { credentialNames: ['openAIApi'] } this.inputs = [ - { - label: 'Audio Input', - name: 'audioInput', - type: 'OpenAIWhisper', - optional: true - }, { label: 'Prompt', name: 'prompt', type: 'BasePromptTemplate', optional: true }, + { + label: 'Input Moderation', + description: 'Detect text that could generate harmful output and prevent it from being sent to the language model', + name: 'inputModeration', + type: 'Moderation', + optional: true, + list: true + }, { label: 'Model Name', name: 'modelName', @@ -55,14 +64,38 @@ class OpenAIVisionChain_Chains implements INode { { label: 'gpt-4-vision-preview', name: 'gpt-4-vision-preview' - }, - { - label: 'whisper-1', - name: 'whisper-1' } ], default: 'gpt-4-vision-preview' }, + { + label: 'Speech to Text', + name: 'speechToText', + type: 'boolean', + optional: true, + }, + // TODO: only show when speechToText is true + { + label: 'Speech to Text Method', + description: 'How to turn audio into text', + name: 'speechToTextMode', + type: 'options', + options: [ + { + label: 'Transcriptions', + name: 'transcriptions', + description: 'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.' + }, + { + label: 'Translations', + name: 'translations', + description: 'Translate and transcribe the audio into english.' + } + ], + optional: false, + default: 'transcriptions', + additionalParams: true + }, { label: 'Image Resolution', description: 'This parameter controls the resolution in which the model views the image.', @@ -76,6 +109,10 @@ class OpenAIVisionChain_Chains implements INode { { label: 'High', name: 'high' + }, + { + label: 'Auto', + name: 'auto' } ], default: 'low', @@ -107,18 +144,11 @@ class OpenAIVisionChain_Chains implements INode { optional: true, additionalParams: true }, - { - label: 'Chain Name', - name: 'chainName', - type: 'string', - placeholder: 'Name Your Chain', - optional: true - }, { label: 'Accepted Upload Types', name: 'allowedUploadTypes', type: 'string', - default: 'image/gif;image/jpeg;image/png;image/webp', + default: 'image/gif;image/jpeg;image/png;image/webp;audio/mpeg;audio/x-wav;audio/mp4', hidden: true }, { @@ -154,19 +184,23 @@ class OpenAIVisionChain_Chains implements INode { const modelName = nodeData.inputs?.modelName as string const maxTokens = nodeData.inputs?.maxTokens as string const topP = nodeData.inputs?.topP as string - const whisperConfig = nodeData.inputs?.audioInput + const speechToText = nodeData.inputs?.speechToText as boolean - const fields: OpenAIVisionChainInput = { + + const fields: OpenAIMultiModalChainInput = { openAIApiKey: openAIApiKey, imageResolution: imageResolution, verbose: process.env.DEBUG === 'true', - imageUrls: options.uploads, + uploads: options.uploads, modelName: modelName } if (temperature) fields.temperature = parseFloat(temperature) if (maxTokens) fields.maxTokens = parseInt(maxTokens, 10) if (topP) fields.topP = parseFloat(topP) - if (whisperConfig) fields.whisperConfig = whisperConfig + if (speechToText) { + const speechToTextMode = nodeData.inputs?.speechToTextMode ?? 'transcriptions' + if (speechToTextMode) fields.speechToTextMode = speechToTextMode + } if (output === this.name) { const chain = new VLLMChain({ @@ -221,6 +255,17 @@ const runPrediction = async ( const isStreaming = options.socketIO && options.socketIOClientId const socketIO = isStreaming ? options.socketIO : undefined const socketIOClientId = isStreaming ? options.socketIOClientId : '' + const moderations = nodeData.inputs?.inputModeration as Moderation[] + if (moderations && moderations.length > 0) { + try { + // Use the output of the moderation chain as input for the LLM chain + input = await checkInputs(moderations, input) + } catch (e) { + await new Promise((resolve) => setTimeout(resolve, 500)) + streamResponse(isStreaming, e.message, socketIO, socketIOClientId) + return formatResponse(e.message) + } + } /** * Apply string transformation to reverse converted special chars: @@ -229,7 +274,7 @@ const runPrediction = async ( */ const promptValues = handleEscapeCharacters(promptValuesRaw, true) if (options?.uploads) { - chain.imageUrls = options.uploads + chain.uploads = options.uploads } if (promptValues && inputVariables.length > 0) { let seen: string[] = [] @@ -285,4 +330,4 @@ const runPrediction = async ( } } -module.exports = { nodeClass: OpenAIVisionChain_Chains } +module.exports = { nodeClass: OpenAIMultiModalChain_Chains } diff --git a/packages/components/nodes/multimodal/OpenAI/VLLMChain.ts b/packages/components/nodes/chains/OpenAIMultiModalChain/VLLMChain.ts similarity index 71% rename from packages/components/nodes/multimodal/OpenAI/VLLMChain.ts rename to packages/components/nodes/chains/OpenAIMultiModalChain/VLLMChain.ts index dd44ebb5..2cf2ce95 100644 --- a/packages/components/nodes/multimodal/OpenAI/VLLMChain.ts +++ b/packages/components/nodes/chains/OpenAIMultiModalChain/VLLMChain.ts @@ -1,27 +1,30 @@ -import { OpenAI as OpenAIClient, ClientOptions } from 'openai' +import { OpenAI as OpenAIClient, ClientOptions, OpenAI } from 'openai' import { BaseChain, ChainInputs } from 'langchain/chains' import { ChainValues } from 'langchain/schema' -import { BasePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate } from 'langchain/prompts' +import { BasePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate } from 'langchain/prompts' import path from 'path' import { getUserHome } from '../../../src/utils' import fs from 'fs' +import { ChatCompletionContentPart, ChatCompletionMessageParam } from 'openai/src/resources/chat/completions' +import ChatCompletionCreateParamsNonStreaming = OpenAI.ChatCompletionCreateParamsNonStreaming +import { IFileUpload } from '../../../src' /** * Interface for the input parameters of the OpenAIVisionChain class. */ -export interface OpenAIVisionChainInput extends ChainInputs { +export interface OpenAIMultiModalChainInput extends ChainInputs { openAIApiKey?: string openAIOrganization?: string throwError?: boolean prompt?: BasePromptTemplate configuration?: ClientOptions - imageUrls?: [] - imageResolution?: string + uploads?: IFileUpload[] + imageResolution?: 'auto' | 'low' | 'high' temperature?: number modelName?: string maxTokens?: number topP?: number - whisperConfig?: any + speechToTextMode?: string } /** @@ -29,7 +32,7 @@ export interface OpenAIVisionChainInput extends ChainInputs { * Vision API. It extends the BaseChain class and implements the * OpenAIVisionChainInput interface. */ -export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { +export class VLLMChain extends BaseChain implements OpenAIMultiModalChainInput { static lc_name() { return 'VLLMChain' } @@ -37,8 +40,8 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { inputKey = 'input' outputKey = 'text' - imageUrls?: [] - imageResolution: string = 'low' + uploads?: IFileUpload[] + imageResolution: 'auto' | 'low' | 'high' openAIApiKey?: string openAIOrganization?: string clientConfig: ClientOptions @@ -49,9 +52,9 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { maxTokens?: number topP?: number - whisperConfig?: any + speechToTextMode?: any - constructor(fields: OpenAIVisionChainInput) { + constructor(fields: OpenAIMultiModalChainInput) { super(fields) this.throwError = fields?.throwError ?? false this.imageResolution = fields?.imageResolution ?? 'low' @@ -61,8 +64,8 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { this.modelName = fields?.modelName this.maxTokens = fields?.maxTokens this.topP = fields?.topP - this.imageUrls = fields?.imageUrls ?? [] - this.whisperConfig = fields?.whisperConfig ?? {} + this.uploads = fields?.uploads ?? [] + this.speechToTextMode = fields?.speechToTextMode ?? {} if (!this.openAIApiKey) { throw new Error('OpenAI API key not found') } @@ -81,8 +84,8 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { async _call(values: ChainValues): Promise { const userInput = values[this.inputKey] - const vRequest: any = { - model: this.modelName, + const vRequest: ChatCompletionCreateParamsNonStreaming = { + model: 'gpt-4-vision-preview', temperature: this.temperature, top_p: this.topP, messages: [] @@ -90,42 +93,42 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { if (this.maxTokens) vRequest.max_tokens = this.maxTokens else vRequest.max_tokens = 1024 - const userRole: any = { role: 'user' } - userRole.content = [] - userRole.content.push({ + const chatMessages: ChatCompletionContentPart[] = [] + const userRole: ChatCompletionMessageParam = { role: 'user', content: [] } + chatMessages.push({ type: 'text', text: userInput }) - if (this.whisperConfig && this.imageUrls && this.imageUrls.length > 0) { - const audioUploads = this.getAudioUploads(this.imageUrls) + if (this.speechToTextMode && this.uploads && this.uploads.length > 0) { + const audioUploads = this.getAudioUploads(this.uploads) for (const url of audioUploads) { const filePath = path.join(getUserHome(), '.flowise', 'gptvision', url.data, url.name) // as the image is stored in the server, read the file and convert it to base64 const audio_file = fs.createReadStream(filePath) - if (this.whisperConfig.purpose === 'transcription') { + if (this.speechToTextMode.purpose === 'transcriptions') { const transcription = await this.client.audio.transcriptions.create({ file: audio_file, model: 'whisper-1' }) - userRole.content.push({ + chatMessages.push({ type: 'text', text: transcription.text }) - } else if (this.whisperConfig.purpose === 'translation') { + } else if (this.speechToTextMode.purpose === 'translations') { const translation = await this.client.audio.translations.create({ file: audio_file, model: 'whisper-1' }) - userRole.content.push({ + chatMessages.push({ type: 'text', text: translation.text }) } } } - if (this.imageUrls && this.imageUrls.length > 0) { - const imageUploads = this.getImageUploads(this.imageUrls) + if (this.uploads && this.uploads.length > 0) { + const imageUploads = this.getImageUploads(this.uploads) for (const url of imageUploads) { let bf = url.data if (url.type == 'stored-file') { @@ -135,7 +138,7 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { const contents = fs.readFileSync(filePath) bf = 'data:' + url.mime + ';base64,' + contents.toString('base64') } - userRole.content.push({ + chatMessages.push({ type: 'image_url', image_url: { url: bf, @@ -144,6 +147,7 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { }) } } + userRole.content = chatMessages vRequest.messages.push(userRole) if (this.prompt && this.prompt instanceof ChatPromptTemplate) { let chatPrompt = this.prompt as ChatPromptTemplate @@ -151,12 +155,12 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { if (message instanceof SystemMessagePromptTemplate) { vRequest.messages.push({ role: 'system', - content: [ - { - type: 'text', - text: (message.prompt as any).template - } - ] + content: (message.prompt as any).template + }) + } else if (message instanceof HumanMessagePromptTemplate) { + vRequest.messages.push({ + role: 'user', + content: (message.prompt as any).template }) } }) @@ -164,7 +168,6 @@ export class VLLMChain extends BaseChain implements OpenAIVisionChainInput { let response try { - // @ts-ignore response = await this.client.chat.completions.create(vRequest) } catch (error) { if (error instanceof Error) { diff --git a/packages/components/nodes/multimodal/OpenAI/chain.svg b/packages/components/nodes/chains/OpenAIMultiModalChain/chain.svg similarity index 100% rename from packages/components/nodes/multimodal/OpenAI/chain.svg rename to packages/components/nodes/chains/OpenAIMultiModalChain/chain.svg diff --git a/packages/components/nodes/multimodal/OpenAI/AudioWhisper.ts b/packages/components/nodes/multimodal/OpenAI/AudioWhisper.ts deleted file mode 100644 index aa2c71e1..00000000 --- a/packages/components/nodes/multimodal/OpenAI/AudioWhisper.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { INode, INodeData, INodeParams } from '../../../src' - -class OpenAIAudioWhisper implements INode { - label: string - name: string - version: number - description: string - type: string - icon: string - badge: string - category: string - baseClasses: string[] - inputs: INodeParams[] - - constructor() { - this.label = 'Open AI Whisper' - this.name = 'openAIAudioWhisper' - this.version = 1.0 - this.type = 'OpenAIWhisper' - this.description = 'Speech to text using OpenAI Whisper API' - this.icon = 'audio.svg' - this.badge = 'BETA' - this.category = 'MultiModal' - this.baseClasses = [this.type] - this.inputs = [ - { - label: 'Purpose', - name: 'purpose', - type: 'options', - options: [ - { - label: 'Transcription', - name: 'transcription' - }, - { - label: 'Translation', - name: 'translation' - } - ], - default: 'transcription' - }, - { - label: 'Accepted Upload Types', - name: 'allowedUploadTypes', - type: 'string', - default: 'audio/mpeg;audio/x-wav;audio/mp4', - hidden: true - }, - { - label: 'Maximum Upload Size (MB)', - name: 'maxUploadSize', - type: 'number', - default: '5', - hidden: true - } - ] - } - - async init(nodeData: INodeData): Promise { - const purpose = nodeData.inputs?.purpose as string - - return { purpose } - } -} - -module.exports = { nodeClass: OpenAIAudioWhisper } diff --git a/packages/components/nodes/multimodal/OpenAI/audio.svg b/packages/components/nodes/multimodal/OpenAI/audio.svg deleted file mode 100644 index 3bcbbdcd..00000000 --- a/packages/components/nodes/multimodal/OpenAI/audio.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/packages/components/nodes/multimodal/OpenAI/list.png b/packages/components/nodes/multimodal/OpenAI/list.png deleted file mode 100644 index acb4e5d6..00000000 Binary files a/packages/components/nodes/multimodal/OpenAI/list.png and /dev/null differ diff --git a/packages/components/src/Interface.ts b/packages/components/src/Interface.ts index 676618e5..e7f6fe86 100644 --- a/packages/components/src/Interface.ts +++ b/packages/components/src/Interface.ts @@ -234,3 +234,10 @@ export abstract class FlowiseSummaryMemory extends ConversationSummaryMemory imp abstract addChatMessages(msgArray: { text: string; type: MessageType }[], overrideSessionId?: string): Promise abstract clearChatMessages(overrideSessionId?: string): Promise } + +export interface IFileUpload { + data: string + type: string + name: string + mime: string +} \ No newline at end of file diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 4451b838..61aff470 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -1695,9 +1695,7 @@ export class App { if (!endingNodeData) return res.status(500).send(`Ending node ${endingNode.id} data not found`) if (endingNodeData && endingNodeData.category !== 'Chains' && endingNodeData.category !== 'Agents') { - if (endingNodeData.type !== 'OpenAIMultiModalChain') { - return res.status(500).send(`Ending node must be either a Chain or Agent`) - } + return res.status(500).send(`Ending node must be either a Chain or Agent`) } if (