diff --git a/packages/components/nodes/chains/ConversationChain/ConversationChain.ts b/packages/components/nodes/chains/ConversationChain/ConversationChain.ts index 0bba9b3c..19e6bec1 100644 --- a/packages/components/nodes/chains/ConversationChain/ConversationChain.ts +++ b/packages/components/nodes/chains/ConversationChain/ConversationChain.ts @@ -8,8 +8,7 @@ import { flatten } from 'lodash' import { Document } from 'langchain/document' import { RunnableSequence } from 'langchain/schema/runnable' import { StringOutputParser } from 'langchain/schema/output_parser' -import { addImagesToMessages, processSpeechToText } from '../../../src/MultiModalUtils' -import { HumanMessage } from 'langchain/schema' +import { injectChainNodeData } from '../../../src/MultiModalUtils' let systemMessage = `The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.` const inputKey = 'input' @@ -75,7 +74,7 @@ class ConversationChain_Chains implements INode { async run(nodeData: INodeData, input: string, options: ICommonObject): Promise { const memory = nodeData.inputs?.memory - input = await processSpeechToText(nodeData, input, options) + injectChainNodeData(nodeData, options) const chain = prepareChain(nodeData, options, this.sessionId) @@ -132,24 +131,12 @@ const prepareChatPrompt = (nodeData: INodeData, options: ICommonObject) => { if (finalText) systemMessage = `${systemMessage}\nThe AI has the following context:\n${finalText}` - // TODO: add audio uploads - // if (options.uploads.length > 0) { - // const audioUploads = getAudioUploads(options.uploads) - // for (const upload of audioUploads) { - // await this.processAudioWithWhisper(upload, chatMessages) - // } - // } - const imageContent = addImagesToMessages(nodeData, options) - //TODO, this should not be any[], what interface should it be? let promptMessages: any[] = [ SystemMessagePromptTemplate.fromTemplate(prompt ? `${prompt}\n${systemMessage}` : systemMessage), new MessagesPlaceholder(memory.memoryKey ?? 'chat_history'), HumanMessagePromptTemplate.fromTemplate(`{${inputKey}}`) ] - if (imageContent.length > 0) { - promptMessages.push(new HumanMessage({ content: imageContent })) - } const chatPrompt = ChatPromptTemplate.fromMessages(promptMessages) return chatPrompt diff --git a/packages/components/nodes/chains/LLMChain/LLMChain.ts b/packages/components/nodes/chains/LLMChain/LLMChain.ts index b7c055e4..2e7e29f7 100644 --- a/packages/components/nodes/chains/LLMChain/LLMChain.ts +++ b/packages/components/nodes/chains/LLMChain/LLMChain.ts @@ -8,6 +8,7 @@ import { formatResponse, injectOutputParser } from '../../outputparsers/OutputPa import { BaseLLMOutputParser } from 'langchain/schema/output_parser' import { OutputFixingParser } from 'langchain/output_parsers' import { checkInputs, Moderation, streamResponse } from '../../moderation/Moderation' +import { injectChainNodeData } from '../../../src/MultiModalUtils' class LLMChain_Chains implements INode { label: string @@ -129,6 +130,7 @@ class LLMChain_Chains implements INode { if (!this.outputParser && outputParser) { this.outputParser = outputParser } + injectChainNodeData(nodeData, options) promptValues = injectOutputParser(this.outputParser, chain, promptValues) const res = await runPrediction(inputVariables, chain, input, promptValues, options, nodeData) // eslint-disable-next-line no-console diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts index bc5814d0..9543f1ee 100644 --- a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts +++ b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts @@ -3,6 +3,7 @@ import { getBaseClasses, getCredentialData, getCredentialParam } from '../../../ import { ChatOpenAI, OpenAIChatInput } from 'langchain/chat_models/openai' import { BaseCache } from 'langchain/schema' import { BaseLLMParams } from 'langchain/llms/base' +import { FlowiseChatOpenAI } from './FlowiseChatOpenAI' class ChatOpenAI_ChatModels implements INode { label: string @@ -157,13 +158,7 @@ class ChatOpenAI_ChatModels implements INode { label: 'Allow Image Uploads', name: 'allowImageUploads', type: 'boolean', - default: false, - optional: true - }, - { - label: 'Allow Audio Uploads', - name: 'allowAudioUploads', - type: 'boolean', + description: 'Enabling this option, would default the model to gpt-4-vision-preview', default: false, optional: true }, @@ -236,7 +231,6 @@ class ChatOpenAI_ChatModels implements INode { const baseOptions = nodeData.inputs?.baseOptions const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean - const allowAudioUploads = nodeData.inputs?.allowAudioUploads as boolean const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean const speechToTextMode = nodeData.inputs?.speechToTextMode as string const imageResolution = nodeData.inputs?.imageResolution as string @@ -269,24 +263,18 @@ class ChatOpenAI_ChatModels implements INode { throw new Error("Invalid JSON in the ChatOpenAI's BaseOptions: " + exception) } } - const model = new ChatOpenAI(obj, { - basePath, + const model = new FlowiseChatOpenAI(obj, { + baseURL: basePath, baseOptions: parsedBaseOptions }) const multiModal = { allowImageUploads: allowImageUploads ?? false, - allowAudioUploads: allowAudioUploads ?? false, allowSpeechToText: allowSpeechToText ?? false, imageResolution, speechToTextMode } - Object.defineProperty(model, 'multiModal', { - enumerable: true, - configurable: true, - writable: true, - value: multiModal - }) + model.multiModal = multiModal return model } } diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts new file mode 100644 index 00000000..8af9c4df --- /dev/null +++ b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts @@ -0,0 +1,71 @@ +import { ChatOpenAI, OpenAIChatInput } from 'langchain/chat_models/openai' +import { BaseChatModelParams } from 'langchain/chat_models/base' +import type { ClientOptions } from 'openai' +import type { LegacyOpenAIInput } from '@langchain/openai/dist/types' +import { BaseLanguageModelInput } from 'langchain/base_language' +import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models' +import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema' +import { Callbacks } from '@langchain/core/callbacks/manager' +import { ICommonObject, INodeData } from '../../../src' +import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils' +import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts' + +export class FlowiseChatOpenAI extends ChatOpenAI { + multiModal: {} + //TODO: Should be class variables and not static + public static chainNodeData: INodeData + public static chainNodeOptions: ICommonObject + + constructor( + fields?: Partial & BaseChatModelParams & { openAIApiKey?: string }, + /** @deprecated */ + configuration?: ClientOptions & LegacyOpenAIInput + ) { + super(fields) + } + + async invoke(input: BaseLanguageModelInput, options?: ChatOpenAICallOptions): Promise { + //input.messages + return super.invoke(input, options) + } + + async generate(messages: BaseMessageLike[][], options?: string[] | ChatOpenAICallOptions, callbacks?: Callbacks): Promise { + //messages + await this.injectMultiModalMessages(messages) + return super.generate(messages, options, callbacks) + } + + private async injectMultiModalMessages(messages: BaseMessageLike[][]) { + const nodeData = FlowiseChatOpenAI.chainNodeData + const optionsData = FlowiseChatOpenAI.chainNodeOptions + let audioTrans = await checkSpeechToText(nodeData, optionsData) + if (audioTrans) { + if (messages.length > 0) { + const lastMessage = messages[0].pop() as HumanMessage + if (!nodeData.inputs?.prompt) { + lastMessage.content = audioTrans + } else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) { + lastMessage.content = audioTrans + } else if (nodeData.inputs?.prompt instanceof PromptTemplate) { + let prompt = nodeData.inputs?.prompt as PromptTemplate + let inputVar = prompt.inputVariables[0] + let formattedValues: any = {} + formattedValues[inputVar] = audioTrans + lastMessage.content = await prompt.format(formattedValues) + } + messages[0].push(lastMessage) + } + } + const messageContent = addImagesToMessages(nodeData, optionsData) + if (messageContent) { + if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) { + const lastMessage = messages[0].pop() + if (lastMessage instanceof HumanMessage) { + lastMessage.content = messageContent + this.modelName = 'gpt-4-vision-preview' + } + messages[0].push(lastMessage as HumanMessage) + } + } + } +} diff --git a/packages/components/src/MultiModalUtils.ts b/packages/components/src/MultiModalUtils.ts index 513915a5..58865a8a 100644 --- a/packages/components/src/MultiModalUtils.ts +++ b/packages/components/src/MultiModalUtils.ts @@ -6,15 +6,26 @@ import path from 'path' import { getUserHome } from './utils' import fs from 'fs' import { MessageContent } from '@langchain/core/dist/messages' +import { FlowiseChatOpenAI } from '../nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI' -export const processSpeechToText = async (nodeData: INodeData, input: string, options: ICommonObject) => { +export const injectChainNodeData = (nodeData: INodeData, options: ICommonObject) => { + let model = nodeData.inputs?.model as BaseChatModel + + if (model instanceof FlowiseChatOpenAI) { + // TODO: this should not be static, need to figure out how to pass the nodeData and options to the invoke method + FlowiseChatOpenAI.chainNodeOptions = options + FlowiseChatOpenAI.chainNodeData = nodeData + } +} + +export const checkSpeechToText = async (nodeData: INodeData, options: ICommonObject) => { const MODEL_NAME = 'whisper-1' - + let input = undefined let model = nodeData.inputs?.model as BaseChatModel if (model instanceof ChatOpenAI && (model as any).multiModal) { const multiModalConfig = (model as any).multiModal if (options?.uploads) { - if (options.uploads.length === 1 && input.length === 0 && options.uploads[0].mime === 'audio/webm') { + if (options.uploads.length === 1 && options.uploads[0].mime === 'audio/webm') { const upload = options.uploads[0] //special case, text input is empty, but we have an upload (recorded audio) if (multiModalConfig.allowSpeechToText) { diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index da1057a9..e7816311 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -474,7 +474,6 @@ export class App { const allowances: IUploadFileSizeAndTypes[] = [] let allowSpeechToText = false let allowImageUploads = false - let allowAudioUploads = false flowObj.nodes.forEach((node: IReactFlowNode) => { if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) { logger.debug(`[server]: Found Eligible Node ${node.data.type}, Allowing Uploads.`) @@ -484,18 +483,11 @@ export class App { node.data.inputParams.map((param: INodeParams) => { if (param.name === 'allowImageUploads' && node.data.inputs?.['allowImageUploads'] && !allowImageUploads) { allowances.push({ - fileTypes: 'image/gif;image/jpeg;image/png;image/webp'.split(';'), + fileTypes: 'image/gif;image/jpeg;image/png;image/webp;'.split(';'), maxUploadSize: 5 }) allowImageUploads = true } - if (param.name === 'allowAudioUploads' && node.data.inputs?.['allowAudioUploads'] && !allowAudioUploads) { - allowances.push({ - fileTypes: 'audio/mpeg;audio/x-wav;audio/mp4'.split(';'), - maxUploadSize: 5 - }) - allowAudioUploads = true - } if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) { allowSpeechToText = true } diff --git a/packages/ui/src/views/chatmessage/ChatMessage.js b/packages/ui/src/views/chatmessage/ChatMessage.js index 006e2425..bea1acd1 100644 --- a/packages/ui/src/views/chatmessage/ChatMessage.js +++ b/packages/ui/src/views/chatmessage/ChatMessage.js @@ -23,7 +23,7 @@ import { Typography } from '@mui/material' import { useTheme } from '@mui/material/styles' -import { IconCircleDot, IconDownload, IconSend, IconMicrophone, IconPhotoPlus, IconSquare, IconTrash, IconX } from '@tabler/icons' +import { IconCircleDot, IconDownload, IconSend, IconMicrophone, IconPhotoPlus, IconTrash, IconX } from '@tabler/icons' import robotPNG from 'assets/images/robot.png' import userPNG from 'assets/images/account.png' import audioUploadSVG from 'assets/images/wave-sound.jpg' @@ -897,9 +897,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => { - +