Files
Vinod Kiran e17994d8fe Evaluations for Agentflows v2 & Assistants (#4589)
* New Feature: Evaluations for AgentFlow v2

* New Feature: Evaluations for Assistants and minor tweaks on other evaluations.

* do not store messages during evaluation for agent flows.

* common cost formatting

* moving the category names to description (in create dialog) and adjusting the side drawer label

* lint fixes

* Enhancement: Add auto-refresh toggle for evaluations with 5-second interval and adjust grid item size for metrics display.

* 1) chatflow types are stored in additional config
2) messages are now stored with type "Evaluations"
3) Message Dialog has a new Type in the ChatType Filter Dropdown
4) Chatflow badges on the view page, have the right canvas URL
5) outdated API returns chatflow type along with the stale indicator.
6) UI - Flow Indicator Icons are shown in the Chatflows Used chips & side drawer

* Refactor JWT error handling to return 401 status for expired refresh tokens. Update chat message ID assignment to remove UUID fallback. Enhance ViewMessagesDialog to set default chat type filters and implement a new method for determining chat type sources. Modify EvalsResultDialog to open links in a new tab and adjust icon sizes for better consistency. Clean up unused imports in EvaluationResultSideDrawer.

* handling on Click for deleted flows and minor code cleanup

* evals ui fix

* Refactor evaluation service to improve error handling and data parsing. Update additionalConfig handling to default to an empty object if not present. Enhance type definitions for better clarity. Adjust MetricsItemCard to prevent overflow and improve layout consistency.

---------

Co-authored-by: Henry <hzj94@hotmail.com>
2025-06-10 16:11:22 +01:00

227 lines
9.7 KiB
TypeScript

import axios from 'axios'
import { v4 as uuidv4 } from 'uuid'
import { ICommonObject } from '../src'
import { getModelConfigByModelName, MODEL_TYPE } from '../src/modelLoader'
/**
 * Drives evaluation runs: every dataset row is posted to the prediction endpoint of each selected
 * flow, and the output, latency and token/cost metrics of every call are recorded on the row.
 */
export class EvaluationRunner {
    /** Metric entries accumulated per id through `addMetrics` until `getAndDeleteMetrics` drains them. */
    static metrics = new Map<string, string[]>()

    /**
     * Looks up the model configuration for a provider/model pair, trying chat models first and
     * falling back to LLMs.
     *
     * @returns the configuration's `cost_values` when it has one, otherwise `{ cost_values: config }`;
     * `undefined` when neither lookup finds the model.
     */
    static getCostMetrics = async (selectedProvider: string, selectedModel: string) => {
        for (const modelType of [MODEL_TYPE.CHAT, MODEL_TYPE.LLM]) {
            const modelConfig = await getModelConfigByModelName(modelType, selectedProvider, selectedModel)
            if (modelConfig) {
                return modelConfig['cost_values'] ? modelConfig.cost_values : { cost_values: modelConfig }
            }
        }
        return undefined
    }

    /**
     * Returns the metric entries stored under `id` and removes them from the static map.
     *
     * When the entries name both a provider and a model (the last non-empty value of each wins),
     * one extra JSON entry `{ cost_values: <getCostMetrics result> }` is appended. The cost lookup
     * is best-effort: any failure leaves the stored entries untouched.
     *
     * @returns the (possibly extended) entries, or `undefined` when nothing was stored for `id`.
     */
    static async getAndDeleteMetrics(id: string) {
        const val = EvaluationRunner.metrics.get(id)
        if (val) {
            try {
                let selectedModel: string | undefined
                let selectedProvider: string | undefined
                for (const metric of val) {
                    // A single malformed entry must not prevent the lookup for the remaining ones
                    const parsed = EvaluationRunner.parseMetric(metric)
                    if (parsed?.model) selectedModel = parsed.model
                    if (parsed?.provider) selectedProvider = parsed.provider
                }
                if (selectedProvider && selectedModel) {
                    const modelConfig = await EvaluationRunner.getCostMetrics(selectedProvider, selectedModel)
                    if (modelConfig) {
                        val.push(JSON.stringify({ cost_values: modelConfig }))
                    }
                }
            } catch (error) {
                // Cost enrichment is optional; the raw metrics are still returned below
            }
        }
        EvaluationRunner.metrics.delete(id)
        return val
    }

    /** Appends `metric` to the entries stored under `id`, creating the list on first use. */
    static addMetrics(id: string, metric: string) {
        const existing = EvaluationRunner.metrics.get(id)
        if (existing) {
            existing.push(metric)
        } else {
            EvaluationRunner.metrics.set(id, [metric])
        }
    }

    /** Decodes one stored metric entry (JSON text or an already-parsed object); `undefined` if unusable. */
    private static parseMetric(metric: unknown): { model?: string; provider?: string } | undefined {
        try {
            const parsed = typeof metric === 'string' ? JSON.parse(metric) : metric
            return parsed !== null && typeof parsed === 'object' ? parsed : undefined
        } catch (error) {
            return undefined
        }
    }

    /**
     * Builds one token/cost record per entry of a prediction response's `agentFlowExecutedData`.
     * Costs are computed as `unit cost * tokens / 1000`; a failing cost lookup only omits
     * `cost_values` for that node.
     */
    private static async collectAgentFlowMetrics(executedData: unknown): Promise<any[]> {
        if (!Array.isArray(executedData)) return []
        const collected: any[] = []
        for (const node of executedData) {
            const usage = node?.data?.output?.usageMetadata
            const input_tokens = usage?.input_tokens || 0
            const output_tokens = usage?.output_tokens || 0
            const total_tokens = usage?.total_tokens || input_tokens + output_tokens
            const nodeInput = node?.data?.input
            const metrics: any = {
                promptTokens: input_tokens,
                completionTokens: output_tokens,
                totalTokens: total_tokens,
                provider: nodeInput?.llmModelConfig?.llmModel || nodeInput?.agentModelConfig?.agentModel,
                model: nodeInput?.llmModelConfig?.modelName || nodeInput?.agentModelConfig?.modelName,
                nodeLabel: node?.nodeLabel,
                nodeId: node?.nodeId
            }
            if (metrics.provider && metrics.model) {
                try {
                    const lookup: any = await EvaluationRunner.getCostMetrics(metrics.provider, metrics.model)
                    if (lookup) {
                        // getCostMetrics yields either { cost_values: {...} } or the cost values themselves
                        const unitCosts = lookup.cost_values ?? lookup
                        const input_cost = (unitCosts.input_cost || 0) * (input_tokens / 1000)
                        const output_cost = (unitCosts.output_cost || 0) * (output_tokens / 1000)
                        metrics.cost_values = { input_cost, output_cost, total_cost: input_cost + output_cost }
                    }
                } catch (error) {
                    // A failed cost lookup must not turn a successful prediction into an error
                }
            }
            collected.push(metrics)
        }
        return collected
    }

    /**
     * Derives the message stored in `runData.error` from a failed request: the response body's
     * `message`, else the error's own `message`, else 'Unknown error'. For string messages that
     * contain a dash, everything up to and including the first dash is replaced by 'Error: '.
     */
    private static describeError(error: any): any {
        const message = error?.response?.data?.message || error?.message || 'Unknown error'
        if (typeof message !== 'string') return message
        const dashIndex = message.indexOf('-')
        return dashIndex > -1 ? 'Error: ' + message.slice(dashIndex + 1).trim() : message
    }

    baseURL = ''

    /** @param baseURL origin that is prefixed to `/api/v1/prediction/<chatflowId>` */
    constructor(baseURL: string) {
        this.baseURL = baseURL
    }

    /** Returns the API key registered for `chatflowId` in `apiKeys`, or '' when there is none. */
    getChatflowApiKey(chatflowId: string, apiKeys: { chatflowId: string; apiKey: string }[] = []) {
        return apiKeys.find((item) => item.chatflowId === chatflowId)?.apiKey || ''
    }

    /**
     * Runs the whole dataset against every flow listed in `data.chatflowId` (a JSON-encoded list
     * of ids), one flow after the other.
     *
     * @returns an object with `evaluationId`, `runDate` and one `rows` entry per dataset row; each
     * row's `evaluations` array receives one result per flow.
     */
    public async runEvaluations(data: ICommonObject) {
        const chatflowIds = JSON.parse(data.chatflowId)
        const returnData: ICommonObject = {}
        returnData.evaluationId = data.evaluationId
        returnData.runDate = new Date()
        returnData.rows = data.dataset.rows.map((row: any) => ({
            input: row.input,
            expectedOutput: row.output,
            itemNo: row.sequenceNo,
            evaluations: [],
            status: 'pending'
        }))
        for (let flowIndex = 0; flowIndex < chatflowIds.length; flowIndex++) {
            const chatflowId = chatflowIds[flowIndex]
            await this.evaluateChatflow(chatflowId, this.getChatflowApiKey(chatflowId, data.apiKeys), data, returnData)
        }
        return returnData
    }

    /**
     * Posts every dataset row to the prediction endpoint of `chatflowId` and appends the outcome
     * to `returnData.rows[rowIndex].evaluations`. Request failures are recorded on the run
     * (`status: 'error'`) instead of being thrown.
     *
     * @param chatflowId id of the flow to call
     * @param apiKey sent as a Bearer token when non-empty
     * @param data evaluation request; reads `dataset.rows` and the optional `sessionId`
     * @param returnData accumulator whose `rows` align index-wise with `data.dataset.rows`
     * @returns the same `returnData` object
     */
    async evaluateChatflow(chatflowId: string, apiKey: string, data: any, returnData: any) {
        for (let rowIndex = 0; rowIndex < data.dataset.rows.length; rowIndex++) {
            const item = data.dataset.rows[rowIndex]
            const uuid = uuidv4()
            const headers: any = {
                'X-Request-ID': uuid,
                'X-Flowise-Evaluation': 'true'
            }
            if (apiKey) {
                headers['Authorization'] = `Bearer ${apiKey}`
            }
            const axiosConfig = { headers }
            const startTime = performance.now()
            const runData: any = { chatflowId, startTime }
            const postData: any = { question: item.input, evaluationRunId: uuid, evaluation: true }
            if (data.sessionId) {
                postData.overrideConfig = { sessionId: data.sessionId }
            }
            try {
                const response = await axios.post(`${this.baseURL}/api/v1/prediction/${chatflowId}`, postData, axiosConfig)
                // Stop the clock before any cost lookups so they are not reported as API latency
                const timeTaken = (performance.now() - startTime).toFixed(2)
                const agentFlowMetrics = await EvaluationRunner.collectAgentFlowMetrics(response?.data?.agentFlowExecutedData)

                const reported = response?.data?.metrics
                if (reported) {
                    // Copy instead of pushing onto the response; tolerate a single non-array metric
                    runData.metrics = [...(Array.isArray(reported) ? reported : [reported]), { apiLatency: timeTaken }]
                } else {
                    runData.metrics = [{ apiLatency: timeTaken }]
                }
                if (agentFlowMetrics.length > 0) {
                    runData.nested_metrics = agentFlowMetrics
                }
                runData.status = 'complete'
                let resultText = ''
                if (response.data.text) resultText = response.data.text
                else if (response.data.json) resultText = '```json\n' + JSON.stringify(response.data.json, null, 2)
                else resultText = JSON.stringify(response.data, null, 2)
                runData.actualOutput = resultText
                runData.latency = timeTaken
                runData.error = ''
            } catch (error: any) {
                const timeTaken = (performance.now() - startTime).toFixed(2)
                runData.status = 'error'
                runData.actualOutput = ''
                runData.error = EvaluationRunner.describeError(error)
                runData.metrics = [{ apiLatency: timeTaken }]
                runData.latency = timeTaken
            }
            runData.uuid = uuid
            returnData.rows[rowIndex].evaluations.push(runData)
        }
        return returnData
    }
}