Evaluations for Agentflows v2 & Assistants (#4589)

* New Feature: Evaluations for AgentFlow v2 * New Feature: Evaluations for Assistants and minor tweaks on other evaluations. * do not store messages during evaluation for agent flows. * common cost formatting * moving the category names to description (in create dialog) and adjusting the side drawer label * lint fixes * Enhancement: Add auto-refresh toggle for evaluations with 5-second interval and adjust grid item size for metrics display. * 1) chatflow types are stored in additional config 2) messages are now stored with type "Evaluations" 3) Message Dialog has a new Type in the ChatType Filter Dropdown 4) Chatflow badges on the view page have the right canvas URL 5) outdated API returns chatflow type along with the stale indicator. 6) UI - Flow Indicator Icons are shown in the Chatflows Used chips & side drawer * Refactor JWT error handling to return 401 status for expired refresh tokens. Update chat message ID assignment to remove UUID fallback. Enhance ViewMessagesDialog to set default chat type filters and implement a new method for determining chat type sources. Modify EvalsResultDialog to open links in a new tab and adjust icon sizes for better consistency. Clean up unused imports in EvaluationResultSideDrawer. * handling on Click for deleted flows and minor code cleanup * evals ui fix * Refactor evaluation service to improve error handling and data parsing. Update additionalConfig handling to default to an empty object if not present. Enhance type definitions for better clarity. Adjust MetricsItemCard to prevent overflow and improve layout consistency. --------- Co-authored-by: Henry <hzj94@hotmail.com>
2026-06-28 15:00:57 +03:00 · 2025-06-10 20:41:22 +05:30
parent f644c47251
commit e17994d8fe
13 changed files with 766 additions and 224 deletions
@@ -218,7 +218,7 @@ export const initializeJwtCookieMiddleware = async (app: express.Application, id
        if (!refreshToken) return res.sendStatus(401)

        jwt.verify(refreshToken, jwtRefreshSecret, async (err: any, payload: any) => {
-            if (err || !payload) return res.status(403).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
+            if (err || !payload) return res.status(401).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
            // @ts-ignore
            const loggedInUser = req.user as LoggedInUser
            let isSSO = false
@@ -227,16 +227,16 @@ export const initializeJwtCookieMiddleware = async (app: express.Application, id
                try {
                    newTokenResponse = await identityManager.getRefreshToken(loggedInUser.ssoProvider, loggedInUser.ssoRefreshToken)
                    if (newTokenResponse.error) {
-                        return res.status(403).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
+                        return res.status(401).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
                    }
                    isSSO = true
                } catch (error) {
-                    return res.status(403).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
+                    return res.status(401).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
                }
            }
            const meta = decryptToken(payload.meta)
            if (!meta) {
-                return res.status(403).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
+                return res.status(401).json({ message: ErrorMessage.REFRESH_TOKEN_EXPIRED })
            }
            if (isSSO) {
                loggedInUser.ssoToken = newTokenResponse.access_token
@@ -18,39 +18,29 @@ export const calculateCost = (metricsArray: ICommonObject[]) => {
        let completionTokensCost: string = '0'
        let totalTokensCost = '0'
        if (metric.cost_values) {
-            const costValues = metric.cost_values
+            let costValues: any = {}
+            if (metric.cost_values?.cost_values) {
+                costValues = metric.cost_values.cost_values
+            } else {
+                costValues = metric.cost_values
+            }
+
            if (costValues.total_price > 0) {
                let cost = costValues.total_cost * (totalTokens / 1000)
-                if (cost < 0.01) {
-                    totalTokensCost = '$ <0.01'
-                } else {
-                    totalTokensCost = '$ ' + cost.toFixed(fractionDigits)
-                }
+                totalTokensCost = formatCost(cost)
            } else {
                let totalCost = 0
                if (promptTokens) {
                    const cost = costValues.input_cost * (promptTokens / 1000)
                    totalCost += cost
-                    if (cost < 0.01) {
-                        promptTokensCost = '$ <0.01'
-                    } else {
-                        promptTokensCost = '$ ' + cost.toFixed(fractionDigits)
-                    }
+                    promptTokensCost = formatCost(cost)
                }
                if (completionTokens) {
                    const cost = costValues.output_cost * (completionTokens / 1000)
                    totalCost += cost
-                    if (cost < 0.01) {
-                        completionTokensCost = '$ <0.01'
-                    } else {
-                        completionTokensCost = '$ ' + cost.toFixed(fractionDigits)
-                    }
-                }
-                if (totalCost < 0.01) {
-                    totalTokensCost = '$ <0.01'
-                } else {
-                    totalTokensCost = '$ ' + totalCost.toFixed(fractionDigits)
+                    completionTokensCost = formatCost(cost)
                }
+                totalTokensCost = formatCost(totalCost)
            }
        }
        metric['totalCost'] = totalTokensCost
@@ -58,3 +48,10 @@ export const calculateCost = (metricsArray: ICommonObject[]) => {
        metric['completionCost'] = completionTokensCost
    }
 }
+
+export const formatCost = (cost: number) => { // Renders a numeric cost as a '$ '-prefixed display string
+    if (cost == 0) { // an exact zero is shown as '$ 0' instead of falling into the '<0.01' bucket below
+        return '$ 0'
+    }
+    return cost < 0.01 ? '$ <0.01' : '$ ' + cost.toFixed(fractionDigits) // any other value below 0.01 collapses to '$ <0.01'; fractionDigits is declared outside this hunk
+}
@@ -15,10 +15,11 @@ import { getAppVersion } from '../../utils'
 import { In } from 'typeorm'
 import { getWorkspaceSearchOptions } from '../../enterprise/utils/ControllerServiceUtils'
 import { v4 as uuidv4 } from 'uuid'
-import { calculateCost } from './CostCalculator'
+import { calculateCost, formatCost } from './CostCalculator'
 import { runAdditionalEvaluators } from './EvaluatorRunner'
 import evaluatorsService from '../evaluator'
 import { LLMEvaluationRunner } from './LLMEvaluationRunner'
+import { Assistant } from '../../database/entities/Assistant'

 const runAgain = async (id: string, baseURL: string, orgId: string) => {
    try {
@@ -27,7 +28,7 @@ const runAgain = async (id: string, baseURL: string, orgId: string) => {
            id: id
        })
        if (!evaluation) throw new Error(`Evaluation ${id} not found`)
-        const additionalConfig: any = JSON.parse(evaluation.additionalConfig)
+        const additionalConfig = evaluation.additionalConfig ? JSON.parse(evaluation.additionalConfig) : {}
        const data: ICommonObject = {
            chatflowId: evaluation.chatflowId,
            chatflowName: evaluation.chatflowName,
@@ -35,7 +36,8 @@ const runAgain = async (id: string, baseURL: string, orgId: string) => {
            datasetId: evaluation.datasetId,
            evaluationType: evaluation.evaluationType,
            selectedSimpleEvaluators: JSON.stringify(additionalConfig.simpleEvaluators),
-            datasetAsOneConversation: additionalConfig.datasetAsOneConversation
+            datasetAsOneConversation: additionalConfig.datasetAsOneConversation,
+            chatflowType: JSON.stringify(additionalConfig.chatflowTypes ? additionalConfig.chatflowTypes : [])
        }
        data.name = evaluation.name
        data.workspaceId = evaluation.workspaceId
@@ -69,7 +71,8 @@ const createEvaluation = async (body: ICommonObject, baseURL: string, orgId: str
        const row = appServer.AppDataSource.getRepository(Evaluation).create(newEval)
        row.average_metrics = JSON.stringify({})

-        const additionalConfig: any = {
+        const additionalConfig: ICommonObject = {
+            chatflowTypes: body.chatflowType ? JSON.parse(body.chatflowType) : [],
            datasetAsOneConversation: body.datasetAsOneConversation,
            simpleEvaluators: body.selectedSimpleEvaluators.length > 0 ? JSON.parse(body.selectedSimpleEvaluators) : []
        }
@@ -152,7 +155,7 @@ const createEvaluation = async (body: ICommonObject, baseURL: string, orgId: str
        let evalMetrics = { passCount: 0, failCount: 0, errorCount: 0 }
        evalRunner
            .runEvaluations(data)
-            .then(async (result: any) => {
+            .then(async (result) => {
                let totalTime = 0
                // let us assume that the eval is successful
                let allRowsSuccessful = true
@@ -171,8 +174,48 @@ const createEvaluation = async (body: ICommonObject, baseURL: string, orgId: str
                            totalTime += parseFloat(evaluationRow.latency)
                            let metricsObjFromRun: ICommonObject = {}

+                            let nested_metrics = evaluationRow.nested_metrics
+
+                            let promptTokens = 0,
+                                completionTokens = 0,
+                                totalTokens = 0
+                            let inputCost = 0,
+                                outputCost = 0,
+                                totalCost = 0
+                            if (nested_metrics && nested_metrics.length > 0) {
+                                for (let i = 0; i < nested_metrics.length; i++) {
+                                    const nested_metric = nested_metrics[i]
+                                    if (nested_metric.model && nested_metric.promptTokens > 0) {
+                                        promptTokens += nested_metric.promptTokens
+                                        completionTokens += nested_metric.completionTokens
+                                        totalTokens += nested_metric.totalTokens
+
+                                        inputCost += nested_metric.cost_values.input_cost
+                                        outputCost += nested_metric.cost_values.output_cost
+                                        totalCost += nested_metric.cost_values.total_cost
+
+                                        nested_metric['totalCost'] = formatCost(nested_metric.cost_values.total_cost)
+                                        nested_metric['promptCost'] = formatCost(nested_metric.cost_values.input_cost)
+                                        nested_metric['completionCost'] = formatCost(nested_metric.cost_values.output_cost)
+                                    }
+                                }
+                                nested_metrics = nested_metrics.filter((metric: any) => {
+                                    return metric.model && metric.provider
+                                })
+                            }
                            const metrics = evaluationRow.metrics
                            if (metrics) {
+                                if (nested_metrics && nested_metrics.length > 0) {
+                                    metrics.push({
+                                        promptTokens: promptTokens,
+                                        completionTokens: completionTokens,
+                                        totalTokens: totalTokens,
+                                        totalCost: formatCost(totalCost),
+                                        promptCost: formatCost(inputCost),
+                                        completionCost: formatCost(outputCost)
+                                    })
+                                    metricsObjFromRun.nested_metrics = nested_metrics
+                                }
                                metrics.map((metric: any) => {
                                    if (metric) {
                                        const json = typeof metric === 'object' ? metric : JSON.parse(metric)
@@ -211,7 +254,7 @@ const createEvaluation = async (body: ICommonObject, baseURL: string, orgId: str
                        if (body.evaluationType === 'llm') {
                            resultRow.llmConfig = additionalConfig.llmConfig
                            resultRow.LLMEvaluators = body.selectedLLMEvaluators.length > 0 ? JSON.parse(body.selectedLLMEvaluators) : []
-                            const llmEvaluatorMap: any = []
+                            const llmEvaluatorMap: { evaluatorId: string; evaluator: any }[] = []
                            for (let i = 0; i < resultRow.LLMEvaluators.length; i++) {
                                const evaluatorId = resultRow.LLMEvaluators[i]
                                const evaluator = await evaluatorsService.getEvaluator(evaluatorId)
@@ -243,23 +286,27 @@ const createEvaluation = async (body: ICommonObject, baseURL: string, orgId: str
                    }
                    appServer.AppDataSource.getRepository(Evaluation)
                        .findOneBy({ id: newEvaluation.id })
-                        .then((evaluation: any) => {
-                            evaluation.status = allRowsSuccessful ? EvaluationStatus.COMPLETED : EvaluationStatus.ERROR
-                            evaluation.average_metrics = JSON.stringify({
-                                averageLatency: (totalTime / result.rows.length).toFixed(3),
-                                totalRuns: result.rows.length,
-                                ...evalMetrics,
-                                passPcnt: passPercent.toFixed(2)
-                            })
-                            appServer.AppDataSource.getRepository(Evaluation).save(evaluation)
+                        .then((evaluation) => {
+                            if (evaluation) {
+                                evaluation.status = allRowsSuccessful ? EvaluationStatus.COMPLETED : EvaluationStatus.ERROR
+                                evaluation.average_metrics = JSON.stringify({
+                                    averageLatency: (totalTime / result.rows.length).toFixed(3),
+                                    totalRuns: result.rows.length,
+                                    ...evalMetrics,
+                                    passPcnt: passPercent.toFixed(2)
+                                })
+                                appServer.AppDataSource.getRepository(Evaluation).save(evaluation)
+                            }
                        })
                } catch (error) {
                    //update the evaluation with status as error
                    appServer.AppDataSource.getRepository(Evaluation)
                        .findOneBy({ id: newEvaluation.id })
-                        .then((evaluation: any) => {
-                            evaluation.status = EvaluationStatus.ERROR
-                            appServer.AppDataSource.getRepository(Evaluation).save(evaluation)
+                        .then((evaluation) => {
+                            if (evaluation) {
+                                evaluation.status = EvaluationStatus.ERROR
+                                appServer.AppDataSource.getRepository(Evaluation).save(evaluation)
+                            }
                        })
                }
            })
@@ -268,12 +315,14 @@ const createEvaluation = async (body: ICommonObject, baseURL: string, orgId: str
                console.error('Error running evaluations:', getErrorMessage(error))
                appServer.AppDataSource.getRepository(Evaluation)
                    .findOneBy({ id: newEvaluation.id })
-                    .then((evaluation: any) => {
-                        evaluation.status = EvaluationStatus.ERROR
-                        evaluation.average_metrics = JSON.stringify({
-                            error: getErrorMessage(error)
-                        })
-                        appServer.AppDataSource.getRepository(Evaluation).save(evaluation)
+                    .then((evaluation) => {
+                        if (evaluation) {
+                            evaluation.status = EvaluationStatus.ERROR
+                            evaluation.average_metrics = JSON.stringify({
+                                error: getErrorMessage(error)
+                            })
+                            appServer.AppDataSource.getRepository(Evaluation).save(evaluation)
+                        }
                    })
                    .catch((dbError) => {
                        console.error('Error updating evaluation status:', getErrorMessage(dbError))
@@ -378,18 +427,31 @@ const isOutdated = async (id: string) => {
                returnObj.dataset = dataset
            }
        } else {
-            returnObj.errors.push(`Dataset ${evaluation.datasetName} not found`)
+            returnObj.errors.push({
+                message: `Dataset ${evaluation.datasetName} not found`,
+                id: evaluation.datasetId
+            })
            isOutdated = true
        }
-        const chatflows = JSON.parse(evaluation.chatflowId)
-        const chatflowNames = JSON.parse(evaluation.chatflowName)
-
-        for (let i = 0; i < chatflows.length; i++) {
+        const chatflowIds = evaluation.chatflowId ? JSON.parse(evaluation.chatflowId) : []
+        const chatflowNames = evaluation.chatflowName ? JSON.parse(evaluation.chatflowName) : []
+        const chatflowTypes = evaluation.additionalConfig ? JSON.parse(evaluation.additionalConfig).chatflowTypes : []
+        for (let i = 0; i < chatflowIds.length; i++) {
+            // check for backward compatibility, as previous versions did not store the types in additionalConfig
+            if (chatflowTypes && chatflowTypes.length >= 0) {
+                if (chatflowTypes[i] === 'Custom Assistant') {
+                    // if the chatflow type is custom assistant, then we should NOT check in the chatflows table
+                    continue
+                }
+            }
            const chatflow = await appServer.AppDataSource.getRepository(ChatFlow).findOneBy({
-                id: chatflows[i]
+                id: chatflowIds[i]
            })
            if (!chatflow) {
-                returnObj.errors.push(`Chatflow ${chatflowNames[i]} not found`)
+                returnObj.errors.push({
+                    message: `Chatflow ${chatflowNames[i]} not found`,
+                    id: chatflowIds[i]
+                })
                isOutdated = true
            } else {
                const chatflowLastUpdated = chatflow.updatedDate.getTime()
@@ -397,12 +459,42 @@ const isOutdated = async (id: string) => {
                    isOutdated = true
                    returnObj.chatflows.push({
                        chatflowName: chatflowNames[i],
-                        chatflowId: chatflows[i],
+                        chatflowId: chatflowIds[i],
+                        chatflowType: chatflow.type === 'AGENTFLOW' ? 'Agentflow v2' : 'Chatflow',
                        isOutdated: true
                    })
                }
            }
        }
+        if (chatflowTypes && chatflowTypes.length > 0) {
+            for (let i = 0; i < chatflowIds.length; i++) {
+                if (chatflowTypes[i] !== 'Custom Assistant') {
+                    // if the chatflow type is NOT custom assistant, then bail out for this item
+                    continue
+                }
+                const assistant = await appServer.AppDataSource.getRepository(Assistant).findOneBy({
+                    id: chatflowIds[i]
+                })
+                if (!assistant) {
+                    returnObj.errors.push({
+                        message: `Custom Assistant ${chatflowNames[i]} not found`,
+                        id: chatflowIds[i]
+                    })
+                    isOutdated = true
+                } else {
+                    const chatflowLastUpdated = assistant.updatedDate.getTime()
+                    if (chatflowLastUpdated > evaluationRunDate) {
+                        isOutdated = true
+                        returnObj.chatflows.push({
+                            chatflowName: chatflowNames[i],
+                            chatflowId: chatflowIds[i],
+                            chatflowType: 'Custom Assistant',
+                            isOutdated: true
+                        })
+                    }
+                }
+            }
+        }
        returnObj.isOutdated = isOutdated
        return returnObj
    } catch (error) {
@@ -424,7 +516,7 @@ const getEvaluation = async (id: string) => {
            where: { evaluationId: id }
        })
        const versions = (await getVersions(id)).versions
-        const versionNo = versions.findIndex((version: any) => version.id === id) + 1
+        const versionNo = versions.findIndex((version) => version.id === id) + 1
        return {
            ...evaluation,
            versionCount: versionCount,
@@ -451,7 +543,7 @@ const getVersions = async (id: string) => {
                runDate: 'ASC'
            }
        })
-        const returnResults: any[] = []
+        const returnResults: { id: string; runDate: Date; version: number }[] = []
        versions.map((version, index) => {
            returnResults.push({
                id: version.id,
@@ -1805,7 +1805,7 @@ export const executeAgentFlow = async ({
        role: 'userMessage',
        content: finalUserInput,
        chatflowid,
-        chatType: isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
+        chatType: evaluationRunId ? ChatType.EVALUATION : isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
        chatId,
        sessionId,
        createdDate: userMessageDateTime,
@@ -1820,7 +1820,7 @@ export const executeAgentFlow = async ({
        role: 'apiMessage',
        content: content,
        chatflowid,
-        chatType: isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
+        chatType: evaluationRunId ? ChatType.EVALUATION : isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
        chatId,
        sessionId,
        executionId: newExecution.id
@@ -1856,7 +1856,7 @@ export const executeAgentFlow = async ({
            version: await getAppVersion(),
            chatflowId: chatflowid,
            chatId,
-            type: isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
+            type: evaluationRunId ? ChatType.EVALUATION : isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
            flowGraph: getTelemetryFlowObj(nodes, edges)
        },
        orgId
@@ -551,7 +551,7 @@ export const executeFlow = async ({
                role: 'userMessage',
                content: incomingInput.question,
                chatflowid: agentflow.id,
-                chatType: isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
+                chatType: isEvaluation ? ChatType.EVALUATION : isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
                chatId,
                memoryType,
                sessionId,
@@ -566,7 +566,7 @@ export const executeFlow = async ({
                role: 'apiMessage',
                content: finalResult,
                chatflowid: agentflow.id,
-                chatType: isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
+                chatType: isEvaluation ? ChatType.EVALUATION : isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
                chatId,
                memoryType,
                sessionId
@@ -598,7 +598,7 @@ export const executeFlow = async ({
                    version: await getAppVersion(),
                    agentflowId: agentflow.id,
                    chatId,
-                    type: isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
+                    type: isEvaluation ? ChatType.EVALUATION : isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
                    flowGraph: getTelemetryFlowObj(nodes, edges)
                },
                orgId
@@ -807,7 +807,7 @@ export const executeFlow = async ({
                version: await getAppVersion(),
                chatflowId: chatflowid,
                chatId,
-                type: isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
+                type: isEvaluation ? ChatType.EVALUATION : isInternal ? ChatType.INTERNAL : ChatType.EXTERNAL,
                flowGraph: getTelemetryFlowObj(nodes, edges)
            },
            orgId
@@ -905,17 +905,17 @@ export const utilBuildChatflow = async (req: Request, isInternal: boolean = fals
    const isTool = req.get('flowise-tool') === 'true'
    const isEvaluation: boolean = req.headers['X-Flowise-Evaluation'] || req.body.evaluation
    let evaluationRunId = ''
-    if (isEvaluation) {
-        evaluationRunId = req.body.evaluationRunId
-        if (evaluationRunId) {
-            const newEval = {
-                evaluation: {
-                    status: true,
-                    evaluationRunId
-                }
+    evaluationRunId = req.body.evaluationRunId
+    if (isEvaluation && chatflow.type !== 'AGENTFLOW' && req.body.evaluationRunId) {
+        // this is needed for the collection of token metrics for non-agent flows,
+        // for agentflows the execution trace has the info needed
+        const newEval = {
+            evaluation: {
+                status: true,
+                evaluationRunId
            }
-            chatflow.analytic = JSON.stringify(newEval)
        }
+        chatflow.analytic = JSON.stringify(newEval)
    }

    try {