From ea329113be24c64d5e9dbaf98140ea61e2cd9ad4 Mon Sep 17 00:00:00 2001 From: Rylan Cai <67412196+cy948@users.noreply.github.com> Date: Tue, 10 Mar 2026 09:53:26 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(eval):=20add=20external=20scor?= =?UTF-8?q?ing=20mode=20(#12729)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * wip: add llm relevant & BrowseComp * wip: add widesearch desc * wip: dsqa, hle, widesearch * wip: add dsqa * wip: add awaiting eval status for runs * wip: add awaiting status for run * wip: adjust hle-verified * :bug: fix: browsecomp topics * :memo: docs: add annotations * wip: add awaiting status for pass@k * wip: add complete status * wip: update thread dots * wip: update run status page * wip: remove useless impl * wip: update prompt * :sparkles: feat: add external eval routes * wip: add eval cli * :bug: fix: support authorization in non-browser environments * wip: pass tests * :recycle: refactor: remove tests * :recycle: refactor: mo camel case --- apps/cli/src/commands/eval.test.ts | 285 ++++++++++ apps/cli/src/commands/eval.ts | 326 +++++++++++ apps/cli/src/index.ts | 2 + locales/en-US/eval.json | 8 +- locales/zh-CN/eval.json | 5 + .../database/src/models/agentEval/dataset.ts | 2 + packages/database/src/models/agentEval/run.ts | 2 +- packages/database/src/schemas/agentEvals.ts | 5 +- packages/eval-rubric/src/evaluate.ts | 12 +- packages/eval-rubric/src/matchers/external.ts | 9 + packages/eval-rubric/src/matchers/index.ts | 19 +- packages/eval-rubric/src/matchers/llmEq.ts | 89 +++ .../eval-rubric/src/matchers/llmRubric.ts | 4 + packages/types/src/eval/agentEval.ts | 9 +- packages/types/src/eval/agentEvalRun.ts | 1 + packages/types/src/eval/rubric.ts | 2 + .../agent-eval-run/finalize-run/route.ts | 13 +- src/locales/default/eval.ts | 16 +- .../datasets/[datasetId]/index.tsx | 1 + .../features/DatasetsTab/DatasetCard.tsx | 1 + .../features/DatasetsTab/TestCaseTable.tsx | 18 +- 
.../features/CaseResultsTable/index.tsx | 33 +- .../[runId]/features/PendingState/index.tsx | 4 +- .../runs/[runId]/features/RunHeader/index.tsx | 27 +- .../[benchmarkId]/runs/[runId]/index.tsx | 4 +- .../(main)/eval/config/datasetPresets.ts | 143 +++++ .../features/DatasetCreateModal/index.tsx | 1 + .../eval/features/DatasetEditModal/index.tsx | 24 +- .../DatasetImportModal/MappingStep.tsx | 12 + .../(main)/eval/features/StatusBadge.tsx | 3 +- src/server/routers/lambda/agentEval.ts | 15 +- .../routers/lambda/agentEvalExternal.ts | 514 ++++++++++++++++++ src/server/routers/lambda/index.ts | 2 + src/server/services/agentEvalRun/index.ts | 84 ++- 34 files changed, 1655 insertions(+), 40 deletions(-) create mode 100644 apps/cli/src/commands/eval.test.ts create mode 100644 apps/cli/src/commands/eval.ts create mode 100644 packages/eval-rubric/src/matchers/external.ts create mode 100644 packages/eval-rubric/src/matchers/llmEq.ts create mode 100644 src/server/routers/lambda/agentEvalExternal.ts diff --git a/apps/cli/src/commands/eval.test.ts b/apps/cli/src/commands/eval.test.ts new file mode 100644 index 0000000000..f402567ba0 --- /dev/null +++ b/apps/cli/src/commands/eval.test.ts @@ -0,0 +1,285 @@ +import { Command } from 'commander'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +const { mockTrpcClient } = vi.hoisted(() => ({ + mockTrpcClient: { + agentEvalExternal: { + datasetGet: { query: vi.fn() }, + messagesList: { query: vi.fn() }, + runGet: { query: vi.fn() }, + runSetStatus: { mutate: vi.fn() }, + runTopicReportResult: { mutate: vi.fn() }, + runTopicsList: { query: vi.fn() }, + testCasesCount: { query: vi.fn() }, + threadsList: { query: vi.fn() }, + }, + }, +})); + +const { getTrpcClientMock } = vi.hoisted(() => ({ + getTrpcClientMock: vi.fn(), +})); + +vi.mock('../api/client', () => ({ + getTrpcClient: getTrpcClientMock, +})); + +vi.mock('../utils/logger', () => ({ + log: { + debug: vi.fn(), + error: vi.fn(), + info: vi.fn(), + 
warn: vi.fn(), + }, + setVerbose: vi.fn(), +})); + +// eslint-disable-next-line import-x/first +import { log } from '../utils/logger'; +// eslint-disable-next-line import-x/first +import { registerEvalCommand } from './eval'; + +describe('eval command', () => { + let exitSpy: ReturnType; + let logSpy: ReturnType; + + beforeEach(() => { + getTrpcClientMock.mockResolvedValue(mockTrpcClient); + exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any); + logSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + for (const method of Object.values(mockTrpcClient.agentEvalExternal)) { + for (const fn of Object.values(method)) { + (fn as ReturnType).mockReset(); + } + } + }); + + afterEach(() => { + exitSpy.mockRestore(); + logSpy.mockRestore(); + vi.clearAllMocks(); + }); + + const createProgram = () => { + const program = new Command(); + program.exitOverride(); + registerEvalCommand(program); + return program; + }; + + it('should call runGet and output json envelope', async () => { + mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({ + config: { k: 1 }, + datasetId: 'dataset-1', + id: 'run-1', + }); + + const program = createProgram(); + await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']); + + expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' }); + + const payload = JSON.parse(logSpy.mock.calls[0][0]); + expect(payload).toEqual({ + data: { + config: { k: 1 }, + datasetId: 'dataset-1', + id: 'run-1', + }, + error: null, + ok: true, + version: 'v1', + }); + }); + + it('should call datasetGet and output json envelope', async () => { + mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({ + id: 'dataset-1', + metadata: { preset: 'deepsearchqa' }, + }); + + const program = createProgram(); + await program.parseAsync([ + 'node', + 'test', + 'eval', + 'dataset', + 'get', + '--dataset-id', + 'dataset-1', + '--json', + ]); + + 
expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({ + datasetId: 'dataset-1', + }); + }); + + it('should pass onlyExternal to runTopicsList', async () => { + mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]); + + const program = createProgram(); + await program.parseAsync([ + 'node', + 'test', + 'eval', + 'run-topics', + 'list', + '--run-id', + 'run-1', + '--only-external', + '--json', + ]); + + expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({ + onlyExternal: true, + runId: 'run-1', + }); + }); + + it('should pass topicId and threadId to messagesList', async () => { + mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]); + + const program = createProgram(); + await program.parseAsync([ + 'node', + 'test', + 'eval', + 'messages', + 'list', + '--topic-id', + 'topic-1', + '--thread-id', + 'thread-1', + '--json', + ]); + + expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({ + threadId: 'thread-1', + topicId: 'topic-1', + }); + }); + + it('should parse and report run-topic result', async () => { + mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({ + success: true, + }); + + const program = createProgram(); + await program.parseAsync([ + 'node', + 'test', + 'eval', + 'run-topic', + 'report-result', + '--run-id', + 'run-1', + '--topic-id', + 'topic-1', + '--thread-id', + 'thread-1', + '--score', + '0.91', + '--correct', + 'true', + '--result-json', + '{"grade":"A"}', + '--json', + ]); + + expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({ + correct: true, + result: { grade: 'A' }, + runId: 'run-1', + score: 0.91, + threadId: 'thread-1', + topicId: 'topic-1', + }); + }); + + it('should update run status', async () => { + mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({ + runId: 'run-1', + status: 'completed', + success: true, + }); + + const 
program = createProgram(); + await program.parseAsync([ + 'node', + 'test', + 'eval', + 'run', + 'set-status', + '--run-id', + 'run-1', + '--status', + 'completed', + ]); + + expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({ + runId: 'run-1', + status: 'completed', + }); + expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to')); + }); + + it('should output json error envelope when command fails', async () => { + const error = Object.assign(new Error('Run not found'), { + data: { code: 'NOT_FOUND' }, + }); + mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error); + + const program = createProgram(); + await program.parseAsync([ + 'node', + 'test', + 'eval', + 'run', + 'get', + '--run-id', + 'run-404', + '--json', + ]); + + const payload = JSON.parse(logSpy.mock.calls[0][0]); + expect(payload).toEqual({ + data: null, + error: { code: 'NOT_FOUND', message: 'Run not found' }, + ok: false, + version: 'v1', + }); + expect(exitSpy).toHaveBeenCalledWith(1); + }); + + it('should query test case count', async () => { + mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 }); + + const program = createProgram(); + await program.parseAsync([ + 'node', + 'test', + 'eval', + 'test-cases', + 'count', + '--dataset-id', + 'dataset-1', + '--json', + ]); + + expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({ + datasetId: 'dataset-1', + }); + }); + + it('should log plain error without --json', async () => { + mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom')); + + const program = createProgram(); + await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']); + + expect(log.error).toHaveBeenCalledWith('boom'); + expect(exitSpy).toHaveBeenCalledWith(1); + }); +}); diff --git a/apps/cli/src/commands/eval.ts b/apps/cli/src/commands/eval.ts new file mode 100644 index 
0000000000..9ef6b2a4a6 --- /dev/null +++ b/apps/cli/src/commands/eval.ts @@ -0,0 +1,326 @@ +import type { Command } from 'commander'; +import { InvalidArgumentError } from 'commander'; +import pc from 'picocolors'; + +import { getTrpcClient } from '../api/client'; +import { log } from '../utils/logger'; + +const JSON_VERSION = 'v1' as const; + +interface JsonError { + code?: string; + message: string; +} + +interface JsonEnvelope { + data: T | null; + error: JsonError | null; + ok: boolean; + version: typeof JSON_VERSION; +} + +interface JsonOption { + json?: boolean; +} + +interface RunGetOptions extends JsonOption { + runId: string; +} + +interface RunSetStatusOptions extends JsonOption { + runId: string; + status: 'completed' | 'external'; +} + +interface DatasetGetOptions extends JsonOption { + datasetId: string; +} + +interface RunTopicsListOptions extends JsonOption { + onlyExternal?: boolean; + runId: string; +} + +interface ThreadsListOptions extends JsonOption { + topicId: string; +} + +interface MessagesListOptions extends JsonOption { + threadId?: string; + topicId: string; +} + +interface TestCasesCountOptions extends JsonOption { + datasetId: string; +} + +interface RunTopicReportResultOptions extends JsonOption { + correct: boolean; + resultJson: Record; + runId: string; + score: number; + threadId?: string; + topicId: string; +} + +const printJson = (data: unknown) => { + console.log(JSON.stringify(data, null, 2)); +}; + +const outputJsonSuccess = (data: unknown) => { + const payload: JsonEnvelope = { + data, + error: null, + ok: true, + version: JSON_VERSION, + }; + printJson(payload); +}; + +const isRecord = (value: unknown): value is Record => + typeof value === 'object' && value !== null; + +const toJsonError = (error: unknown): JsonError => { + if (error instanceof Error) { + const maybeData = (error as Error & { data?: { code?: string } }).data; + const code = maybeData?.code; + + return { + code: typeof code === 'string' ? 
code : undefined, + message: error.message, + }; + } + + if (isRecord(error)) { + const code = typeof error.code === 'string' ? error.code : undefined; + const message = typeof error.message === 'string' ? error.message : 'Unknown error'; + return { code, message }; + } + + return { message: String(error) }; +}; + +const handleCommandError = (error: unknown, json: boolean) => { + const normalized = toJsonError(error); + + if (json) { + const payload: JsonEnvelope = { + data: null, + error: normalized, + ok: false, + version: JSON_VERSION, + }; + printJson(payload); + } else { + log.error(normalized.message); + } + + process.exit(1); +}; + +const parseScore = (value: string) => { + const score = Number(value); + if (!Number.isFinite(score)) { + throw new InvalidArgumentError(`Invalid score: ${value}`); + } + return score; +}; + +const parseBoolean = (value: string) => { + const normalized = value.trim().toLowerCase(); + if (['1', 'true', 'yes'].includes(normalized)) return true; + if (['0', 'false', 'no'].includes(normalized)) return false; + throw new InvalidArgumentError(`Invalid boolean value: ${value}`); +}; + +const parseResultJson = (value: string) => { + let parsed: unknown; + try { + parsed = JSON.parse(value); + } catch { + throw new InvalidArgumentError('Invalid JSON value for --result-json'); + } + + if (!isRecord(parsed) || Array.isArray(parsed)) { + throw new InvalidArgumentError('--result-json must be a JSON object'); + } + + return parsed; +}; + +const parseRunStatus = (value: string) => { + if (value !== 'completed' && value !== 'external') { + throw new InvalidArgumentError("Only 'completed' and 'external' are supported"); + } + + return value as 'completed' | 'external'; +}; + +const executeCommand = async ( + options: JsonOption, + action: () => Promise, + successMessage?: string, +) => { + try { + const data = await action(); + if (options.json) { + outputJsonSuccess(data); + return; + } + + if (successMessage) { + console.log(`${pc.green('OK')} 
${successMessage}`); + return; + } + + printJson(data); + } catch (error) { + handleCommandError(error, Boolean(options.json)); + } +}; + +export function registerEvalCommand(program: Command) { + const evalCmd = program.command('eval').description('Manage external evaluation workflows'); + + const runCmd = evalCmd.command('run').description('Manage evaluation runs'); + + runCmd + .command('get') + .description('Get run information') + .requiredOption('--run-id ', 'Run ID') + .option('--json', 'Output JSON envelope') + .action(async (options: RunGetOptions) => + executeCommand(options, async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.runGet.query({ runId: options.runId }); + }), + ); + + runCmd + .command('set-status') + .description('Set run status (external API supports completed or external)') + .requiredOption('--run-id ', 'Run ID') + .requiredOption('--status ', 'Status (completed | external)', parseRunStatus) + .option('--json', 'Output JSON envelope') + .action(async (options: RunSetStatusOptions) => + executeCommand( + options, + async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.runSetStatus.mutate({ + runId: options.runId, + status: options.status, + }); + }, + `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`, + ), + ); + + evalCmd + .command('dataset') + .description('Manage evaluation datasets') + .command('get') + .description('Get dataset information') + .requiredOption('--dataset-id ', 'Dataset ID') + .option('--json', 'Output JSON envelope') + .action(async (options: DatasetGetOptions) => + executeCommand(options, async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId }); + }), + ); + + evalCmd + .command('run-topics') + .description('Manage run topics') + .command('list') + .description('List topics in a run') + .requiredOption('--run-id ', 'Run ID') + 
.option('--only-external', 'Only return topics pending external evaluation') + .option('--json', 'Output JSON envelope') + .action(async (options: RunTopicsListOptions) => + executeCommand(options, async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.runTopicsList.query({ + onlyExternal: Boolean(options.onlyExternal), + runId: options.runId, + }); + }), + ); + + evalCmd + .command('threads') + .description('Manage evaluation threads') + .command('list') + .description('List threads by topic') + .requiredOption('--topic-id ', 'Topic ID') + .option('--json', 'Output JSON envelope') + .action(async (options: ThreadsListOptions) => + executeCommand(options, async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.threadsList.query({ topicId: options.topicId }); + }), + ); + + evalCmd + .command('messages') + .description('Manage evaluation messages') + .command('list') + .description('List messages by topic and optional thread') + .requiredOption('--topic-id ', 'Topic ID') + .option('--thread-id ', 'Thread ID') + .option('--json', 'Output JSON envelope') + .action(async (options: MessagesListOptions) => + executeCommand(options, async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.messagesList.query({ + threadId: options.threadId, + topicId: options.topicId, + }); + }), + ); + + evalCmd + .command('test-cases') + .description('Manage evaluation test cases') + .command('count') + .description('Count test cases by dataset') + .requiredOption('--dataset-id ', 'Dataset ID') + .option('--json', 'Output JSON envelope') + .action(async (options: TestCasesCountOptions) => + executeCommand(options, async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId }); + }), + ); + + evalCmd + .command('run-topic') + .description('Manage evaluation run-topic reporting') + .command('report-result') + 
.description('Report one evaluation result for a run topic') + .requiredOption('--run-id ', 'Run ID') + .requiredOption('--topic-id ', 'Topic ID') + .option('--thread-id ', 'Thread ID (required for k > 1)') + .requiredOption('--score ', 'Evaluation score', parseScore) + .requiredOption('--correct ', 'Whether the result is correct', parseBoolean) + .requiredOption('--result-json ', 'Raw evaluation result JSON object', parseResultJson) + .option('--json', 'Output JSON envelope') + .action(async (options: RunTopicReportResultOptions) => + executeCommand( + options, + async () => { + const client = await getTrpcClient(); + return client.agentEvalExternal.runTopicReportResult.mutate({ + correct: options.correct, + result: options.resultJson, + runId: options.runId, + score: options.score, + threadId: options.threadId, + topicId: options.topicId, + }); + }, + `Reported result for topic ${pc.bold(options.topicId)}`, + ), + ); +} diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 0340d01859..9ef4df98db 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc'; import { registerFileCommand } from './commands/file'; import { registerGenerateCommand } from './commands/generate'; import { registerKbCommand } from './commands/kb'; +import { registerEvalCommand } from './commands/eval'; import { registerLoginCommand } from './commands/login'; import { registerLogoutCommand } from './commands/logout'; import { registerMemoryCommand } from './commands/memory'; @@ -44,5 +45,6 @@ registerModelCommand(program); registerProviderCommand(program); registerPluginCommand(program); registerConfigCommand(program); +registerEvalCommand(program); program.parse(); diff --git a/locales/en-US/eval.json b/locales/en-US/eval.json index e24ef71cc7..a2c7564561 100644 --- a/locales/en-US/eval.json +++ b/locales/en-US/eval.json @@ -157,13 +157,15 @@ "difficulty.easy": "Easy", "difficulty.hard": "Hard", 
"difficulty.medium": "Medium", + "evalMode.answer-relevance": "LLM Relevance", + "evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)", "evalMode.contains": "Contains Match", "evalMode.contains.desc": "Output must contain the expected text", "evalMode.equals": "Exact Match", "evalMode.equals.desc": "Output must be exactly the same as expected", "evalMode.label": "Eval Mode", "evalMode.llm-rubric": "LLM Judge", - "evalMode.llm-rubric.desc": "Use LLM to evaluate output quality", + "evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)", "evalMode.placeholder": "Select eval mode", "evalMode.prompt.label": "Judge Prompt", "evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge", @@ -256,12 +258,16 @@ "run.running.hint": "Evaluation is running, results will appear shortly...", "run.status.aborted": "Aborted", "run.status.completed": "Completed", + "run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.", "run.status.error": "Run Error", + "run.status.external": "External", + "run.status.external.tooltip": "This evaluation is waiting for external scoring. 
Results will be updated when scoring is complete.", "run.status.failed": "Failed", "run.status.idle": "Idle", "run.status.pending": "Pending", "run.status.running": "Running", "run.status.timeout": "Timeout", + "sidebar": "Evaluation", "sidebar.benchmarks": "Benchmarks", "sidebar.dashboard": "Dashboard", "sidebar.datasets": "Datasets", diff --git a/locales/zh-CN/eval.json b/locales/zh-CN/eval.json index f4502aeb92..8896b4bf8d 100644 --- a/locales/zh-CN/eval.json +++ b/locales/zh-CN/eval.json @@ -161,6 +161,8 @@ "evalMode.contains.desc": "输出中必须包含期望的文本", "evalMode.equals": "精确匹配", "evalMode.equals.desc": "输出必须与期望内容完全一致", + "evalMode.external": "外部评估", + "evalMode.external.desc": "智能体完成运行后,由外部系统提交评估结果", "evalMode.label": "评估模式", "evalMode.llm-rubric": "LLM 评判", "evalMode.llm-rubric.desc": "使用 LLM 评估输出质量", @@ -256,7 +258,10 @@ "run.running.hint": "评测进行中,结果即将呈现...", "run.status.aborted": "已终止", "run.status.completed": "已完成", + "run.status.completed.tooltip": "评测已完成运行,所有结果已评估。", "run.status.error": "运行出错", + "run.status.external": "待外部评测", + "run.status.external.tooltip": "智能体已完成运行,等待外部系统提交评估结果。", "run.status.failed": "失败", "run.status.idle": "待开始", "run.status.pending": "等待中", diff --git a/packages/database/src/models/agentEval/dataset.ts b/packages/database/src/models/agentEval/dataset.ts index 8413acc43d..f4a33256ec 100644 --- a/packages/database/src/models/agentEval/dataset.ts +++ b/packages/database/src/models/agentEval/dataset.ts @@ -50,6 +50,8 @@ export class AgentEvalDatasetModel { benchmarkId: agentEvalDatasets.benchmarkId, createdAt: agentEvalDatasets.createdAt, description: agentEvalDatasets.description, + evalConfig: agentEvalDatasets.evalConfig, + evalMode: agentEvalDatasets.evalMode, id: agentEvalDatasets.id, identifier: agentEvalDatasets.identifier, metadata: agentEvalDatasets.metadata, diff --git a/packages/database/src/models/agentEval/run.ts b/packages/database/src/models/agentEval/run.ts index 0cc6dc89b5..4642b7c9da 100644 --- 
a/packages/database/src/models/agentEval/run.ts +++ b/packages/database/src/models/agentEval/run.ts @@ -31,7 +31,7 @@ export class AgentEvalRunModel { datasetId?: string; limit?: number; offset?: number; - status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted'; + status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted' | 'external'; }) => { const conditions = [eq(agentEvalRuns.userId, this.userId)]; diff --git a/packages/database/src/schemas/agentEvals.ts b/packages/database/src/schemas/agentEvals.ts index 027e2eabea..8dc5bc2341 100644 --- a/packages/database/src/schemas/agentEvals.ts +++ b/packages/database/src/schemas/agentEvals.ts @@ -43,6 +43,7 @@ const evalModes = [ 'similar', 'levenshtein', 'rubric', + 'external', ] as const; // ============================================ @@ -181,7 +182,7 @@ export const agentEvalRuns = pgTable( name: text('name'), status: text('status', { - enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted'], + enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'], }) .default('idle') .notNull(), @@ -228,7 +229,7 @@ export const agentEvalRunTopics = pgTable( .notNull(), status: text('status', { - enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout'], + enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout', 'external', 'completed'], }), score: real('score'), diff --git a/packages/eval-rubric/src/evaluate.ts b/packages/eval-rubric/src/evaluate.ts index 63262c8178..14c717f2da 100644 --- a/packages/eval-rubric/src/evaluate.ts +++ b/packages/eval-rubric/src/evaluate.ts @@ -87,12 +87,20 @@ export const evaluate = async ( const candidates: string[] = JSON.parse(expected); const results: MatchResult[] = []; for (const c of candidates) { - results.push(await match({ actual: extracted, expected: c, rubric }, matchContext)); + results.push( + await match( + { input: testCase.input, actual: extracted, expected: c, rubric }, + 
matchContext, + ), + ); } const best = results.reduce((a, b) => (a.score >= b.score ? a : b)); result = best; } else { - result = await match({ actual: extracted, expected, rubric }, matchContext); + result = await match( + { input: testCase.input, actual: extracted, expected, rubric }, + matchContext, + ); } rubricResults.push({ diff --git a/packages/eval-rubric/src/matchers/external.ts b/packages/eval-rubric/src/matchers/external.ts new file mode 100644 index 0000000000..1390ed5edb --- /dev/null +++ b/packages/eval-rubric/src/matchers/external.ts @@ -0,0 +1,9 @@ +import type { MatchResult } from './types'; + +export const matchExternal = async (): Promise => { + return { + passed: false, + score: 0, + reason: 'Waiting for external evaluation...', + }; +}; diff --git a/packages/eval-rubric/src/matchers/index.ts b/packages/eval-rubric/src/matchers/index.ts index fa89733daa..aa17a339c9 100644 --- a/packages/eval-rubric/src/matchers/index.ts +++ b/packages/eval-rubric/src/matchers/index.ts @@ -4,8 +4,10 @@ import { matchAnyOf } from './anyOf'; import { matchContains } from './contains'; import { matchEndsWith } from './endsWith'; import { matchEquals } from './equals'; +import { matchExternal } from './external'; import { matchJsonSchema } from './jsonSchema'; import { matchLevenshtein } from './levenshtein'; +import { matchLLMEq } from './llmEq'; import { matchLLMRubric } from './llmRubric'; import { matchNumeric } from './numeric'; import { matchRegex } from './regex'; @@ -18,10 +20,15 @@ export type { GenerateObjectPayload, MatchContext, MatchResult } from './types'; * Run a single rubric matcher against actual vs expected */ export const match = async ( - params: { actual: string; expected: string | undefined; rubric: EvalBenchmarkRubric }, + params: { + input: string; + actual: string; + expected: string | undefined; + rubric: EvalBenchmarkRubric; + }, context?: MatchContext, ): Promise => { - const { actual, expected, rubric } = params; + const { actual, 
expected, rubric, input } = params; const { type, config } = rubric; switch (type) { @@ -57,6 +64,10 @@ export const match = async ( return matchLevenshtein(actual, expected, config); } + case 'answer-relevance': { + return matchLLMEq(input, actual, expected, rubric, context); + } + case 'llm-rubric': { return matchLLMRubric(actual, expected, rubric, context); } @@ -65,6 +76,10 @@ export const match = async ( return matchJsonSchema(actual, config); } + case 'external': { + return matchExternal(); + } + default: { return { passed: false, diff --git a/packages/eval-rubric/src/matchers/llmEq.ts b/packages/eval-rubric/src/matchers/llmEq.ts new file mode 100644 index 0000000000..671d5296ef --- /dev/null +++ b/packages/eval-rubric/src/matchers/llmEq.ts @@ -0,0 +1,89 @@ +import type { EvalBenchmarkRubric, RubricConfigLLM } from '@lobechat/types'; + +import type { MatchContext, MatchResult } from './types'; + +const DEFAULT_SYSTEM_ROLE = [ + 'You are an expert evaluation judge. Your task is to score how well an AI output meets the given criteria.', + 'Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.', + 'Your judgement must be in the format and criteria specified below:', + "extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.", + 'reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. 
Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.', + 'Scoring rules:', + 'score: Return 1 only when extracted_final_answer clearly and unambiguously matches [correct_answer], or is within a small margin of error for numerical problems.', + 'score: Return 0 when extracted_final_answer is incorrect, missing, ambiguous, non-equivalent, or when you are uncertain.', + 'Treat uncertainty as incorrect (score = 0).', + 'Respond with a JSON object containing ', + '"score" (number: 0 or 1)', + 'and "reason" (brief explanation for the judgement).', +].join('\n'); + +const JUDGE_SCORE_SCHEMA: Record = { + additionalProperties: false, + properties: { + score: { + description: 'Binary score for judgement: 1=correct, 0=incorrect/uncertain', + enum: [0, 1], + type: 'number', + }, + reason: { description: 'Brief explanation for the judgement', type: 'string' }, + }, + required: ['score', 'reason'], + type: 'object', +}; + +function buildJudgeUserPrompt( + question: string, + actual: string, + expected: string | undefined, +): string { + const parts = [`[question]\n${question}`, `[response]\n${actual}`]; + if (expected) { + parts.push(`[correct_answer]\n${expected}`); + } + return parts.join('\n\n'); +} + +export const matchLLMEq = async ( + question: string, + actual: string, + expected: string | undefined, + rubric: EvalBenchmarkRubric, + context?: MatchContext, +): Promise => { + if (!context?.generateObject) { + return { passed: false, reason: 'LLM judge not available', score: 0 }; + } + + const cfg = rubric.config as RubricConfigLLM; + const model = cfg.model || context.judgeModel; + + if (!model) { + return { passed: false, reason: 'No judge model configured', score: 0 }; + } + + try { + const result = await context.generateObject({ + messages: [ + { content: cfg.systemRole || DEFAULT_SYSTEM_ROLE, role: 'system' }, + { content: 
buildJudgeUserPrompt(question, actual, expected), role: 'user' }, + ], + model, + provider: cfg.provider, + schema: JUDGE_SCORE_SCHEMA, + }); + + const score = result?.score === 1 ? 1 : 0; + + return { + passed: score === 1, + reason: result?.reason, + score, + }; + } catch (error) { + return { + passed: false, + reason: `LLM judge failed: ${error instanceof Error ? error.message : String(error)}`, + score: 0, + }; + } +}; diff --git a/packages/eval-rubric/src/matchers/llmRubric.ts b/packages/eval-rubric/src/matchers/llmRubric.ts index 6c5a4212f8..48758ed4e7 100644 --- a/packages/eval-rubric/src/matchers/llmRubric.ts +++ b/packages/eval-rubric/src/matchers/llmRubric.ts @@ -64,6 +64,10 @@ export const matchLLMRubric = async ( schema: JUDGE_SCORE_SCHEMA, }); + if (!result?.score) { + return { passed: false, reason: 'LLM judge did not return a score', score: 0 }; + } + const score = Math.max(0, Math.min(1, result.score)); const threshold = rubric.threshold ?? 0.6; diff --git a/packages/types/src/eval/agentEval.ts b/packages/types/src/eval/agentEval.ts index 83be7d75fd..6cf994bcc4 100644 --- a/packages/types/src/eval/agentEval.ts +++ b/packages/types/src/eval/agentEval.ts @@ -34,7 +34,7 @@ export interface EvalTestCaseMetadata { /** * Evaluation run status */ -export type EvalRunStatus = 'aborted' | 'completed' | 'failed' | 'pending' | 'running'; +export type EvalRunStatus = 'aborted' | 'completed' | 'external' | 'failed' | 'pending' | 'running'; /** * Evaluation run configuration @@ -96,6 +96,7 @@ export interface EvalRunMetrics { cost?: number; duration?: number; errorCases?: number; + externalCases?: number; failedCases: number; llmCalls?: number; passAllK?: number; @@ -183,6 +184,8 @@ export interface EvalRunTopicResult { completionReason?: string; operationId?: string; rubricScores?: EvalRubricScore[]; + /** Set when evalMode is 'external' — agent finished, awaiting external scoring */ + awaitingExternalEval?: boolean; } /*eslint-enable 
perfectionist/sort-interfaces */ @@ -194,14 +197,16 @@ export interface EvalThreadResult { cost?: number; duration?: number; error?: string; + llmCalls?: number; operationId?: string; passed?: boolean; rubricScores?: EvalRubricScore[]; score?: number; - status?: 'error' | 'failed' | 'passed' | 'running' | 'timeout'; + status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout' | 'completed'; steps?: number; threadId: string; tokens?: number; + toolCalls?: number; } /** diff --git a/packages/types/src/eval/agentEvalRun.ts b/packages/types/src/eval/agentEvalRun.ts index 2e609c0461..24506c6ed1 100644 --- a/packages/types/src/eval/agentEvalRun.ts +++ b/packages/types/src/eval/agentEvalRun.ts @@ -11,6 +11,7 @@ export type AgentEvalRunStatus = | 'failed' | 'idle' | 'pending' + | 'external' | 'running'; export interface AgentEvalRunTargetAgent { diff --git a/packages/types/src/eval/rubric.ts b/packages/types/src/eval/rubric.ts index 5c9721988d..758d469ebd 100644 --- a/packages/types/src/eval/rubric.ts +++ b/packages/types/src/eval/rubric.ts @@ -22,6 +22,8 @@ export type RubricType = // Similarity | 'similar' | 'levenshtein' + // External evaluation + | 'external' // Composite | 'rubric'; diff --git a/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts index 63a92bc8db..23d8e9a9be 100644 --- a/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts +++ b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts @@ -66,9 +66,18 @@ export const { POST } = serve( log('Metrics: %O', metrics); - // Step 4: Update run status (failed if all cases errored/timed out) + // Step 4: Update run status + // external: any topic awaits external scoring → whole run waits too + // failed: all cases are non-success (error/timeout) + // completed: everything else const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0); - const runStatus = 
nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed'; + const externalCount = metrics.externalCases || 0; + const runStatus = + externalCount > 0 + ? 'external' + : nonSuccessCases >= metrics.totalCases + ? 'failed' + : 'completed'; await context.run('agent-eval-run:update-run', async () => { const runModel = new AgentEvalRunModel(db, userId); diff --git a/src/locales/default/eval.ts b/src/locales/default/eval.ts index bca7b5d7cf..99823eb2c6 100644 --- a/src/locales/default/eval.ts +++ b/src/locales/default/eval.ts @@ -173,9 +173,14 @@ export default { 'evalMode.contains.desc': 'Output must contain the expected text', 'evalMode.equals': 'Exact Match', 'evalMode.equals.desc': 'Output must be exactly the same as expected', + 'evalMode.external': 'External Eval', + 'evalMode.external.desc': 'Agent runs to completion; scoring is handled by an external system', 'evalMode.label': 'Eval Mode', 'evalMode.llm-rubric': 'LLM Judge', - 'evalMode.llm-rubric.desc': 'Use LLM to evaluate output quality', + 'evalMode.llm-rubric.desc': + 'Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)', + 'evalMode.answer-relevance': 'LLM Relevance', + 'evalMode.answer-relevance.desc': 'Use LLM to evaluate answer relevance (yes or no)', 'evalMode.placeholder': 'Select eval mode', 'evalMode.prompt.label': 'Judge Prompt', 'evalMode.prompt.placeholder': 'Enter the evaluation criteria or prompt for LLM judge', @@ -204,6 +209,8 @@ export default { 'run.idle.hint': 'Click Start to begin evaluation', 'run.pending.hint': 'Evaluation is queued, waiting to start...', 'run.running.hint': 'Evaluation is running, results will appear shortly...', + 'run.external.hint': + 'Running completed. 
Waiting for external system to submit evaluation results ...', 'run.filter.active': 'Active', 'run.filter.empty': 'No runs match the current filter.', @@ -249,6 +256,9 @@ export default { 'run.detail.report': 'Evaluation Summary', 'run.detail.config': 'Evaluation Config', 'run.detail.configSnapshot': 'Configuration Snapshot', + 'run.detail.copyRunId': 'Copy Run ID', + 'run.detail.copyRunIdFailed': 'Failed to copy Run ID', + 'run.detail.copyRunIdSuccess': 'Run ID copied', 'run.detail.dataset': 'Dataset', 'run.detail.model': 'Model', 'run.detail.overview': 'Overview', @@ -279,7 +289,11 @@ export default { 'run.status.aborted': 'Aborted', 'run.status.completed': 'Completed', + 'run.status.completed.tooltip': 'The run and external scoring are completed.', 'run.status.error': 'Run Error', + 'run.status.external': 'Awaiting Eval', + 'run.status.external.tooltip': + 'The agent has finished running. Waiting for an external system to submit evaluation results.', 'run.status.failed': 'Failed', 'run.status.idle': 'Idle', 'run.status.pending': 'Pending', diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx index 2f55f35012..c56e5c4d99 100644 --- a/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx +++ b/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx @@ -208,6 +208,7 @@ const DatasetDetail = memo(() => { }} > ( ) : ( ({ })); interface TestCaseTableProps { + datasetEvalMode?: string | null; diffFilter: 'all' | 'easy' | 'medium' | 'hard'; onAddCase?: () => void; onDelete?: (testCase: any) => void; @@ -106,6 +107,7 @@ const TestCaseTable = memo( total, search, diffFilter, + datasetEvalMode, pagination, onSearchChange, onDiffFilterChange, @@ -170,10 +172,18 @@ const TestCaseTable = memo( dataIndex: 'evalMode', key: 'evalMode', render: (text: string) => { - if (!text) return -; + const effective = text ?? 
datasetEvalMode; + if (!effective) return -; + const isInherited = !text && !!datasetEvalMode; return ( - - {t(`evalMode.${text}` as any)} + + {t(`evalMode.${effective}` as any)} ); }, @@ -238,7 +248,7 @@ const TestCaseTable = memo( } return base; - }, [pagination, readOnly, onEdit, onDelete, t]); + }, [pagination, readOnly, onEdit, onDelete, t, datasetEvalMode]); return ( <> diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx index 002bb2a9af..87a97ee372 100644 --- a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx +++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx @@ -67,6 +67,8 @@ const StatusBadge = memo<{ record: any }>(({ record }) => { const { t } = useTranslation('eval'); const status: string | null | undefined = record.status; + // return
{status}
; + if (!status || status === 'pending') return {t('run.status.pending')}} />; @@ -86,6 +88,17 @@ const StatusBadge = memo<{ record: any }>(({ record }) => { if (status === 'timeout') return {t('run.status.timeout')}} />; + if (status === 'external') { + const badge = {t('run.status.external')}} />; + return {badge}; + } + + if (status === 'completed') { + // 'completed' means both the run and its evaluation finished; it does not imply the result passed + const badge = {t('run.status.completed')}} />; + return {badge}; + } + return {status}} />; }); @@ -99,15 +112,29 @@ const ThreadDots = memo<{ threads: EvalThreadResult[] }>(({ threads }) => ( if (thread.passed === true) { color = cssVar.colorSuccess; + } else if (thread.passed === false) { + color = cssVar.colorError; + } + + if (thread.status === 'external') { + color = cssVar.colorWarning; + } + + if (thread.status === 'completed') { + color = cssVar.colorPrimary; } const label = thread.error ? 'error' : thread.passed === true ? 'passed' - : thread.passed === false + : thread.passed === false && thread.status !== 'completed' ? 'failed' - : 'pending'; + : thread.status === 'external' + ? 'Awaiting for external evaluation' + : thread.status === 'completed' + ? 
'completed' + : 'pending'; return ( @@ -406,6 +433,8 @@ const CaseResultsTable = memo( { label: t('table.filter.error'), value: 'error' }, { label: t('table.filter.running'), value: 'running' }, { label: t('run.status.pending'), value: 'pending' }, + { label: t('run.status.external'), value: 'external' }, + { label: t('run.status.completed'), value: 'completed' }, ]} onChange={setStatusFilter} /> diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx index 31dcfd1d72..88cd7f9766 100644 --- a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx +++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx @@ -96,7 +96,7 @@ const useStyles = createStyles(({ css, token }) => ({ `, })); -const PendingState = memo(() => { +const PendingState = memo(({ hint }: { hint?: string }) => { const { t } = useTranslation('eval'); const { cx, styles } = useStyles(); @@ -119,7 +119,7 @@ const PendingState = memo(() => { -
{t('run.pending.hint')}
+
{hint ?? t('run.pending.hint') /* `hint` is optional: fall back to the default pending hint when it is omitted */}
); }); diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx index dc811ee957..5d9dc12e00 100644 --- a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx +++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx @@ -2,10 +2,19 @@ import { AGENT_PROFILE_URL } from '@lobechat/const'; import type { AgentEvalRunDetail } from '@lobechat/types'; -import { ActionIcon, Avatar, Flexbox, Highlighter, Markdown } from '@lobehub/ui'; +import { ActionIcon, Avatar, copyToClipboard, Flexbox, Highlighter, Markdown } from '@lobehub/ui'; import { App, Button, Card, Tag, Typography } from 'antd'; import { createStyles } from 'antd-style'; -import { ArrowLeft, ChevronDown, ChevronUp, Pencil, Play, Square, Trash2 } from 'lucide-react'; +import { + ArrowLeft, + ChevronDown, + ChevronUp, + Copy, + Pencil, + Play, + Square, + Trash2, +} from 'lucide-react'; import { memo, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { Link, useNavigate } from 'react-router-dom'; @@ -170,6 +179,14 @@ const RunHeader = memo(({ run, benchmarkId, hideStart }) => { window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank'); } }; + const handleCopyRunId = async () => { + try { + await copyToClipboard(run.id); + message.success(t('run.detail.copyRunIdSuccess')); + } catch { + message.error(t('run.detail.copyRunIdFailed')); + } + }; const formatDate = (date?: Date | string) => { if (!date) return ''; @@ -194,6 +211,12 @@ const RunHeader = memo(({ run, benchmarkId, hideStart }) => { {run.name || run.id.slice(0, 8)} + {/* Meta info row */} diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx index b34e9cadf1..e9842d722c 100644 --- 
a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx +++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx @@ -104,7 +104,9 @@ const RunDetail = memo(() => { {runDetail.status === 'running' ? ( ) : runDetail.status === 'pending' ? ( - + + ) : runDetail.status === 'external' ? ( + ) : ( )} diff --git a/src/routes/(main)/eval/config/datasetPresets.ts b/src/routes/(main)/eval/config/datasetPresets.ts index 842ff1fda0..fd1d742211 100644 --- a/src/routes/(main)/eval/config/datasetPresets.ts +++ b/src/routes/(main)/eval/config/datasetPresets.ts @@ -36,6 +36,26 @@ export interface DatasetPreset { } export const DATASET_PRESETS: Record = { + 'browsecomp': { + id: 'browsecomp', + category: 'research', + name: 'BrowseComp', + description: 'Measuring the ability for agents to browse the web, comprises 1,266 questions.', + icon: Globe, + formatDescription: 'format: Topic (category/tags), Question (input), Answer (expected)', + requiredFields: ['question', 'answer', 'problem_topic', 'canary'], + optionalFields: [], + fieldInference: { + input: ['question'], + expected: ['answer'], + choices: [], + category: ['problem_topic'], + }, + validation: { + requireExpected: true, + expectedFormat: 'string', + }, + }, // === Deep Research / QA Category === 'browsecomp-zh': { id: 'browsecomp-zh', @@ -58,6 +78,129 @@ export const DATASET_PRESETS: Record = { }, }, + 'widesearch': { + id: 'widesearch', + category: 'research', + name: 'WideSearch', + description: + 'Evaluating the capabilities of agents in broad information-seeking tasks, consisting of 200 questions.', + icon: Globe, + formatDescription: 'format: instance_id, query (input), evaluation (expected), language', + requiredFields: ['instance_id', 'query', 'evaluation', 'language'], + optionalFields: [], + fieldInference: { + input: ['query'], + expected: ['evaluation'], + choices: [], + category: ['language'], + sortOrder: [], + }, + validation: { + requireExpected: true, + expectedFormat: 'string', 
+ }, + }, + + 'hle-text': { + id: 'hle-text', + category: 'research', + name: "Humanity's Last Exam, HLE (Text Only)", + description: + "Humanity's Last Exam (HLE) is a multi-modal benchmark at the frontier of human knowledge, consisting of 2150 questions.", + icon: Globe, + formatDescription: + 'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category', + requiredFields: [ + 'id', + 'question', + 'answer', + 'answer_type', + 'rationale', + 'raw_subject', + 'category', + ], + optionalFields: ['canary'], + fieldInference: { + input: ['question'], + expected: ['answer'], + choices: [], + category: ['category'], + }, + }, + + 'hle-verified': { + id: 'hle-verified', + category: 'research', + name: "Humanity's Last Exam, HLE (Verified Answers)", + description: + "A subset of Humanity's Last Exam (HLE) with verified answers, designed to evaluate the ability to produce correct answers rather than just plausible ones.", + icon: Globe, + formatDescription: + 'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category, Verified_Classes', + requiredFields: [ + 'id', + 'question', + 'answer', + 'answer_type', + 'rationale', + 'raw_subject', + 'category', + 'Verified_Classes', + ], + optionalFields: ['canary'], + fieldInference: { + input: ['question'], + expected: ['answer'], + choices: [], + category: ['category'], + }, + }, + + 'deepsearchqa': { + id: 'deepsearchqa', + category: 'research', + name: 'DeepSearchQA', + description: + 'A 900-prompt factuality benchmark from Google DeepMind, designed to evaluate agents on difficult multi-step information-seeking tasks across 17 different fields.', + icon: Globe, + formatDescription: 'problem, problem_category, answer, answer_type', + requiredFields: ['problem', 'answer', 'problem_category', 'answer_type'], + optionalFields: [], + fieldInference: { + input: ['problem'], + expected: ['answer'], + choices: [], + category: ['problem_category'], + sortOrder: [], 
+ }, + validation: { + requireExpected: true, + expectedFormat: 'string', + }, + }, + + 'sealqa': { + id: 'sealqa', + category: 'research', + name: 'SealQA', + description: + 'SealQA is a new challenge benchmark for evaluating SEarch- Augmented Language models on fact-seeking questions where web search yields conflicting, noisy, or unhelpful results.', + icon: Globe, + formatDescription: 'format: question (input), answer (expected), topic (category)', + requiredFields: ['question', 'answer', 'topic', 'canary'], + optionalFields: [], + fieldInference: { + input: ['question'], + expected: ['answer'], + choices: [], + category: ['topic'], + }, + validation: { + requireExpected: true, + expectedFormat: 'string', + }, + }, + 'xbench': { id: 'xbench', category: 'research', diff --git a/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx b/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx index 8fc8f58a15..76dadff60c 100644 --- a/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx +++ b/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx @@ -157,6 +157,7 @@ const DatasetCreateModal = memo( { label: t('evalMode.equals'), value: 'equals' }, { label: t('evalMode.contains'), value: 'contains' }, { label: t('evalMode.llm-rubric'), value: 'llm-rubric' }, + { label: t('evalMode.external'), value: 'external' }, ]} /> diff --git a/src/routes/(main)/eval/features/DatasetEditModal/index.tsx b/src/routes/(main)/eval/features/DatasetEditModal/index.tsx index c4871dac34..e86fb1b14c 100644 --- a/src/routes/(main)/eval/features/DatasetEditModal/index.tsx +++ b/src/routes/(main)/eval/features/DatasetEditModal/index.tsx @@ -131,14 +131,30 @@ const DatasetEditModal = memo(({ open, onCancel, dataset, { label: t('evalMode.equals'), value: 'equals' }, { label: t('evalMode.contains'), value: 'contains' }, { label: t('evalMode.llm-rubric'), value: 'llm-rubric' }, + { label: t('evalMode.answer-relevance'), value: 'answer-relevance' }, + { label: 
t('evalMode.external'), value: 'external' }, ]} /> - {evalModeValue === 'llm-rubric' && ( - -