feat(eval): add external scoring mode (#12729)

* wip: add llm relevant & BrowseComp

* wip: add widesearch desc

* wip: dsqa, hle, widesearch

* wip: add dsqa

* wip: add awaiting eval status for runs

* wip: add awaiting status for run

* wip: adjust hle-verified

* 🐛 fix: browsecomp topics

* 📝 docs: add annotations

* wip: add awaiting status for pass@k

* wip: add complete status

* wip: update thread dots

* wip: update run status page

* wip: remove useless impl

* wip: update prompt

* feat: add external eval routes

* wip: add eval cli

* 🐛 fix: support authorization in non-browser environments

* wip: pass tests

* ♻️ refactor: remove tests

* ♻️ refactor: to camel case
This commit is contained in:
Rylan Cai
2026-03-10 09:53:26 +08:00
committed by GitHub
parent 255a1c21a8
commit ea329113be
34 changed files with 1655 additions and 40 deletions

View File

@@ -0,0 +1,285 @@
import { Command } from 'commander';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
// Hoisted so the mock object exists before the vi.mock factories below run:
// a single fake tRPC client covering every agentEvalExternal procedure the
// eval command can call.
const { mockTrpcClient } = vi.hoisted(() => ({
  mockTrpcClient: {
    agentEvalExternal: {
      datasetGet: { query: vi.fn() },
      messagesList: { query: vi.fn() },
      runGet: { query: vi.fn() },
      runSetStatus: { mutate: vi.fn() },
      runTopicReportResult: { mutate: vi.fn() },
      runTopicsList: { query: vi.fn() },
      testCasesCount: { query: vi.fn() },
      threadsList: { query: vi.fn() },
    },
  },
}));

// Hoisted mock for the client factory so each test controls what the
// command receives from getTrpcClient().
const { getTrpcClientMock } = vi.hoisted(() => ({
  getTrpcClientMock: vi.fn(),
}));

vi.mock('../api/client', () => ({
  getTrpcClient: getTrpcClientMock,
}));

// Silence CLI logging; individual tests assert on log.error directly.
vi.mock('../utils/logger', () => ({
  log: {
    debug: vi.fn(),
    error: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
  },
  setVerbose: vi.fn(),
}));
// eslint-disable-next-line import-x/first
import { log } from '../utils/logger';
// eslint-disable-next-line import-x/first
import { registerEvalCommand } from './eval';
// CLI-level tests: each case drives `program.parseAsync` with a real argv
// array and asserts on the mocked tRPC client plus console / process.exit
// side effects.
describe('eval command', () => {
  let exitSpy: ReturnType<typeof vi.spyOn>;
  let logSpy: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    getTrpcClientMock.mockResolvedValue(mockTrpcClient);
    // Commands call process.exit(1) on failure; stub it so the test keeps running.
    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
    // Capture console.log so JSON envelopes can be parsed and asserted.
    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
    // Reset every query/mutate fn on the shared mock client between tests.
    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
      for (const fn of Object.values(method)) {
        (fn as ReturnType<typeof vi.fn>).mockReset();
      }
    }
  });

  afterEach(() => {
    exitSpy.mockRestore();
    logSpy.mockRestore();
    vi.clearAllMocks();
  });

  // Fresh Command tree per test; exitOverride makes commander throw instead
  // of exiting the process on parse errors.
  const createProgram = () => {
    const program = new Command();
    program.exitOverride();
    registerEvalCommand(program);
    return program;
  };

  it('should call runGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
      config: { k: 1 },
      datasetId: 'dataset-1',
      id: 'run-1',
    });
    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);
    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });
    // First console.log call carries the printed JSON envelope.
    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: {
        config: { k: 1 },
        datasetId: 'dataset-1',
        id: 'run-1',
      },
      error: null,
      ok: true,
      version: 'v1',
    });
  });

  it('should call datasetGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
      id: 'dataset-1',
      metadata: { preset: 'deepsearchqa' },
    });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'dataset',
      'get',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should pass onlyExternal to runTopicsList', async () => {
    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topics',
      'list',
      '--run-id',
      'run-1',
      '--only-external',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
      onlyExternal: true,
      runId: 'run-1',
    });
  });

  it('should pass topicId and threadId to messagesList', async () => {
    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'messages',
      'list',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  // Verifies the argParsers: --score → number, --correct → boolean,
  // --result-json → parsed object.
  it('should parse and report run-topic result', async () => {
    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
      success: true,
    });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topic',
      'report-result',
      '--run-id',
      'run-1',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--score',
      '0.91',
      '--correct',
      'true',
      '--result-json',
      '{"grade":"A"}',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
      correct: true,
      result: { grade: 'A' },
      runId: 'run-1',
      score: 0.91,
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should update run status', async () => {
    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
      runId: 'run-1',
      status: 'completed',
      success: true,
    });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'set-status',
      '--run-id',
      'run-1',
      '--status',
      'completed',
    ]);
    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
      runId: 'run-1',
      status: 'completed',
    });
    // Without --json the command prints a human-readable success line.
    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
  });

  it('should output json error envelope when command fails', async () => {
    // Mimic a tRPC error: message on the Error, code under `data`.
    const error = Object.assign(new Error('Run not found'), {
      data: { code: 'NOT_FOUND' },
    });
    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'get',
      '--run-id',
      'run-404',
      '--json',
    ]);
    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: null,
      error: { code: 'NOT_FOUND', message: 'Run not found' },
      ok: false,
      version: 'v1',
    });
    expect(exitSpy).toHaveBeenCalledWith(1);
  });

  it('should query test case count', async () => {
    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'test-cases',
      'count',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should log plain error without --json', async () => {
    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));
    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']);
    expect(log.error).toHaveBeenCalledWith('boom');
    expect(exitSpy).toHaveBeenCalledWith(1);
  });
});

View File

@@ -0,0 +1,326 @@
import type { Command } from 'commander';
import { InvalidArgumentError } from 'commander';
import pc from 'picocolors';
import { getTrpcClient } from '../api/client';
import { log } from '../utils/logger';
// Envelope format version emitted with every `--json` payload; bump when
// the envelope shape changes so external consumers can detect it.
const JSON_VERSION = 'v1' as const;

/** Normalized error carried in a failed JSON envelope. */
interface JsonError {
  code?: string;
  message: string;
}

/** Success/failure wrapper printed when `--json` is passed. */
interface JsonEnvelope<T> {
  data: T | null;
  error: JsonError | null;
  ok: boolean;
  version: typeof JSON_VERSION;
}

/** Shared flag available on every subcommand: switch to JSON envelope output. */
interface JsonOption {
  json?: boolean;
}

/** Options for `eval run get`. */
interface RunGetOptions extends JsonOption {
  runId: string;
}

/** Options for `eval run set-status`; the external API allows two states only. */
interface RunSetStatusOptions extends JsonOption {
  runId: string;
  status: 'completed' | 'external';
}

/** Options for `eval dataset get`. */
interface DatasetGetOptions extends JsonOption {
  datasetId: string;
}

/** Options for `eval run-topics list`. */
interface RunTopicsListOptions extends JsonOption {
  onlyExternal?: boolean;
  runId: string;
}

/** Options for `eval threads list`. */
interface ThreadsListOptions extends JsonOption {
  topicId: string;
}

/** Options for `eval messages list`; threadId optionally narrows to one thread. */
interface MessagesListOptions extends JsonOption {
  threadId?: string;
  topicId: string;
}

/** Options for `eval test-cases count`. */
interface TestCasesCountOptions extends JsonOption {
  datasetId: string;
}

/** Options for `eval run-topic report-result` (one scored result submission). */
interface RunTopicReportResultOptions extends JsonOption {
  correct: boolean;
  resultJson: Record<string, unknown>;
  runId: string;
  score: number;
  threadId?: string;
  topicId: string;
}
/** Pretty-print any value as 2-space-indented JSON on stdout. */
const printJson = (data: unknown) => {
  const serialized = JSON.stringify(data, null, 2);
  console.log(serialized);
};
/** Print a successful JSON envelope wrapping `data`. */
const outputJsonSuccess = (data: unknown) => {
  printJson({
    data,
    error: null,
    ok: true,
    version: JSON_VERSION,
  } satisfies JsonEnvelope<unknown>);
};
/** Narrow an unknown value to a plain object record (excludes null; arrays pass). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === 'object';
}
/**
 * Convert an arbitrary thrown value into the JsonError shape.
 * Picks up a tRPC-style `data.code` when present on an Error instance.
 */
const toJsonError = (error: unknown): JsonError => {
  if (error instanceof Error) {
    const data = (error as Error & { data?: { code?: string } }).data;
    return {
      code: typeof data?.code === 'string' ? data.code : undefined,
      message: error.message,
    };
  }
  if (typeof error === 'object' && error !== null) {
    const record = error as Record<string, unknown>;
    return {
      code: typeof record.code === 'string' ? record.code : undefined,
      message: typeof record.message === 'string' ? record.message : 'Unknown error',
    };
  }
  return { message: String(error) };
};
/**
 * Report a command failure and terminate with exit code 1.
 * In `--json` mode a failure envelope is printed; otherwise the message is
 * logged plainly.
 */
const handleCommandError = (error: unknown, json: boolean) => {
  const normalized = toJsonError(error);
  if (json) {
    printJson({
      data: null,
      error: normalized,
      ok: false,
      version: JSON_VERSION,
    } satisfies JsonEnvelope<null>);
  } else {
    log.error(normalized.message);
  }
  process.exit(1);
};
/**
 * Commander argParser for `--score`.
 *
 * Accepts any finite decimal. Rejects empty/whitespace-only input
 * explicitly: `Number('')` and `Number('   ')` evaluate to 0, so without
 * this guard `--score ''` would be silently accepted as a score of 0.
 *
 * @throws InvalidArgumentError when the value is empty or not a finite number
 */
const parseScore = (value: string) => {
  if (value.trim() === '') {
    throw new InvalidArgumentError(`Invalid score: ${value}`);
  }
  const score = Number(value);
  if (!Number.isFinite(score)) {
    throw new InvalidArgumentError(`Invalid score: ${value}`);
  }
  return score;
};
/**
 * Commander argParser for `--correct`: case-insensitively accepts
 * 1/true/yes and 0/false/no, rejecting anything else.
 */
const parseBoolean = (value: string) => {
  switch (value.trim().toLowerCase()) {
    case '1':
    case 'true':
    case 'yes': {
      return true;
    }
    case '0':
    case 'false':
    case 'no': {
      return false;
    }
    default: {
      throw new InvalidArgumentError(`Invalid boolean value: ${value}`);
    }
  }
};
/**
 * Commander argParser for `--result-json`: the value must parse as a JSON
 * object literal (arrays and scalars are rejected).
 */
const parseResultJson = (value: string) => {
  let parsed: unknown;
  try {
    parsed = JSON.parse(value);
  } catch {
    throw new InvalidArgumentError('Invalid JSON value for --result-json');
  }
  const isPlainObject = typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed);
  if (!isPlainObject) {
    throw new InvalidArgumentError('--result-json must be a JSON object');
  }
  return parsed as Record<string, unknown>;
};
/**
 * Commander argParser for `--status`: the external API only accepts these
 * two terminal states. The literal comparisons narrow the type, so no cast
 * is needed.
 */
const parseRunStatus = (value: string): 'completed' | 'external' => {
  if (value === 'completed' || value === 'external') {
    return value;
  }
  throw new InvalidArgumentError("Only 'completed' and 'external' are supported");
};
/**
 * Shared action wrapper: runs `action`, then renders its result.
 * - with `--json`: prints a success envelope;
 * - otherwise: prints a green "OK" line when `successMessage` is given,
 *   or the raw result as JSON.
 * Failures are delegated to handleCommandError, which exits the process.
 */
const executeCommand = async (
  options: JsonOption,
  action: () => Promise<unknown>,
  successMessage?: string,
) => {
  try {
    const data = await action();
    if (options.json) {
      outputJsonSuccess(data);
    } else if (successMessage) {
      console.log(`${pc.green('OK')} ${successMessage}`);
    } else {
      printJson(data);
    }
  } catch (error) {
    handleCommandError(error, Boolean(options.json));
  }
};
/**
 * Register the `eval` command group: read-only queries plus status/result
 * reporting against the external-evaluation tRPC routes
 * (`agentEvalExternal.*`). Every subcommand supports `--json` to emit the
 * versioned machine-readable envelope.
 */
export function registerEvalCommand(program: Command) {
  const evalCmd = program.command('eval').description('Manage external evaluation workflows');

  // eval run get | eval run set-status
  const runCmd = evalCmd.command('run').description('Manage evaluation runs');
  runCmd
    .command('get')
    .description('Get run information')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunGetOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.runGet.query({ runId: options.runId });
      }),
    );
  runCmd
    .command('set-status')
    .description('Set run status (external API supports completed or external)')
    .requiredOption('--run-id <id>', 'Run ID')
    // parseRunStatus rejects anything but 'completed' | 'external' at parse time.
    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunSetStatusOptions) =>
      executeCommand(
        options,
        async () => {
          const client = await getTrpcClient();
          return client.agentEvalExternal.runSetStatus.mutate({
            runId: options.runId,
            status: options.status,
          });
        },
        `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`,
      ),
    );

  // eval dataset get
  evalCmd
    .command('dataset')
    .description('Manage evaluation datasets')
    .command('get')
    .description('Get dataset information')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: DatasetGetOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
      }),
    );

  // eval run-topics list
  evalCmd
    .command('run-topics')
    .description('Manage run topics')
    .command('list')
    .description('List topics in a run')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--only-external', 'Only return topics pending external evaluation')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicsListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.runTopicsList.query({
          // Normalize the optional flag to an explicit boolean for the API.
          onlyExternal: Boolean(options.onlyExternal),
          runId: options.runId,
        });
      }),
    );

  // eval threads list
  evalCmd
    .command('threads')
    .description('Manage evaluation threads')
    .command('list')
    .description('List threads by topic')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: ThreadsListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
      }),
    );

  // eval messages list
  evalCmd
    .command('messages')
    .description('Manage evaluation messages')
    .command('list')
    .description('List messages by topic and optional thread')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: MessagesListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.messagesList.query({
          threadId: options.threadId,
          topicId: options.topicId,
        });
      }),
    );

  // eval test-cases count
  evalCmd
    .command('test-cases')
    .description('Manage evaluation test cases')
    .command('count')
    .description('Count test cases by dataset')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: TestCasesCountOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
      }),
    );

  // eval run-topic report-result — submits one externally-computed score.
  evalCmd
    .command('run-topic')
    .description('Manage evaluation run-topic reporting')
    .command('report-result')
    .description('Report one evaluation result for a run topic')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID (required for k > 1)')
    .requiredOption('--score <score>', 'Evaluation score', parseScore)
    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicReportResultOptions) =>
      executeCommand(
        options,
        async () => {
          const client = await getTrpcClient();
          return client.agentEvalExternal.runTopicReportResult.mutate({
            correct: options.correct,
            result: options.resultJson,
            runId: options.runId,
            score: options.score,
            threadId: options.threadId,
            topicId: options.topicId,
          });
        },
        `Reported result for topic ${pc.bold(options.topicId)}`,
      ),
    );
}

View File

@@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc';
import { registerFileCommand } from './commands/file';
import { registerGenerateCommand } from './commands/generate';
import { registerKbCommand } from './commands/kb';
import { registerEvalCommand } from './commands/eval';
import { registerLoginCommand } from './commands/login';
import { registerLogoutCommand } from './commands/logout';
import { registerMemoryCommand } from './commands/memory';
@@ -44,5 +45,6 @@ registerModelCommand(program);
registerProviderCommand(program);
registerPluginCommand(program);
registerConfigCommand(program);
registerEvalCommand(program);
program.parse();

View File

@@ -157,13 +157,15 @@
"difficulty.easy": "Easy",
"difficulty.hard": "Hard",
"difficulty.medium": "Medium",
"evalMode.answer-relevance": "LLM Relevance",
"evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)",
"evalMode.contains": "Contains Match",
"evalMode.contains.desc": "Output must contain the expected text",
"evalMode.equals": "Exact Match",
"evalMode.equals.desc": "Output must be exactly the same as expected",
"evalMode.label": "Eval Mode",
"evalMode.llm-rubric": "LLM Judge",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)",
"evalMode.placeholder": "Select eval mode",
"evalMode.prompt.label": "Judge Prompt",
"evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge",
@@ -256,12 +258,16 @@
"run.running.hint": "Evaluation is running, results will appear shortly...",
"run.status.aborted": "Aborted",
"run.status.completed": "Completed",
"run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.",
"run.status.error": "Run Error",
"run.status.external": "External",
"run.status.external.tooltip": "This evaluation is waiting for external scoring. Results will be updated when scoring is complete.",
"run.status.failed": "Failed",
"run.status.idle": "Idle",
"run.status.pending": "Pending",
"run.status.running": "Running",
"run.status.timeout": "Timeout",
"sidebar": "Evaluation",
"sidebar.benchmarks": "Benchmarks",
"sidebar.dashboard": "Dashboard",
"sidebar.datasets": "Datasets",

View File

@@ -161,6 +161,8 @@
"evalMode.contains.desc": "输出中必须包含期望的文本",
"evalMode.equals": "精确匹配",
"evalMode.equals.desc": "输出必须与期望内容完全一致",
"evalMode.external": "外部评估",
"evalMode.external.desc": "智能体完成运行后,由外部系统提交评估结果",
"evalMode.label": "评估模式",
"evalMode.llm-rubric": "LLM 评判",
"evalMode.llm-rubric.desc": "使用 LLM 评估输出质量",
@@ -256,7 +258,10 @@
"run.running.hint": "评测进行中,结果即将呈现...",
"run.status.aborted": "已终止",
"run.status.completed": "已完成",
"run.status.completed.tooltip": "评测已完成运行,所有结果已评估。",
"run.status.error": "运行出错",
"run.status.external": "待外部评测",
"run.status.external.tooltip": "智能体已完成运行,等待外部系统提交评估结果。",
"run.status.failed": "失败",
"run.status.idle": "待开始",
"run.status.pending": "等待中",

View File

@@ -50,6 +50,8 @@ export class AgentEvalDatasetModel {
benchmarkId: agentEvalDatasets.benchmarkId,
createdAt: agentEvalDatasets.createdAt,
description: agentEvalDatasets.description,
evalConfig: agentEvalDatasets.evalConfig,
evalMode: agentEvalDatasets.evalMode,
id: agentEvalDatasets.id,
identifier: agentEvalDatasets.identifier,
metadata: agentEvalDatasets.metadata,

View File

@@ -31,7 +31,7 @@ export class AgentEvalRunModel {
datasetId?: string;
limit?: number;
offset?: number;
status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted';
status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted' | 'external';
}) => {
const conditions = [eq(agentEvalRuns.userId, this.userId)];

View File

@@ -43,6 +43,7 @@ const evalModes = [
'similar',
'levenshtein',
'rubric',
'external',
] as const;
// ============================================
@@ -181,7 +182,7 @@ export const agentEvalRuns = pgTable(
name: text('name'),
status: text('status', {
enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted'],
enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'],
})
.default('idle')
.notNull(),
@@ -228,7 +229,7 @@ export const agentEvalRunTopics = pgTable(
.notNull(),
status: text('status', {
enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout'],
enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout', 'external', 'completed'],
}),
score: real('score'),

View File

@@ -87,12 +87,20 @@ export const evaluate = async (
const candidates: string[] = JSON.parse(expected);
const results: MatchResult[] = [];
for (const c of candidates) {
results.push(await match({ actual: extracted, expected: c, rubric }, matchContext));
results.push(
await match(
{ input: testCase.input, actual: extracted, expected: c, rubric },
matchContext,
),
);
}
const best = results.reduce((a, b) => (a.score >= b.score ? a : b));
result = best;
} else {
result = await match({ actual: extracted, expected, rubric }, matchContext);
result = await match(
{ input: testCase.input, actual: extracted, expected, rubric },
matchContext,
);
}
rubricResults.push({

View File

@@ -0,0 +1,9 @@
import type { MatchResult } from './types';
/**
 * Placeholder matcher for the `external` eval mode: the agent run has
 * finished, but scoring happens out-of-band, so report a non-passing zero
 * score with an "awaiting evaluation" reason until an external system
 * submits the real result.
 */
export const matchExternal = async (): Promise<MatchResult> => ({
  passed: false,
  score: 0,
  reason: 'Waiting for external evaluation...',
});

View File

@@ -4,8 +4,10 @@ import { matchAnyOf } from './anyOf';
import { matchContains } from './contains';
import { matchEndsWith } from './endsWith';
import { matchEquals } from './equals';
import { matchExternal } from './external';
import { matchJsonSchema } from './jsonSchema';
import { matchLevenshtein } from './levenshtein';
import { matchLLMEq } from './llmEq';
import { matchLLMRubric } from './llmRubric';
import { matchNumeric } from './numeric';
import { matchRegex } from './regex';
@@ -18,10 +20,15 @@ export type { GenerateObjectPayload, MatchContext, MatchResult } from './types';
* Run a single rubric matcher against actual vs expected
*/
export const match = async (
params: { actual: string; expected: string | undefined; rubric: EvalBenchmarkRubric },
params: {
input: string;
actual: string;
expected: string | undefined;
rubric: EvalBenchmarkRubric;
},
context?: MatchContext,
): Promise<MatchResult> => {
const { actual, expected, rubric } = params;
const { actual, expected, rubric, input } = params;
const { type, config } = rubric;
switch (type) {
@@ -57,6 +64,10 @@ export const match = async (
return matchLevenshtein(actual, expected, config);
}
case 'answer-relevance': {
return matchLLMEq(input, actual, expected, rubric, context);
}
case 'llm-rubric': {
return matchLLMRubric(actual, expected, rubric, context);
}
@@ -65,6 +76,10 @@ export const match = async (
return matchJsonSchema(actual, config);
}
case 'external': {
return matchExternal();
}
default: {
return {
passed: false,

View File

@@ -0,0 +1,89 @@
import type { EvalBenchmarkRubric, RubricConfigLLM } from '@lobechat/types';
import type { MatchContext, MatchResult } from './types';
// Default binary-grading instructions for the relevance judge
// (BrowseComp / HLE style). This text is sent verbatim to the judge model,
// i.e. it is runtime behavior — do not reword casually.
const DEFAULT_SYSTEM_ROLE = [
  'You are an expert evaluation judge. Your task is to score how well an AI output meets the given criteria.',
  'Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.',
  'Your judgement must be in the format and criteria specified below:',
  "extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.",
  'reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.',
  'Scoring rules:',
  'score: Return 1 only when extracted_final_answer clearly and unambiguously matches [correct_answer], or is within a small margin of error for numerical problems.',
  'score: Return 0 when extracted_final_answer is incorrect, missing, ambiguous, non-equivalent, or when you are uncertain.',
  'Treat uncertainty as incorrect (score = 0).',
  'Respond with a JSON object containing ',
  '"score" (number: 0 or 1)',
  'and "reason" (brief explanation for the judgement).',
].join('\n');

// Structured-output schema the judge model must satisfy: a binary score
// plus a brief reason.
const JUDGE_SCORE_SCHEMA: Record<string, unknown> = {
  additionalProperties: false,
  properties: {
    score: {
      description: 'Binary score for judgement: 1=correct, 0=incorrect/uncertain',
      enum: [0, 1],
      type: 'number',
    },
    reason: { description: 'Brief explanation for the judgement', type: 'string' },
  },
  required: ['score', 'reason'],
  type: 'object',
};

/**
 * Assemble the judge's user prompt from labelled sections. The
 * [correct_answer] section is appended only when a reference answer exists.
 */
function buildJudgeUserPrompt(
  question: string,
  actual: string,
  expected: string | undefined,
): string {
  const sections = [`[question]\n${question}`, `[response]\n${actual}`];
  if (expected) {
    sections.push(`[correct_answer]\n${expected}`);
  }
  return sections.join('\n\n');
}

/**
 * Binary LLM judge for the `answer-relevance` eval mode.
 *
 * Asks a judge model whether `actual` correctly answers `question` against
 * the optional reference `expected`, and maps the structured output to a
 * 0/1 MatchResult. Every judge failure (no client, no model, thrown error)
 * is reported as a non-passing zero score rather than thrown.
 */
export const matchLLMEq = async (
  question: string,
  actual: string,
  expected: string | undefined,
  rubric: EvalBenchmarkRubric,
  context?: MatchContext,
): Promise<MatchResult> => {
  if (!context?.generateObject) {
    return { passed: false, reason: 'LLM judge not available', score: 0 };
  }
  const cfg = rubric.config as RubricConfigLLM;
  // Per-rubric model wins; otherwise fall back to the context-wide judge model.
  const model = cfg.model || context.judgeModel;
  if (!model) {
    return { passed: false, reason: 'No judge model configured', score: 0 };
  }
  try {
    const judgement = await context.generateObject({
      messages: [
        { content: cfg.systemRole || DEFAULT_SYSTEM_ROLE, role: 'system' },
        { content: buildJudgeUserPrompt(question, actual, expected), role: 'user' },
      ],
      model,
      provider: cfg.provider,
      schema: JUDGE_SCORE_SCHEMA,
    });
    // Anything other than an explicit numeric 1 counts as incorrect.
    const isCorrect = judgement?.score === 1;
    return {
      passed: isCorrect,
      reason: judgement?.reason,
      score: isCorrect ? 1 : 0,
    };
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return { passed: false, reason: `LLM judge failed: ${detail}`, score: 0 };
  }
};

View File

@@ -64,6 +64,10 @@ export const matchLLMRubric = async (
schema: JUDGE_SCORE_SCHEMA,
});
if (!result?.score) {
return { passed: false, reason: 'LLM judge did not return a score', score: 0 };
}
const score = Math.max(0, Math.min(1, result.score));
const threshold = rubric.threshold ?? 0.6;

View File

@@ -34,7 +34,7 @@ export interface EvalTestCaseMetadata {
/**
* Evaluation run status
*/
export type EvalRunStatus = 'aborted' | 'completed' | 'failed' | 'pending' | 'running';
export type EvalRunStatus = 'aborted' | 'completed' | 'external' | 'failed' | 'pending' | 'running';
/**
* Evaluation run configuration
@@ -96,6 +96,7 @@ export interface EvalRunMetrics {
cost?: number;
duration?: number;
errorCases?: number;
externalCases?: number;
failedCases: number;
llmCalls?: number;
passAllK?: number;
@@ -183,6 +184,8 @@ export interface EvalRunTopicResult {
completionReason?: string;
operationId?: string;
rubricScores?: EvalRubricScore[];
/** Set when evalMode is 'external' — agent finished, awaiting external scoring */
awaitingExternalEval?: boolean;
}
/*eslint-enable perfectionist/sort-interfaces */
@@ -194,14 +197,16 @@ export interface EvalThreadResult {
cost?: number;
duration?: number;
error?: string;
llmCalls?: number;
operationId?: string;
passed?: boolean;
rubricScores?: EvalRubricScore[];
score?: number;
status?: 'error' | 'failed' | 'passed' | 'running' | 'timeout';
status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout' | 'completed';
steps?: number;
threadId: string;
tokens?: number;
toolCalls?: number;
}
/**

View File

@@ -11,6 +11,7 @@ export type AgentEvalRunStatus =
| 'failed'
| 'idle'
| 'pending'
| 'external'
| 'running';
export interface AgentEvalRunTargetAgent {

View File

@@ -22,6 +22,8 @@ export type RubricType =
// Similarity
| 'similar'
| 'levenshtein'
// External evaluation
| 'external'
// Composite
| 'rubric';

View File

@@ -66,9 +66,18 @@ export const { POST } = serve<FinalizeRunPayload>(
log('Metrics: %O', metrics);
// Step 4: Update run status (failed if all cases errored/timed out)
// Step 4: Update run status
// external: any topic awaits external scoring → whole run waits too
// failed: all cases are non-success (error/timeout)
// completed: everything else
const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
const externalCount = metrics.externalCases || 0;
const runStatus =
externalCount > 0
? 'external'
: nonSuccessCases >= metrics.totalCases
? 'failed'
: 'completed';
await context.run('agent-eval-run:update-run', async () => {
const runModel = new AgentEvalRunModel(db, userId);

View File

@@ -173,9 +173,14 @@ export default {
'evalMode.contains.desc': 'Output must contain the expected text',
'evalMode.equals': 'Exact Match',
'evalMode.equals.desc': 'Output must be exactly the same as expected',
'evalMode.external': 'External Eval',
'evalMode.external.desc': 'Agent runs to completion; scoring is handled by an external system',
'evalMode.label': 'Eval Mode',
'evalMode.llm-rubric': 'LLM Judge',
'evalMode.llm-rubric.desc': 'Use LLM to evaluate output quality',
'evalMode.llm-rubric.desc':
'Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)',
'evalMode.answer-relevance': 'LLM Relevance',
'evalMode.answer-relevance.desc': 'Use LLM to evaluate answer relevance (yes or no)',
'evalMode.placeholder': 'Select eval mode',
'evalMode.prompt.label': 'Judge Prompt',
'evalMode.prompt.placeholder': 'Enter the evaluation criteria or prompt for LLM judge',
@@ -204,6 +209,8 @@ export default {
'run.idle.hint': 'Click Start to begin evaluation',
'run.pending.hint': 'Evaluation is queued, waiting to start...',
'run.running.hint': 'Evaluation is running, results will appear shortly...',
'run.external.hint':
'Running completed. Waiting for external system to submit evaluation results ...',
'run.filter.active': 'Active',
'run.filter.empty': 'No runs match the current filter.',
@@ -249,6 +256,9 @@ export default {
'run.detail.report': 'Evaluation Summary',
'run.detail.config': 'Evaluation Config',
'run.detail.configSnapshot': 'Configuration Snapshot',
'run.detail.copyRunId': 'Copy Run ID',
'run.detail.copyRunIdFailed': 'Failed to copy Run ID',
'run.detail.copyRunIdSuccess': 'Run ID copied',
'run.detail.dataset': 'Dataset',
'run.detail.model': 'Model',
'run.detail.overview': 'Overview',
@@ -279,7 +289,11 @@ export default {
'run.status.aborted': 'Aborted',
'run.status.completed': 'Completed',
'run.status.completed.tooltip': 'The run and external scoring are completed.',
'run.status.error': 'Run Error',
'run.status.external': 'Awaiting Eval',
'run.status.external.tooltip':
'The agent has finished running. Waiting for an external system to submit evaluation results.',
'run.status.failed': 'Failed',
'run.status.idle': 'Idle',
'run.status.pending': 'Pending',

View File

@@ -208,6 +208,7 @@ const DatasetDetail = memo(() => {
}}
>
<TestCaseTable
datasetEvalMode={dataset?.evalMode}
diffFilter={diffFilter}
pagination={pagination}
search={search}

View File

@@ -238,6 +238,7 @@ const DatasetCard = memo<DatasetCardProps>(
) : (
<TestCaseTable
readOnly
datasetEvalMode={dataset.evalMode}
diffFilter={diffFilter}
pagination={pagination}
search={search}

View File

@@ -83,6 +83,7 @@ const styles = createStaticStyles(({ css, cssVar }) => ({
}));
interface TestCaseTableProps {
datasetEvalMode?: string | null;
diffFilter: 'all' | 'easy' | 'medium' | 'hard';
onAddCase?: () => void;
onDelete?: (testCase: any) => void;
@@ -106,6 +107,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
total,
search,
diffFilter,
datasetEvalMode,
pagination,
onSearchChange,
onDiffFilterChange,
@@ -170,10 +172,18 @@ const TestCaseTable = memo<TestCaseTableProps>(
dataIndex: 'evalMode',
key: 'evalMode',
render: (text: string) => {
if (!text) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
const effective = text ?? datasetEvalMode;
if (!effective) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
const isInherited = !text && !!datasetEvalMode;
return (
<span style={{ color: cssVar.colorTextSecondary, fontSize: 12 }}>
{t(`evalMode.${text}` as any)}
<span
style={{
color: isInherited ? cssVar.colorTextQuaternary : cssVar.colorTextSecondary,
fontSize: 12,
fontStyle: isInherited ? 'italic' : 'normal',
}}
>
{t(`evalMode.${effective}` as any)}
</span>
);
},
@@ -238,7 +248,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
}
return base;
}, [pagination, readOnly, onEdit, onDelete, t]);
}, [pagination, readOnly, onEdit, onDelete, t, datasetEvalMode]);
return (
<>

View File

@@ -67,6 +67,8 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
const { t } = useTranslation('eval');
const status: string | null | undefined = record.status;
// return <div>{status}</div>;
if (!status || status === 'pending')
return <Badge status="default" text={<BadgeText>{t('run.status.pending')}</BadgeText>} />;
@@ -86,6 +88,17 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
if (status === 'timeout')
return <Badge color="orange" text={<BadgeText>{t('run.status.timeout')}</BadgeText>} />;
if (status === 'external') {
const badge = <Badge color="purple" text={<BadgeText>{t('run.status.external')}</BadgeText>} />;
return <Tooltip title={t('run.status.external.tooltip')}>{badge}</Tooltip>;
}
if (status === 'completed') {
// 'completed' means the run finished AND evaluation finished; it does not imply the result passed
const badge = <Badge color="blue" text={<BadgeText>{t('run.status.completed')}</BadgeText>} />;
return <Tooltip title={t('run.status.completed.tooltip')}>{badge}</Tooltip>;
}
return <Badge status="default" text={<BadgeText>{status}</BadgeText>} />;
});
@@ -99,15 +112,29 @@ const ThreadDots = memo<{ threads: EvalThreadResult[] }>(({ threads }) => (
if (thread.passed === true) {
color = cssVar.colorSuccess;
} else if (thread.passed === false) {
color = cssVar.colorError;
}
if (thread.status === 'external') {
color = cssVar.colorWarning;
}
if (thread.status === 'completed') {
color = cssVar.colorPrimary;
}
const label = thread.error
? 'error'
: thread.passed === true
? 'passed'
: thread.passed === false
: thread.passed === false && thread.status !== 'completed'
? 'failed'
: 'pending';
: thread.status === 'external'
? 'Awaiting for external evaluation'
: thread.status === 'completed'
? 'completed'
: 'pending';
return (
<Tooltip key={thread.threadId} title={label}>
@@ -406,6 +433,8 @@ const CaseResultsTable = memo<CaseResultsTableProps>(
{ label: t('table.filter.error'), value: 'error' },
{ label: t('table.filter.running'), value: 'running' },
{ label: t('run.status.pending'), value: 'pending' },
{ label: t('run.status.external'), value: 'external' },
{ label: t('run.status.completed'), value: 'completed' },
]}
onChange={setStatusFilter}
/>

View File

@@ -96,7 +96,7 @@ const useStyles = createStyles(({ css, token }) => ({
`,
}));
const PendingState = memo(() => {
const PendingState = memo(({ hint }: { hint?: string }) => {
const { t } = useTranslation('eval');
const { cx, styles } = useStyles();
@@ -119,7 +119,7 @@ const PendingState = memo(() => {
<Icon icon={Clock} size={18} />
</div>
</div>
<div className={styles.hint}>{t('run.pending.hint')}</div>
<div className={styles.hint}>{hint}</div>
</div>
);
});

View File

@@ -2,10 +2,19 @@
import { AGENT_PROFILE_URL } from '@lobechat/const';
import type { AgentEvalRunDetail } from '@lobechat/types';
import { ActionIcon, Avatar, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
import { ActionIcon, Avatar, copyToClipboard, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
import { App, Button, Card, Tag, Typography } from 'antd';
import { createStyles } from 'antd-style';
import { ArrowLeft, ChevronDown, ChevronUp, Pencil, Play, Square, Trash2 } from 'lucide-react';
import {
ArrowLeft,
ChevronDown,
ChevronUp,
Copy,
Pencil,
Play,
Square,
Trash2,
} from 'lucide-react';
import { memo, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { Link, useNavigate } from 'react-router-dom';
@@ -170,6 +179,14 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank');
}
};
const handleCopyRunId = async () => {
try {
await copyToClipboard(run.id);
message.success(t('run.detail.copyRunIdSuccess'));
} catch {
message.error(t('run.detail.copyRunIdFailed'));
}
};
const formatDate = (date?: Date | string) => {
if (!date) return '';
@@ -194,6 +211,12 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
<Typography.Title level={4} style={{ margin: 0 }}>
{run.name || run.id.slice(0, 8)}
</Typography.Title>
<ActionIcon
icon={Copy}
size="small"
title={t('run.detail.copyRunId')}
onClick={handleCopyRunId}
/>
<StatusBadge status={run.status} />
</Flexbox>
{/* Meta info row */}

View File

@@ -104,7 +104,9 @@ const RunDetail = memo(() => {
{runDetail.status === 'running' ? (
<RunningState />
) : runDetail.status === 'pending' ? (
<PendingState />
<PendingState hint={t('run.pending.hint')} />
) : runDetail.status === 'external' ? (
<PendingState hint={t('run.external.hint')} />
) : (
<IdleState run={runDetail} />
)}

View File

@@ -36,6 +36,26 @@ export interface DatasetPreset {
}
export const DATASET_PRESETS: Record<string, DatasetPreset> = {
'browsecomp': {
id: 'browsecomp',
category: 'research',
name: 'BrowseComp',
description: 'Measuring the ability for agents to browse the web, comprises 1,266 questions.',
icon: Globe,
formatDescription: 'format: Topic (category/tags), Question (input), Answer (expected)',
requiredFields: ['question', 'answer', 'problem_topic', 'canary'],
optionalFields: [],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['problem_topic'],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
// === Deep Research / QA Category ===
'browsecomp-zh': {
id: 'browsecomp-zh',
@@ -58,6 +78,129 @@ export const DATASET_PRESETS: Record<string, DatasetPreset> = {
},
},
'widesearch': {
id: 'widesearch',
category: 'research',
name: 'WideSearch',
description:
'Evaluating the capabilities of agents in broad information-seeking tasks, consisting of 200 questions.',
icon: Globe,
formatDescription: 'format: instance_id, query (input), evaluation (expected), language',
requiredFields: ['instance_id', 'query', 'evaluation', 'language'],
optionalFields: [],
fieldInference: {
input: ['query'],
expected: ['evaluation'],
choices: [],
category: ['language'],
sortOrder: [],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
'hle-text': {
id: 'hle-text',
category: 'research',
name: "Humanity's Last Exam, HLE (Text Only)",
description:
"Humanity's Last Exam (HLE) is a multi-modal benchmark at the frontier of human knowledge, consisting of 2150 questions.",
icon: Globe,
formatDescription:
'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category',
requiredFields: [
'id',
'question',
'answer',
'answer_type',
'rationale',
'raw_subject',
'category',
],
optionalFields: ['canary'],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['category'],
},
},
'hle-verified': {
id: 'hle-verified',
category: 'research',
name: "Humanity's Last Exam, HLE (Verified Answers)",
description:
"A subset of Humanity's Last Exam (HLE) with verified answers, designed to evaluate the ability to produce correct answers rather than just plausible ones.",
icon: Globe,
formatDescription:
'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category, Verified_Classes',
requiredFields: [
'id',
'question',
'answer',
'answer_type',
'rationale',
'raw_subject',
'category',
'Verified_Classes',
],
optionalFields: ['canary'],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['category'],
},
},
'deepsearchqa': {
id: 'deepsearchqa',
category: 'research',
name: 'DeepSearchQA',
description:
'A 900-prompt factuality benchmark from Google DeepMind, designed to evaluate agents on difficult multi-step information-seeking tasks across 17 different fields.',
icon: Globe,
formatDescription: 'problem, problem_category, answer, answer_type',
requiredFields: ['problem', 'answer', 'problem_category', 'answer_type'],
optionalFields: [],
fieldInference: {
input: ['problem'],
expected: ['answer'],
choices: [],
category: ['problem_category'],
sortOrder: [],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
'sealqa': {
id: 'sealqa',
category: 'research',
name: 'SealQA',
description:
'SealQA is a new challenge benchmark for evaluating SEarch- Augmented Language models on fact-seeking questions where web search yields conflicting, noisy, or unhelpful results.',
icon: Globe,
formatDescription: 'format: question (input), answer (expected), topic (category)',
requiredFields: ['question', 'answer', 'topic', 'canary'],
optionalFields: [],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['topic'],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
'xbench': {
id: 'xbench',
category: 'research',

View File

@@ -157,6 +157,7 @@ const DatasetCreateModal = memo<DatasetCreateModalProps>(
{ label: t('evalMode.equals'), value: 'equals' },
{ label: t('evalMode.contains'), value: 'contains' },
{ label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
{ label: t('evalMode.external'), value: 'external' },
]}
/>
</Form.Item>

View File

@@ -131,14 +131,30 @@ const DatasetEditModal = memo<DatasetEditModalProps>(({ open, onCancel, dataset,
{ label: t('evalMode.equals'), value: 'equals' },
{ label: t('evalMode.contains'), value: 'contains' },
{ label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
{ label: t('evalMode.answer-relevance'), value: 'answer-relevance' },
{ label: t('evalMode.external'), value: 'external' },
]}
/>
</Form.Item>
{evalModeValue === 'llm-rubric' && (
<Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
<TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
</Form.Item>
{(evalModeValue === 'llm-rubric' || evalModeValue === 'answer-relevance') && (
<>
<Form.Item initialValue="aihubmix" label={'Provider'} name={['evalConfig', 'provider']}>
<TextArea placeholder={'LLM provider (e.g. openai, azure)'} rows={1} />
</Form.Item>
<Form.Item initialValue="gpt-5-nano" label={'Model'} name={['evalConfig', 'model']}>
<TextArea placeholder={'LLM model to use for evaluation (e.g. gpt-4)'} rows={1} />
</Form.Item>
<Form.Item label={'System Prompt'} name={['evalConfig', 'systemRole']}>
<TextArea placeholder={'Optional system prompt for the LLM judge'} rows={3} />
</Form.Item>
<Form.Item label={'Eval Prompt'} name={['evalConfig', 'criteria']}>
<TextArea placeholder={'Prompt template for the LLM judge'} rows={3} />
</Form.Item>
<Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
<TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
</Form.Item>
</>
)}
<Form.Item label={t('dataset.create.preset.label')} style={{ marginBottom: 0 }}>

View File

@@ -92,6 +92,14 @@ const autoInferMapping = (
? new Set(preset.fieldInference.sortOrder.map((s) => s.toLowerCase()))
: SORT_ORDER_CANDIDATES;
const requiredCandidates = new Set<string>(
preset ? preset.requiredFields.map((s) => s.toLowerCase()) : [],
);
const optionalCandidates = new Set<string>(
preset ? preset.optionalFields.map((s) => s.toLowerCase()) : [],
);
for (const h of headers) {
const lower = h.toLowerCase().trim();
if (!inputFound && inputCandidates.has(lower)) {
@@ -109,6 +117,10 @@ const autoInferMapping = (
} else if (!sortOrderFound && sortOrderCandidates.has(lower)) {
result[h] = 'sortOrder';
sortOrderFound = true;
} else if (requiredCandidates.has(lower) || optionalCandidates.has(lower)) {
// If the field was claimed by the config but not matched by any candidate,
// assign it to metadata to ensure it's not missed
result[h] = 'metadata';
} else {
result[h] = 'ignore';
}

View File

@@ -2,13 +2,14 @@
import { Icon } from '@lobehub/ui';
import { createStaticStyles } from 'antd-style';
import { Activity, CheckCircle2, Clock, Pause, XCircle } from 'lucide-react';
import { Activity, CheckCircle2, Clock, Hourglass, Pause, XCircle } from 'lucide-react';
import { memo } from 'react';
import { useTranslation } from 'react-i18next';
const statusConfig: Record<string, { cls: string; icon: any }> = {
aborted: { cls: 'default', icon: Pause },
completed: { cls: 'success', icon: CheckCircle2 },
external: { cls: 'warning', icon: Hourglass },
failed: { cls: 'error', icon: XCircle },
idle: { cls: 'default', icon: Clock },
pending: { cls: 'warning', icon: Clock },

View File

@@ -33,6 +33,7 @@ const rubricTypeSchema = z.enum([
'similar',
'levenshtein',
'rubric',
'external',
]);
const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passthrough();
@@ -621,7 +622,9 @@ export const agentEvalRouter = router({
z.object({
benchmarkId: z.string().optional(),
datasetId: z.string().optional(),
status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']).optional(),
status: z
.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'])
.optional(),
limit: z.number().min(1).max(100).default(50).optional(),
offset: z.number().min(0).default(0).optional(),
}),
@@ -871,7 +874,15 @@ export const agentEvalRouter = router({
.input(
z.object({
id: z.string(),
status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']),
status: z.enum([
'idle',
'pending',
'running',
'completed',
'failed',
'aborted',
'external',
]),
}),
)
.mutation(async ({ input, ctx }) => {

View File

@@ -0,0 +1,514 @@
import type { EvalRunTopicResult, EvalThreadResult } from '@lobechat/types';
import { TRPCError } from '@trpc/server';
import { and, asc, eq, isNull } from 'drizzle-orm';
import { z } from 'zod';
import {
AgentEvalDatasetModel,
AgentEvalRunModel,
AgentEvalRunTopicModel,
AgentEvalTestCaseModel,
} from '@/database/models/agentEval';
import { ThreadModel } from '@/database/models/thread';
import { messages } from '@/database/schemas';
import { authedProcedure, router } from '@/libs/trpc/lambda';
import { serverDatabase } from '@/libs/trpc/lambda/middleware';
import { AgentEvalRunService } from '@/server/services/agentEvalRun';
// Full lifecycle states an eval run can be in. 'external' marks a run whose
// agent execution has finished and which is awaiting scores from an external
// evaluation system.
const runStatusSchema = z.enum([
  'idle',
  'pending',
  'running',
  'completed',
  'failed',
  'aborted',
  'external',
]);
// One external scoring result for a single run topic. `threadId` is only
// meaningful (and required by applyReportResult) when the run was executed
// with k > 1 attempts per topic.
const reportResultItemSchema = z.object({
  correct: z.boolean(),
  result: z.record(z.unknown()).optional(),
  score: z.number(),
  threadId: z.string().optional(),
  topicId: z.string(),
});
/**
 * Serialize an optional Date to an ISO-8601 string.
 * Nullish input (null/undefined) passes through as undefined.
 */
const toIsoString = (value?: Date | null) => {
  if (!value) return undefined;
  return value.toISOString();
};
// Base procedure for every external-eval endpoint: requires an authenticated
// user, attaches a server DB connection, and injects the per-user models and
// services the routes below operate on.
const agentEvalExternalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
  const { ctx } = opts;
  return opts.next({
    ctx: {
      datasetModel: new AgentEvalDatasetModel(ctx.serverDB, ctx.userId),
      runModel: new AgentEvalRunModel(ctx.serverDB, ctx.userId),
      runService: new AgentEvalRunService(ctx.serverDB, ctx.userId),
      runTopicModel: new AgentEvalRunTopicModel(ctx.serverDB, ctx.userId),
      testCaseModel: new AgentEvalTestCaseModel(ctx.serverDB, ctx.userId),
      threadModel: new ThreadModel(ctx.serverDB, ctx.userId),
    },
  });
});
// Payload accepted by applyReportResult: one reported item plus the run it belongs to.
type ReportResultInput = z.infer<typeof reportResultItemSchema> & { runId: string };

/**
 * Re-derive a run's aggregate metrics and status from its current topics.
 *
 * Reads the run and all of its run-topics fresh from the DB, recomputes
 * metrics via runService.evaluateAndFinalizeRun, then decides the run status:
 * - 'external' if any topic is still awaiting external evaluation
 *   (status === 'external' or evalResult.awaitingExternalEval === true);
 * - 'failed' if every case ended in error or timeout;
 * - 'completed' otherwise.
 *
 * Returns the new status, or undefined when the run no longer exists.
 */
const recomputeRunAggregation = async (
  ctx: {
    runModel: AgentEvalRunModel;
    runService: AgentEvalRunService;
    runTopicModel: AgentEvalRunTopicModel;
  },
  runId: string,
) => {
  const refreshedRun = await ctx.runModel.findById(runId);
  if (!refreshedRun) return undefined;

  const refreshedTopics = await ctx.runTopicModel.findByRunId(runId);

  const metrics = await ctx.runService.evaluateAndFinalizeRun({
    run: {
      config: refreshedRun.config,
      id: refreshedRun.id,
      metrics: refreshedRun.metrics,
      startedAt: refreshedRun.startedAt,
    },
    runTopics: refreshedTopics,
  });

  // A topic may flag pending external scoring either via its status column or
  // via the awaitingExternalEval marker inside its stored evalResult.
  const hasAwaitingExternal = refreshedTopics.some(
    (topic) =>
      topic.status === 'external' ||
      (topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true,
  );

  const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
  const status = hasAwaitingExternal
    ? 'external'
    : nonSuccessCases >= metrics.totalCases
      ? 'failed'
      : 'completed';

  await ctx.runModel.update(runId, { metrics, status });
  return status;
};
/**
 * Persist one external scoring result for a run topic.
 *
 * Two modes, chosen by the run's configured k (attempts per topic):
 * - k > 1: the score applies to a single thread (threadId required). The
 *   topic stays 'external' until every thread has a completed score, at which
 *   point the topic is finalized with the average score, pass@k (any thread
 *   passed) and pass^k (all threads passed).
 * - k === 1: the score finalizes the topic directly.
 *
 * Repeated identical reports are detected and marked `idempotent` instead of
 * re-applied. When `recomputeRun` is true the run-level aggregation is
 * refreshed afterwards via recomputeRunAggregation (batch callers pass false
 * and recompute once at the end).
 *
 * @throws TRPCError NOT_FOUND when the run, topic, or thread does not exist;
 *         BAD_REQUEST when k > 1 and threadId is missing, or the topic has no threads.
 */
const applyReportResult = async (
  ctx: {
    runModel: AgentEvalRunModel;
    runTopicModel: AgentEvalRunTopicModel;
    runService: AgentEvalRunService;
    threadModel: ThreadModel;
  },
  input: ReportResultInput,
  recomputeRun: boolean,
) => {
  const run = await ctx.runModel.findById(input.runId);
  if (!run) {
    throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
  }
  const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
  const runTopic = runTopics.find((item) => item.topicId === input.topicId);
  if (!runTopic) {
    throw new TRPCError({ code: 'NOT_FOUND', message: 'Run topic not found' });
  }

  const runK = run.config?.k ?? 1;
  // External scores are stored as a single synthetic rubric entry.
  const rubricScores = [{ rubricId: 'external', score: input.score }];
  const existingEvalResult = (runTopic.evalResult ?? {}) as EvalRunTopicResult &
    Record<string, unknown>;
  const externalResult = input.result ?? {};

  let idempotent = false;
  let reportedThreads: number;
  let totalThreads: number;
  let topicFinalized: boolean;

  if (runK > 1) {
    if (!input.threadId) {
      throw new TRPCError({
        code: 'BAD_REQUEST',
        message: 'threadId is required when k > 1',
      });
    }

    // Prefer eval-type threads; fall back to all threads if none are tagged 'eval'.
    const allThreads = await ctx.threadModel.queryByTopicId(input.topicId);
    const evalThreads = allThreads.filter((thread) => thread.type === 'eval');
    const sourceThreads = evalThreads.length > 0 ? evalThreads : allThreads;
    if (sourceThreads.length === 0) {
      throw new TRPCError({
        code: 'BAD_REQUEST',
        message: 'No threads found for this topic',
      });
    }

    // Start from previously stored thread results, or seed every known thread
    // as 'external' (awaiting a score) on the first report.
    const threads: EvalThreadResult[] =
      (existingEvalResult.threads as EvalThreadResult[] | undefined)?.map((thread) => ({
        ...thread,
      })) ??
      sourceThreads.map((thread) => ({
        status: 'external',
        threadId: thread.id,
      }));

    let targetIndex = threads.findIndex((thread) => thread.threadId === input.threadId);
    if (targetIndex < 0) {
      // Thread not in the stored list yet — accept it only if it truly belongs
      // to this topic, then append it.
      const existsInTopic = sourceThreads.some((thread) => thread.id === input.threadId);
      if (!existsInTopic) {
        throw new TRPCError({
          code: 'NOT_FOUND',
          message: 'Thread not found for this topic',
        });
      }
      threads.push({ status: 'external', threadId: input.threadId });
      targetIndex = threads.length - 1;
    }

    totalThreads = threads.length;
    const targetThread = threads[targetIndex];
    // Idempotency: an identical (status/score/passed) report is a no-op.
    const alreadyReported =
      targetThread.status === 'completed' &&
      targetThread.score === input.score &&
      targetThread.passed === input.correct;

    if (alreadyReported) {
      idempotent = true;
    } else {
      threads[targetIndex] = {
        ...targetThread,
        passed: input.correct,
        rubricScores,
        score: input.score,
        status: 'completed',
      };

      const existingThreadResults = (existingEvalResult.externalThreadResults ?? {}) as Record<
        string,
        unknown
      >;
      const nextEvalResult = {
        ...existingEvalResult,
        awaitingExternalEval: true,
        externalThreadResults: {
          ...existingThreadResults,
          [input.threadId]: externalResult,
        },
        threads,
      } satisfies EvalRunTopicResult & Record<string, unknown>;

      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
        evalResult: nextEvalResult,
        status: 'external',
      });
    }

    reportedThreads = threads.filter(
      (thread) => thread.status === 'completed' && typeof thread.score === 'number',
    ).length;
    topicFinalized = reportedThreads >= totalThreads;

    if (topicFinalized) {
      // All threads scored: finalize the topic with average score and
      // pass@k / pass^k aggregates.
      // NOTE(review): this branch also runs on idempotent replays, re-writing
      // externalThreadResults[threadId] with the replayed payload — confirm intended.
      const finalThreads = threads;
      const totalScore = finalThreads.reduce((acc, thread) => acc + (thread.score ?? 0), 0);
      const avgScore = totalScore / finalThreads.length;
      const passAtK = finalThreads.some((thread) => thread.passed === true);
      const passAllK = finalThreads.every((thread) => thread.passed === true);

      const existingThreadResults = (existingEvalResult.externalThreadResults ?? {}) as Record<
        string,
        unknown
      >;
      const nextEvalResult = {
        ...existingEvalResult,
        awaitingExternalEval: false,
        externalThreadResults: {
          ...existingThreadResults,
          [input.threadId]: externalResult,
        },
        passAllK,
        passAtK,
        rubricScores: [{ rubricId: 'external', score: avgScore }],
        threads: finalThreads,
      } satisfies EvalRunTopicResult & Record<string, unknown>;

      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
        evalResult: nextEvalResult,
        passed: passAtK,
        score: avgScore,
        status: passAtK ? 'passed' : 'failed',
      });
    }
  } else {
    // k === 1: a single report finalizes the topic immediately.
    const alreadyReported =
      runTopic.status === (input.correct ? 'passed' : 'failed') &&
      runTopic.score === input.score &&
      runTopic.passed === input.correct;

    if (alreadyReported) {
      idempotent = true;
    } else {
      const nextEvalResult = {
        ...existingEvalResult,
        awaitingExternalEval: false,
        externalResult,
        rubricScores,
      } satisfies EvalRunTopicResult & Record<string, unknown>;

      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
        evalResult: nextEvalResult,
        passed: input.correct,
        score: input.score,
        status: input.correct ? 'passed' : 'failed',
      });
    }

    reportedThreads = 1;
    totalThreads = 1;
    topicFinalized = true;
  }

  let runStatus: string | undefined;
  if (recomputeRun) {
    runStatus = await recomputeRunAggregation(ctx, input.runId);
  }

  return {
    idempotent,
    reportedThreads,
    runId: input.runId,
    runStatus,
    success: true,
    threadId: input.threadId,
    topicFinalized,
    topicId: input.topicId,
    totalThreads,
  };
};
/**
 * tRPC router exposing the external-scoring API: read access to a run's
 * dataset, topics, threads and messages, plus mutations for an external
 * system to report per-topic scores and finalize the run status.
 */
export const agentEvalExternalRouter = router({
  // Fetch dataset identity + metadata for a given dataset id.
  datasetGet: agentEvalExternalProcedure
    .input(z.object({ datasetId: z.string() }))
    .query(async ({ ctx, input }) => {
      const dataset = await ctx.datasetModel.findById(input.datasetId);
      if (!dataset) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Dataset not found' });
      }
      const metadata = (dataset.metadata ?? {}) as Record<string, unknown>;
      return {
        benchmarkId: dataset.benchmarkId,
        id: dataset.id,
        identifier: dataset.identifier,
        metadata,
        name: dataset.name,
      };
    }),

  // List a topic's messages (optionally scoped to one thread) in creation
  // order; only the caller's own top-level messages (no messageGroupId).
  messagesList: agentEvalExternalProcedure
    .input(z.object({ threadId: z.string().optional(), topicId: z.string() }))
    .query(async ({ ctx, input }) => {
      const conditions = [
        eq(messages.userId, ctx.userId),
        eq(messages.topicId, input.topicId),
        isNull(messages.messageGroupId),
      ];
      if (input.threadId) conditions.push(eq(messages.threadId, input.threadId));
      const rows = await ctx.serverDB
        .select({
          content: messages.content,
          createdAt: messages.createdAt,
          id: messages.id,
          role: messages.role,
          threadId: messages.threadId,
          topicId: messages.topicId,
        })
        .from(messages)
        .where(and(...conditions))
        .orderBy(asc(messages.createdAt));
      return rows.map((row) => ({
        content: row.content,
        createdAt: toIsoString(row.createdAt),
        id: row.id,
        role: row.role,
        threadId: row.threadId,
        topicId: row.topicId,
      }));
    }),

  // Report a single score and recompute run aggregation immediately.
  // (Same input/behavior as runTopicReportResult below.)
  reportResult: agentEvalExternalProcedure
    .input(
      z.object({
        correct: z.boolean(),
        result: z.record(z.unknown()).optional(),
        runId: z.string(),
        score: z.number(),
        threadId: z.string().optional(),
        topicId: z.string(),
      }),
    )
    .mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),

  // Report many scores for one run; aggregation is recomputed once at the end
  // (each item is applied with recomputeRun = false).
  reportResultsBatch: agentEvalExternalProcedure
    .input(z.object({ items: z.array(reportResultItemSchema).min(1), runId: z.string() }))
    .mutation(async ({ ctx, input }) => {
      const receipts = [];
      for (const item of input.items) {
        receipts.push(await applyReportResult(ctx, { ...item, runId: input.runId }, false));
      }
      const runStatus = await recomputeRunAggregation(ctx, input.runId);
      return {
        items: receipts,
        runId: input.runId,
        runStatus,
        success: true,
      };
    }),

  // Fetch a run's core fields; config.k defaults to 1 when unset.
  runGet: agentEvalExternalProcedure
    .input(z.object({ runId: z.string() }))
    .query(async ({ ctx, input }) => {
      const run = await ctx.runModel.findById(input.runId);
      if (!run) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
      }
      const config = { ...run.config, k: run.config?.k ?? 1 };
      return {
        config,
        createdAt: run.createdAt,
        datasetId: run.datasetId,
        id: run.id,
        metrics: run.metrics ?? undefined,
        name: run.name,
        startedAt: run.startedAt,
        status: run.status,
        targetAgentId: run.targetAgentId,
      };
    }),

  // Transition a run between 'external' and 'completed'. Completing requires
  // that no topic is still awaiting external evaluation, and re-finalizes
  // metrics before updating.
  runSetStatus: agentEvalExternalProcedure
    .input(z.object({ runId: z.string(), status: runStatusSchema }))
    .mutation(async ({ ctx, input }) => {
      const run = await ctx.runModel.findById(input.runId);
      if (!run) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
      }
      // Only the two external-eval states may be set through this endpoint.
      if (input.status !== 'completed' && input.status !== 'external') {
        throw new TRPCError({
          code: 'BAD_REQUEST',
          message: 'External endpoint only supports setting status to completed or external',
        });
      }
      // And only runs already in (or past) external scoring may be touched.
      if (run.status !== 'external' && run.status !== 'completed') {
        throw new TRPCError({
          code: 'BAD_REQUEST',
          message: `Only external runs can be finalized via this endpoint. current=${run.status}`,
        });
      }
      if (input.status === 'completed') {
        const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
        const hasAwaitingExternal = runTopics.some(
          (topic) =>
            topic.status === 'external' ||
            (topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true,
        );
        if (hasAwaitingExternal) {
          throw new TRPCError({
            code: 'BAD_REQUEST',
            message: 'Cannot set run to completed while external evaluation is pending',
          });
        }
        const metrics = await ctx.runService.evaluateAndFinalizeRun({
          run: { config: run.config, id: run.id, metrics: run.metrics, startedAt: run.startedAt },
          runTopics,
        });
        const updated = await ctx.runModel.update(input.runId, { metrics, status: 'completed' });
        return {
          metrics,
          runId: input.runId,
          status: updated?.status ?? 'completed',
          success: true,
        };
      }
      // input.status === 'external': simple status write, no metric refresh.
      const updated = await ctx.runModel.update(input.runId, { status: 'external' });
      return {
        runId: input.runId,
        status: updated?.status ?? 'external',
        success: true,
      };
    }),

  // Alias of reportResult: report one score and recompute aggregation.
  runTopicReportResult: agentEvalExternalProcedure
    .input(
      z.object({
        correct: z.boolean(),
        result: z.record(z.unknown()).optional(),
        runId: z.string(),
        score: z.number(),
        threadId: z.string().optional(),
        topicId: z.string(),
      }),
    )
    .mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),

  // List a run's topics with eval state; onlyExternal filters to topics still
  // awaiting external scoring.
  runTopicsList: agentEvalExternalProcedure
    .input(z.object({ onlyExternal: z.boolean().default(false).optional(), runId: z.string() }))
    .query(async ({ ctx, input }) => {
      const run = await ctx.runModel.findById(input.runId);
      if (!run) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
      }
      const allRunTopics = await ctx.runTopicModel.findByRunId(input.runId);
      const runTopics = input.onlyExternal
        ? allRunTopics.filter((topic) => topic.status === 'external')
        : allRunTopics;
      return runTopics.map((topic) => {
        const testCase = topic.testCase;
        return {
          createdAt: topic.createdAt,
          evalResult: topic.evalResult,
          passed: topic.passed,
          runId: topic.runId,
          score: topic.score,
          status: topic.status,
          testCase,
          testCaseId: topic.testCaseId,
          topic: topic.topic,
          topicId: topic.topicId,
        };
      });
    }),

  // Count the test cases in a dataset.
  testCasesCount: agentEvalExternalProcedure
    .input(z.object({ datasetId: z.string() }))
    .query(async ({ ctx, input }) => {
      const count = await ctx.testCaseModel.countByDatasetId(input.datasetId);
      return { count };
    }),

  // List a topic's threads (id/topicId/type only).
  threadsList: agentEvalExternalProcedure
    .input(z.object({ topicId: z.string() }))
    .query(async ({ ctx, input }) => {
      const threads = await ctx.threadModel.queryByTopicId(input.topicId);
      return threads.map((thread) => ({
        id: thread.id,
        topicId: thread.topicId,
        type: thread.type,
      }));
    }),
});

View File

@@ -12,6 +12,7 @@ import { agentRouter } from './agent';
import { agentBotProviderRouter } from './agentBotProvider';
import { agentCronJobRouter } from './agentCronJob';
import { agentEvalRouter } from './agentEval';
import { agentEvalExternalRouter } from './agentEvalExternal';
import { agentGroupRouter } from './agentGroup';
import { agentSkillsRouter } from './agentSkills';
import { aiAgentRouter } from './aiAgent';
@@ -57,6 +58,7 @@ export const lambdaRouter = router({
agentBotProvider: agentBotProviderRouter,
agentCronJob: agentCronJobRouter,
agentEval: agentEvalRouter,
agentEvalExternal: agentEvalExternalRouter,
agentSkills: agentSkillsRouter,
aiAgent: aiAgentRouter,
aiChat: aiChatRouter,

View File

@@ -512,6 +512,7 @@ export class AgentEvalRunService {
const passedCases = allTopics.filter((t) => t.status === 'passed').length;
const failedCases = allTopics.filter((t) => t.status === 'failed').length;
const errorCases = allTopics.filter((t) => t.status === 'error').length;
const externalCasesRT = allTopics.filter((t) => t.status === 'external').length;
const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
let sumCost = 0;
@@ -556,6 +557,7 @@ export class AgentEvalRunService {
completedCases: completedCount,
cost: sumCost ? roundCost(sumCost) : undefined,
errorCases,
externalCases: externalCasesRT || undefined,
failedCases,
llmCalls: sumLlmCalls || undefined,
passedCases,
@@ -667,6 +669,17 @@ export class AgentEvalRunService {
const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
// ── External eval mode: agent finished, hand off to external scorer ──
if (evalMode === 'external') {
return {
...baseMeta,
awaitingExternalEval: true,
passed: undefined,
score: undefined,
status: 'external',
};
}
let effectiveRubrics: EvalBenchmarkRubric[];
if (evalMode) {
effectiveRubrics = [
@@ -722,6 +735,7 @@ export class AgentEvalRunService {
passed?: boolean;
rubricScores?: Array<{ reason?: string; rubricId: string; score: number }>;
score?: number;
status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout';
steps?: number;
threadId: string;
tokens?: number;
@@ -737,6 +751,14 @@ export class AgentEvalRunService {
passed: meta.passed as boolean | undefined,
rubricScores: meta.rubricScores as any,
score: meta.score as number | undefined,
status: meta.status as
| 'error'
| 'external'
| 'failed'
| 'passed'
| 'running'
| 'timeout'
| undefined,
steps: meta.steps as number | undefined,
threadId: t.id,
tokens: meta.tokens as number | undefined,
@@ -744,6 +766,20 @@ export class AgentEvalRunService {
};
});
// ── External eval mode: if all threads await external scoring, propagate that status ──
const allExternal = threadResults.every((t) => t.status === 'external');
if (allExternal) {
await this.runTopicModel.updateByRunAndTopic(runId, topicId, {
evalResult: {
awaitingExternalEval: true,
completionReason: 'external',
threads: threadResults,
} satisfies EvalRunTopicResult,
status: 'external',
});
return;
}
// pass@k: at least one thread passed
const anyPassed = threadResults.some((t) => t.passed === true);
// pass^k: all threads passed
@@ -888,7 +924,7 @@ export class AgentEvalRunService {
if (runTopic) {
// Skip if topic is already in a terminal state (e.g. timeout marked by checkAndHandleRunTimeout).
// The interrupted agent still fires the completion webhook, but we must not overwrite the result.
const terminalStates = ['passed', 'failed', 'error', 'timeout'];
const terminalStates = ['passed', 'failed', 'error', 'timeout', 'external'];
if (runTopic.status && terminalStates.includes(runTopic.status)) {
// Fall through to progress tracking below without modifying this topic
} else {
@@ -945,11 +981,15 @@ export class AgentEvalRunService {
// Aggregate real-time metrics from all RunTopics
const allTopics = await this.runTopicModel.findByRunId(runId);
const completedCount = allTopics.filter(
(t) => (t.evalResult && 'completionReason' in t.evalResult) || t.status === 'timeout',
(t) =>
(t.evalResult && 'completionReason' in t.evalResult) ||
t.status === 'timeout' ||
t.status === 'external',
).length;
const passedCases = allTopics.filter((t) => t.status === 'passed').length;
const failedCases = allTopics.filter((t) => t.status === 'failed').length;
const errorCases = allTopics.filter((t) => t.status === 'error').length;
const externalCasesTraj = allTopics.filter((t) => t.status === 'external').length;
const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
let sumCost = 0;
@@ -995,6 +1035,7 @@ export class AgentEvalRunService {
completedCases: completedCount,
cost: sumCost ? roundCost(sumCost) : undefined,
errorCases,
externalCases: externalCasesTraj || undefined,
failedCases,
llmCalls: sumLlmCalls || undefined,
passedCases,
@@ -1048,6 +1089,7 @@ export class AgentEvalRunService {
let passedCases = 0;
let failedCases = 0;
let errorCases = 0;
let externalCases = 0;
let timeoutCases = 0;
let totalScore = 0;
// Sum of per-case averages (for per-case display)
@@ -1088,19 +1130,27 @@ export class AgentEvalRunService {
failedCases++;
} else if (runTopic.status === 'error') {
errorCases++;
} else if (runTopic.status === 'external') {
externalCases++;
} else if (runTopic.status === 'timeout') {
timeoutCases++;
}
// Only accumulate scores for evaluated (non-error, non-timeout) cases
if (runTopic.status !== 'error' && runTopic.status !== 'timeout' && runTopic.score != null) {
totalScore += runTopic.score;
}
// Accumulate per-rubric scores from existing evalResult (exclude error/timeout cases)
// Only accumulate scores for evaluated (non-error, non-timeout, non-external) cases
if (
runTopic.status !== 'error' &&
runTopic.status !== 'timeout' &&
runTopic.status !== 'external' &&
runTopic.score != null
) {
totalScore += runTopic.score;
}
// Accumulate per-rubric scores from existing evalResult (exclude error/timeout/external cases)
if (
runTopic.status !== 'error' &&
runTopic.status !== 'timeout' &&
runTopic.status !== 'external' &&
existingResult?.rubricScores
) {
for (const rs of existingResult.rubricScores) {
@@ -1138,6 +1188,7 @@ export class AgentEvalRunService {
cost: sumCost ? roundCost(sumCost) : undefined,
duration: wallClockDuration || undefined,
errorCases,
externalCases: externalCases || undefined,
failedCases,
llmCalls: sumLlmCalls || undefined,
passRate: totalCases > 0 ? passedCases / totalCases : 0,
@@ -1216,6 +1267,15 @@ export class AgentEvalRunService {
const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
// ── External eval mode: agent finished, hand off to external scorer ──
if (evalMode === 'external') {
await this.runTopicModel.updateByRunAndTopic(runTopic.runId, runTopic.topicId, {
evalResult: { ...existingResult, awaitingExternalEval: true },
status: 'external',
});
return;
}
let effectiveRubrics: EvalBenchmarkRubric[];
if (evalMode) {
effectiveRubrics = [
@@ -1324,7 +1384,13 @@ export class AgentEvalRunService {
});
const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
const externalCount = metrics.externalCases || 0;
const runStatus =
externalCount > 0
? 'external'
: nonSuccessCases >= metrics.totalCases
? 'failed'
: 'completed';
await this.runModel.update(run.id, { metrics, status: runStatus });
} else {