feat(eval): add external scoring mode (#12729)

* wip: add llm relevant & BrowseComp

* wip: add widesearch desc

* wip: dsqa, hle, widesearch

* wip: add dsqa

* wip: add awaiting eval status for runs

* wip: add awaiting status for run

* wip: adjust hle-verified

* 🐛 fix: browsecomp topics

* 📝 docs: add annotations

* wip: add awaiting status for pass@k

* wip: add complete status

* wip: update thread dots

* wip: update run status page

* wip: remove useless impl

* wip: update prompt

*  feat: add external eval routes

* wip: add eval cli

* 🐛 fix: support authorization in non-browser environments

* wip: pass tests

* ♻️ refactor: remove tests

* ♻️ refactor: mo camel case
This commit is contained in:
Rylan Cai
2026-03-10 09:53:26 +08:00
committed by GitHub
parent 255a1c21a8
commit ea329113be
34 changed files with 1655 additions and 40 deletions

View File

@@ -0,0 +1,285 @@
import { Command } from 'commander';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
// Mock tRPC client exposing only the `agentEvalExternal` procedures the eval
// command calls. Created through `vi.hoisted` so it can be referenced from the
// `vi.mock` factories below (vitest hoists `vi.mock` calls above imports).
const { mockTrpcClient } = vi.hoisted(() => ({
mockTrpcClient: {
agentEvalExternal: {
datasetGet: { query: vi.fn() },
messagesList: { query: vi.fn() },
runGet: { query: vi.fn() },
runSetStatus: { mutate: vi.fn() },
runTopicReportResult: { mutate: vi.fn() },
runTopicsList: { query: vi.fn() },
testCasesCount: { query: vi.fn() },
threadsList: { query: vi.fn() },
},
},
}));
// Stand-in for `getTrpcClient`; each test's `beforeEach` makes it resolve to
// `mockTrpcClient`.
const { getTrpcClientMock } = vi.hoisted(() => ({
getTrpcClientMock: vi.fn(),
}));
// Replace the real API client module so no network/auth code runs in tests.
vi.mock('../api/client', () => ({
getTrpcClient: getTrpcClientMock,
}));
// Replace the logger so tests can assert on `log.error` and keep output quiet.
vi.mock('../utils/logger', () => ({
log: {
debug: vi.fn(),
error: vi.fn(),
info: vi.fn(),
warn: vi.fn(),
},
setVerbose: vi.fn(),
}));
// eslint-disable-next-line import-x/first
import { log } from '../utils/logger';
// eslint-disable-next-line import-x/first
import { registerEvalCommand } from './eval';
/**
 * Spec for the `eval` CLI command tree.
 *
 * Every test builds a fresh commander program, parses a synthetic argv, and
 * asserts on the mocked tRPC procedure that was invoked and/or on what was
 * written through `console.log`.
 */
describe('eval command', () => {
  let exitSpy: ReturnType<typeof vi.spyOn>;
  let logSpy: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    getTrpcClientMock.mockResolvedValue(mockTrpcClient);
    // `process.exit` is stubbed so the error path does not terminate the test runner.
    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

    // Drop resolved/rejected values configured by the previous test.
    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
      for (const fn of Object.values(method)) {
        (fn as ReturnType<typeof vi.fn>).mockReset();
      }
    }
  });

  afterEach(() => {
    exitSpy.mockRestore();
    logSpy.mockRestore();
    vi.clearAllMocks();
  });

  /** Builds an isolated program with only the eval command registered. */
  const createProgram = () => {
    const program = new Command();
    // Make commander throw instead of calling process.exit on its own errors.
    program.exitOverride();
    registerEvalCommand(program);
    return program;
  };

  it('should call runGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
      config: { k: 1 },
      datasetId: 'dataset-1',
      id: 'run-1',
    });

    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);

    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });

    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: {
        config: { k: 1 },
        datasetId: 'dataset-1',
        id: 'run-1',
      },
      error: null,
      ok: true,
      version: 'v1',
    });
  });

  it('should call datasetGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
      id: 'dataset-1',
      metadata: { preset: 'deepsearchqa' },
    });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'dataset',
      'get',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });

    // The title promises an envelope check; verify the printed payload as well.
    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: {
        id: 'dataset-1',
        metadata: { preset: 'deepsearchqa' },
      },
      error: null,
      ok: true,
      version: 'v1',
    });
  });

  it('should pass onlyExternal to runTopicsList', async () => {
    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topics',
      'list',
      '--run-id',
      'run-1',
      '--only-external',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
      onlyExternal: true,
      runId: 'run-1',
    });
  });

  it('should pass topicId and threadId to messagesList', async () => {
    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'messages',
      'list',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should parse and report run-topic result', async () => {
    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
      success: true,
    });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topic',
      'report-result',
      '--run-id',
      'run-1',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--score',
      '0.91',
      '--correct',
      'true',
      '--result-json',
      '{"grade":"A"}',
      '--json',
    ]);

    // String flags must arrive at the procedure already coerced to number/boolean/object.
    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
      correct: true,
      result: { grade: 'A' },
      runId: 'run-1',
      score: 0.91,
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should update run status', async () => {
    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
      runId: 'run-1',
      status: 'completed',
      success: true,
    });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'set-status',
      '--run-id',
      'run-1',
      '--status',
      'completed',
    ]);

    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
      runId: 'run-1',
      status: 'completed',
    });
    // Without --json the command prints a human-readable success line.
    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
  });

  it('should output json error envelope when command fails', async () => {
    const error = Object.assign(new Error('Run not found'), {
      data: { code: 'NOT_FOUND' },
    });
    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'get',
      '--run-id',
      'run-404',
      '--json',
    ]);

    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: null,
      error: { code: 'NOT_FOUND', message: 'Run not found' },
      ok: false,
      version: 'v1',
    });
    expect(exitSpy).toHaveBeenCalledWith(1);
  });

  it('should query test case count', async () => {
    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'test-cases',
      'count',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should log plain error without --json', async () => {
    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));

    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']);

    expect(log.error).toHaveBeenCalledWith('boom');
    expect(exitSpy).toHaveBeenCalledWith(1);
  });
});

View File

@@ -0,0 +1,326 @@
import type { Command } from 'commander';
import { InvalidArgumentError } from 'commander';
import pc from 'picocolors';
import { getTrpcClient } from '../api/client';
import { log } from '../utils/logger';
/** Version tag written into the `version` field of every JSON envelope. */
const JSON_VERSION = 'v1' as const;
/** Error details carried in the `error` field of a failed JSON envelope. */
interface JsonError {
/** Error code when the thrown value exposes one as a string; otherwise omitted. */
code?: string;
message: string;
}
/**
 * Wrapper printed when a command runs with `--json`.
 * Success envelopes are built with `error: null` and `ok: true`; failure
 * envelopes with `data: null` and `ok: false`.
 */
interface JsonEnvelope<T> {
data: T | null;
error: JsonError | null;
ok: boolean;
version: typeof JSON_VERSION;
}
/** Shared by every sub-command: set when the `--json` flag is passed. */
interface JsonOption {
json?: boolean;
}
/** Options for `eval run get`. */
interface RunGetOptions extends JsonOption {
runId: string;
}
/** Options for `eval run set-status`. */
interface RunSetStatusOptions extends JsonOption {
runId: string;
status: 'completed' | 'external';
}
/** Options for `eval dataset get`. */
interface DatasetGetOptions extends JsonOption {
datasetId: string;
}
/** Options for `eval run-topics list`. */
interface RunTopicsListOptions extends JsonOption {
onlyExternal?: boolean;
runId: string;
}
/** Options for `eval threads list`. */
interface ThreadsListOptions extends JsonOption {
topicId: string;
}
/** Options for `eval messages list`; `threadId` is optional. */
interface MessagesListOptions extends JsonOption {
threadId?: string;
topicId: string;
}
/** Options for `eval test-cases count`. */
interface TestCasesCountOptions extends JsonOption {
datasetId: string;
}
/**
 * Options for `eval run-topic report-result`.
 * `correct`, `resultJson` and `score` hold the values produced by the custom
 * option parsers, not the raw command-line strings.
 */
interface RunTopicReportResultOptions extends JsonOption {
correct: boolean;
resultJson: Record<string, unknown>;
runId: string;
score: number;
threadId?: string;
topicId: string;
}
/** Writes `data` to stdout as pretty-printed JSON with a two-space indent. */
const printJson = (data: unknown) => {
  const serialized = JSON.stringify(data, null, 2);
  console.log(serialized);
};
/**
 * Prints a success envelope around `data`.
 * Key order (data, error, ok, version) is kept stable because it determines
 * the order of fields in the printed JSON.
 */
const outputJsonSuccess = (data: unknown) => {
  const envelope: JsonEnvelope<unknown> = { data, error: null, ok: true, version: JSON_VERSION };
  printJson(envelope);
};
/** Type guard: true for any non-null value whose `typeof` is `'object'` (arrays included). */
const isRecord = (value: unknown): value is Record<string, unknown> =>
  typeof value === 'object' && value !== null;

/**
 * Normalizes an arbitrary thrown value into a `JsonError`.
 *
 * - `Error` instances: the message is `error.message`. The code is taken from
 *   `error.data.code` when that is a string, otherwise from a string
 *   `error.code` placed directly on the error object, so such codes are not
 *   dropped.
 * - Other objects: string `code` / `message` properties are used; a missing or
 *   non-string message becomes `'Unknown error'`.
 * - Anything else is stringified into the message.
 *
 * @param error - The value caught from a failed command action.
 * @returns The normalized error; `code` is `undefined` when no string code is found.
 */
const toJsonError = (error: unknown): JsonError => {
  const asCode = (candidate: unknown): string | undefined =>
    typeof candidate === 'string' ? candidate : undefined;

  if (error instanceof Error) {
    const { code, data } = error as Error & { code?: unknown; data?: unknown };
    // `data.code` keeps priority so existing envelopes are unchanged.
    const nestedCode = isRecord(data) ? asCode(data.code) : undefined;
    return {
      code: nestedCode ?? asCode(code),
      message: error.message,
    };
  }

  if (isRecord(error)) {
    return {
      code: asCode(error.code),
      message: typeof error.message === 'string' ? error.message : 'Unknown error',
    };
  }

  return { message: String(error) };
};
/**
 * Reports a failed command and exits the process with status 1.
 *
 * @param error - The value thrown by the command action.
 * @param json - When true, print a failure envelope via `printJson`; otherwise
 *   log only the error message through `log.error`.
 */
const handleCommandError = (error: unknown, json: boolean) => {
  const failure = toJsonError(error);

  if (!json) {
    log.error(failure.message);
  } else {
    const envelope: JsonEnvelope<null> = {
      data: null,
      error: failure,
      ok: false,
      version: JSON_VERSION,
    };
    printJson(envelope);
  }

  process.exit(1);
};
/**
 * Option parser for `--score`.
 *
 * @param value - Raw command-line string.
 * @returns The score as a finite number.
 * @throws InvalidArgumentError when the value is blank, not numeric, or not finite.
 */
const parseScore = (value: string) => {
  const trimmed = value.trim();
  // `Number('')` evaluates to 0, so a blank argument must be rejected
  // explicitly instead of being reported as a score of zero.
  const score = trimmed === '' ? Number.NaN : Number(trimmed);
  if (!Number.isFinite(score)) {
    throw new InvalidArgumentError(`Invalid score: ${value}`);
  }
  return score;
};
/**
 * Option parser for boolean-valued flags such as `--correct`.
 * Matching is case-insensitive and ignores surrounding whitespace.
 *
 * @param value - Raw command-line string.
 * @returns `true` for `1` / `true` / `yes`, `false` for `0` / `false` / `no`.
 * @throws InvalidArgumentError for any other input.
 */
const parseBoolean = (value: string) => {
  switch (value.trim().toLowerCase()) {
    case '1':
    case 'true':
    case 'yes': {
      return true;
    }
    case '0':
    case 'false':
    case 'no': {
      return false;
    }
    default: {
      throw new InvalidArgumentError(`Invalid boolean value: ${value}`);
    }
  }
};
/**
 * Option parser for `--result-json`.
 *
 * @param value - Raw command-line string expected to contain a JSON object.
 * @returns The parsed object.
 * @throws InvalidArgumentError when the text is not valid JSON, or when it
 *   parses to something other than a plain object (arrays, primitives, null).
 */
const parseResultJson = (value: string) => {
  const decode = (): unknown => {
    try {
      return JSON.parse(value);
    } catch {
      throw new InvalidArgumentError('Invalid JSON value for --result-json');
    }
  };

  const candidate = decode();
  // Arrays pass the object check, so they are excluded separately.
  if (Array.isArray(candidate) || !isRecord(candidate)) {
    throw new InvalidArgumentError('--result-json must be a JSON object');
  }
  return candidate;
};
/**
 * Option parser for `--status`.
 *
 * @param value - Raw command-line string.
 * @returns The value unchanged when it is exactly `completed` or `external`.
 * @throws InvalidArgumentError for any other status.
 */
const parseRunStatus = (value: string): 'completed' | 'external' => {
  if (value === 'completed' || value === 'external') {
    return value;
  }
  throw new InvalidArgumentError("Only 'completed' and 'external' are supported");
};
/**
 * Runs a command action and renders its outcome.
 *
 * Output selection on success:
 * 1. with `--json`, a success envelope;
 * 2. otherwise, if `successMessage` is given, an `OK` line with that message;
 * 3. otherwise, the raw result as pretty-printed JSON.
 *
 * Anything thrown by the action or while printing is passed to
 * `handleCommandError`.
 *
 * @param options - Parsed options; only `json` is read here.
 * @param action - Performs the actual work and resolves to the result.
 * @param successMessage - Optional human-readable message for non-JSON mode.
 */
const executeCommand = async (
  options: JsonOption,
  action: () => Promise<unknown>,
  successMessage?: string,
) => {
  try {
    const result = await action();

    if (options.json) {
      outputJsonSuccess(result);
    } else if (successMessage) {
      console.log(`${pc.green('OK')} ${successMessage}`);
    } else {
      printJson(result);
    }
  } catch (failure) {
    handleCommandError(failure, Boolean(options.json));
  }
};
/**
 * Registers the `eval` command tree on `program`.
 *
 * Sub-commands, in registration order:
 * `run get`, `run set-status`, `dataset get`, `run-topics list`,
 * `threads list`, `messages list`, `test-cases count`, and
 * `run-topic report-result`.
 *
 * Each action calls one `agentEvalExternal` procedure through
 * `executeCommand`, which handles `--json` output and error reporting.
 *
 * @param program - The commander program (or parent command) to extend.
 */
export function registerEvalCommand(program: Command) {
  type TrpcClient = Awaited<ReturnType<typeof getTrpcClient>>;

  // Wraps a procedure call so the client is only obtained when the action
  // runs; a failure to obtain it is then handled inside executeCommand.
  const withClient = (call: (client: TrpcClient) => unknown) => async () =>
    call(await getTrpcClient());

  const evalCmd = program.command('eval').description('Manage external evaluation workflows');

  // eval run ...
  const runCmd = evalCmd.command('run').description('Manage evaluation runs');

  runCmd
    .command('get')
    .description('Get run information')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--json', 'Output JSON envelope')
    .action((options: RunGetOptions) =>
      executeCommand(
        options,
        withClient((client) => client.agentEvalExternal.runGet.query({ runId: options.runId })),
      ),
    );

  runCmd
    .command('set-status')
    .description('Set run status (external API supports completed or external)')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
    .option('--json', 'Output JSON envelope')
    .action((options: RunSetStatusOptions) =>
      executeCommand(
        options,
        withClient((client) =>
          client.agentEvalExternal.runSetStatus.mutate({
            runId: options.runId,
            status: options.status,
          }),
        ),
        `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`,
      ),
    );

  // eval dataset ...
  const datasetCmd = evalCmd.command('dataset').description('Manage evaluation datasets');

  datasetCmd
    .command('get')
    .description('Get dataset information')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action((options: DatasetGetOptions) =>
      executeCommand(
        options,
        withClient((client) =>
          client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId }),
        ),
      ),
    );

  // eval run-topics ...
  const runTopicsCmd = evalCmd.command('run-topics').description('Manage run topics');

  runTopicsCmd
    .command('list')
    .description('List topics in a run')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--only-external', 'Only return topics pending external evaluation')
    .option('--json', 'Output JSON envelope')
    .action((options: RunTopicsListOptions) =>
      executeCommand(
        options,
        withClient((client) =>
          client.agentEvalExternal.runTopicsList.query({
            // The flag is undefined when omitted; the procedure receives an explicit boolean.
            onlyExternal: Boolean(options.onlyExternal),
            runId: options.runId,
          }),
        ),
      ),
    );

  // eval threads ...
  const threadsCmd = evalCmd.command('threads').description('Manage evaluation threads');

  threadsCmd
    .command('list')
    .description('List threads by topic')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--json', 'Output JSON envelope')
    .action((options: ThreadsListOptions) =>
      executeCommand(
        options,
        withClient((client) =>
          client.agentEvalExternal.threadsList.query({ topicId: options.topicId }),
        ),
      ),
    );

  // eval messages ...
  const messagesCmd = evalCmd.command('messages').description('Manage evaluation messages');

  messagesCmd
    .command('list')
    .description('List messages by topic and optional thread')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID')
    .option('--json', 'Output JSON envelope')
    .action((options: MessagesListOptions) =>
      executeCommand(
        options,
        withClient((client) =>
          client.agentEvalExternal.messagesList.query({
            threadId: options.threadId,
            topicId: options.topicId,
          }),
        ),
      ),
    );

  // eval test-cases ...
  const testCasesCmd = evalCmd.command('test-cases').description('Manage evaluation test cases');

  testCasesCmd
    .command('count')
    .description('Count test cases by dataset')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action((options: TestCasesCountOptions) =>
      executeCommand(
        options,
        withClient((client) =>
          client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId }),
        ),
      ),
    );

  // eval run-topic ...
  const runTopicCmd = evalCmd
    .command('run-topic')
    .description('Manage evaluation run-topic reporting');

  runTopicCmd
    .command('report-result')
    .description('Report one evaluation result for a run topic')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID (required for k > 1)')
    .requiredOption('--score <score>', 'Evaluation score', parseScore)
    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
    .option('--json', 'Output JSON envelope')
    .action((options: RunTopicReportResultOptions) =>
      executeCommand(
        options,
        withClient((client) =>
          client.agentEvalExternal.runTopicReportResult.mutate({
            correct: options.correct,
            // The CLI flag is --result-json; the procedure field is `result`.
            result: options.resultJson,
            runId: options.runId,
            score: options.score,
            threadId: options.threadId,
            topicId: options.topicId,
          }),
        ),
        `Reported result for topic ${pc.bold(options.topicId)}`,
      ),
    );
}

View File

@@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc';
import { registerFileCommand } from './commands/file';
import { registerGenerateCommand } from './commands/generate';
import { registerKbCommand } from './commands/kb';
import { registerEvalCommand } from './commands/eval';
import { registerLoginCommand } from './commands/login';
import { registerLogoutCommand } from './commands/logout';
import { registerMemoryCommand } from './commands/memory';
@@ -44,5 +45,6 @@ registerModelCommand(program);
registerProviderCommand(program);
registerPluginCommand(program);
registerConfigCommand(program);
registerEvalCommand(program);
program.parse();