✨ feat(eval): add external scoring mode (#12729)

* wip: add llm relevant & BrowseComp * wip: add widesearch desc * wip: dsqa, hle, widesearch * wip: add dsqa * wip: add awaiting eval status for runs * wip: add awaiting status for run * wip: adjust hle-verified * 🐛 fix: browsecomp topics * 📝 docs: add annotations * wip: add awaiting status for pass@k * wip: add complete status * wip: update thread dots * wip: update run status page * wip: remove useless impl * wip: update prompt * ✨ feat: add external eval routes * wip: add eval cli * 🐛 fix: support authorize in no browser environment * wip: pass tests * ♻️ refactor: remove tests * ♻️ refactor: mo camel case
2026-03-26 13:19:34 +07:00 · 2026-03-10 09:53:26 +08:00
parent 255a1c21a8
commit ea329113be
34 changed files with 1655 additions and 40 deletions
--- a/apps/cli/src/commands/eval.test.ts
+++ b/apps/cli/src/commands/eval.test.ts
@@ -0,0 +1,285 @@
+import { Command } from 'commander';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+const { mockTrpcClient } = vi.hoisted(() => ({ // Stub tRPC client: one vi.fn() per agentEvalExternal procedure exercised below.
+  mockTrpcClient: {
+    agentEvalExternal: {
+      datasetGet: { query: vi.fn() },
+      messagesList: { query: vi.fn() },
+      runGet: { query: vi.fn() },
+      runSetStatus: { mutate: vi.fn() }, // mutations are stubbed under `mutate`, reads under `query`
+      runTopicReportResult: { mutate: vi.fn() },
+      runTopicsList: { query: vi.fn() },
+      testCasesCount: { query: vi.fn() },
+      threadsList: { query: vi.fn() },
+    },
+  },
+}));
+
+const { getTrpcClientMock } = vi.hoisted(() => ({ // Hoisted so the '../api/client' mock factory below can reference it.
+  getTrpcClientMock: vi.fn(),
+}));
+
+vi.mock('../api/client', () => ({ // Route getTrpcClient to the hoisted mock; beforeEach makes it resolve to mockTrpcClient.
+  getTrpcClient: getTrpcClientMock,
+}));
+
+vi.mock('../utils/logger', () => ({ // Replace the logger with spies so tests can assert on log.error.
+  log: {
+    debug: vi.fn(),
+    error: vi.fn(),
+    info: vi.fn(),
+    warn: vi.fn(),
+  },
+  setVerbose: vi.fn(),
+}));
+
+// eslint-disable-next-line import-x/first
+import { log } from '../utils/logger';
+// eslint-disable-next-line import-x/first
+import { registerEvalCommand } from './eval';
+
+describe('eval command', () => { // Drives registerEvalCommand through Commander with a stubbed tRPC client.
+  let exitSpy: ReturnType<typeof vi.spyOn>;
+  let logSpy: ReturnType<typeof vi.spyOn>;
+
+  beforeEach(() => {
+    getTrpcClientMock.mockResolvedValue(mockTrpcClient);
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any); // no-op so a failing command cannot end the test process
+    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); // capture console.log output for assertions
+
+    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) { // reset every procedure stub so values never leak between tests
+      for (const fn of Object.values(method)) {
+        (fn as ReturnType<typeof vi.fn>).mockReset();
+      }
+    }
+  });
+
+  afterEach(() => {
+    exitSpy.mockRestore();
+    logSpy.mockRestore();
+    vi.clearAllMocks();
+  });
+
+  const createProgram = () => { // Fresh Commander program per test with only the eval command registered.
+    const program = new Command();
+    program.exitOverride(); // Commander then throws instead of calling process.exit
+    registerEvalCommand(program);
+    return program;
+  };
+
+  it('should call runGet and output json envelope', async () => {
+    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
+      config: { k: 1 },
+      datasetId: 'dataset-1',
+      id: 'run-1',
+    });
+
+    const program = createProgram();
+    await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);
+
+    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });
+
+    const payload = JSON.parse(logSpy.mock.calls[0][0]); // first console.log call carries the JSON envelope
+    expect(payload).toEqual({
+      data: {
+        config: { k: 1 },
+        datasetId: 'dataset-1',
+        id: 'run-1',
+      },
+      error: null,
+      ok: true,
+      version: 'v1',
+    });
+  });
+
+  it('should call datasetGet and output json envelope', async () => {
+    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
+      id: 'dataset-1',
+      metadata: { preset: 'deepsearchqa' },
+    });
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'dataset',
+      'get',
+      '--dataset-id',
+      'dataset-1',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({ // only the forwarded input is asserted here, not the printed envelope
+      datasetId: 'dataset-1',
+    });
+  });
+
+  it('should pass onlyExternal to runTopicsList', async () => {
+    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'run-topics',
+      'list',
+      '--run-id',
+      'run-1',
+      '--only-external',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
+      onlyExternal: true, // the bare --only-external flag arrives as boolean true
+      runId: 'run-1',
+    });
+  });
+
+  it('should pass topicId and threadId to messagesList', async () => {
+    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'messages',
+      'list',
+      '--topic-id',
+      'topic-1',
+      '--thread-id',
+      'thread-1',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
+      threadId: 'thread-1',
+      topicId: 'topic-1',
+    });
+  });
+
+  it('should parse and report run-topic result', async () => {
+    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
+      success: true,
+    });
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'run-topic',
+      'report-result',
+      '--run-id',
+      'run-1',
+      '--topic-id',
+      'topic-1',
+      '--thread-id',
+      'thread-1',
+      '--score',
+      '0.91',
+      '--correct',
+      'true',
+      '--result-json',
+      '{"grade":"A"}',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({ // string CLI values must arrive parsed: boolean, object, number
+      correct: true,
+      result: { grade: 'A' },
+      runId: 'run-1',
+      score: 0.91,
+      threadId: 'thread-1',
+      topicId: 'topic-1',
+    });
+  });
+
+  it('should update run status', async () => {
+    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
+      runId: 'run-1',
+      status: 'completed',
+      success: true,
+    });
+
+    const program = createProgram();
+    await program.parseAsync([ // no --json here: the human-readable success line is expected
+      'node',
+      'test',
+      'eval',
+      'run',
+      'set-status',
+      '--run-id',
+      'run-1',
+      '--status',
+      'completed',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
+      runId: 'run-1',
+      status: 'completed',
+    });
+    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
+  });
+
+  it('should output json error envelope when command fails', async () => {
+    const error = Object.assign(new Error('Run not found'), { // Error carrying `data.code`, the shape toJsonError reads
+      data: { code: 'NOT_FOUND' },
+    });
+    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'run',
+      'get',
+      '--run-id',
+      'run-404',
+      '--json',
+    ]);
+
+    const payload = JSON.parse(logSpy.mock.calls[0][0]);
+    expect(payload).toEqual({
+      data: null,
+      error: { code: 'NOT_FOUND', message: 'Run not found' },
+      ok: false,
+      version: 'v1',
+    });
+    expect(exitSpy).toHaveBeenCalledWith(1); // failures exit with status 1 after printing the envelope
+  });
+
+  it('should query test case count', async () => {
+    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'test-cases',
+      'count',
+      '--dataset-id',
+      'dataset-1',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
+      datasetId: 'dataset-1',
+    });
+  });
+
+  it('should log plain error without --json', async () => {
+    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));
+
+    const program = createProgram();
+    await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']);
+
+    expect(log.error).toHaveBeenCalledWith('boom'); // without --json the message goes through the (mocked) logger
+    expect(exitSpy).toHaveBeenCalledWith(1);
+  });
+});
--- a/apps/cli/src/commands/eval.ts
+++ b/apps/cli/src/commands/eval.ts
@@ -0,0 +1,326 @@
+import type { Command } from 'commander';
+import { InvalidArgumentError } from 'commander';
+import pc from 'picocolors';
+
+import { getTrpcClient } from '../api/client';
+import { log } from '../utils/logger';
+
+const JSON_VERSION = 'v1' as const; // Version tag stamped on every JSON envelope.
+
+interface JsonError { // Normalized error carried by a failed envelope.
+  code?: string;
+  message: string;
+}
+
+interface JsonEnvelope<T> { // Shape printed for --json: success fills `data`, failure fills `error`.
+  data: T | null;
+  error: JsonError | null;
+  ok: boolean;
+  version: typeof JSON_VERSION;
+}
+
+interface JsonOption { // Shared --json flag; every command's option type extends this.
+  json?: boolean;
+}
+
+interface RunGetOptions extends JsonOption {
+  runId: string;
+}
+
+interface RunSetStatusOptions extends JsonOption {
+  runId: string;
+  status: 'completed' | 'external'; // the two values parseRunStatus lets through
+}
+
+interface DatasetGetOptions extends JsonOption {
+  datasetId: string;
+}
+
+interface RunTopicsListOptions extends JsonOption {
+  onlyExternal?: boolean;
+  runId: string;
+}
+
+interface ThreadsListOptions extends JsonOption {
+  topicId: string;
+}
+
+interface MessagesListOptions extends JsonOption {
+  threadId?: string;
+  topicId: string;
+}
+
+interface TestCasesCountOptions extends JsonOption {
+  datasetId: string;
+}
+
+interface RunTopicReportResultOptions extends JsonOption { // Values here are already converted by the parse* option parsers.
+  correct: boolean;
+  resultJson: Record<string, unknown>;
+  runId: string;
+  score: number;
+  threadId?: string; // the option's help text says it is required for k > 1
+  topicId: string;
+}
+
+const printJson = (data: unknown) => { // Write `data` to stdout as JSON with 2-space indentation.
+  console.log(JSON.stringify(data, null, 2));
+};
+
+const outputJsonSuccess = (data: unknown) => { // Print `data` wrapped in a success envelope (ok: true, error: null).
+  const payload: JsonEnvelope<unknown> = {
+    data,
+    error: null,
+    ok: true,
+    version: JSON_VERSION,
+  };
+  printJson(payload);
+};
+
+const isRecord = (value: unknown): value is Record<string, unknown> => // True for any non-null object, arrays included.
+  typeof value === 'object' && value !== null;
+
+const toJsonError = (error: unknown): JsonError => { // Normalize any thrown value into { code?, message }.
+  if (error instanceof Error) { // Error instances: keep the message and pick up a string `data.code` when present
+    const maybeData = (error as Error & { data?: { code?: string } }).data;
+    const code = maybeData?.code;
+
+    return {
+      code: typeof code === 'string' ? code : undefined, // non-string codes are dropped rather than coerced
+      message: error.message,
+    };
+  }
+
+  if (isRecord(error)) { // thrown non-Error objects: read string `code` / `message` fields directly
+    const code = typeof error.code === 'string' ? error.code : undefined;
+    const message = typeof error.message === 'string' ? error.message : 'Unknown error';
+    return { code, message };
+  }
+
+  return { message: String(error) }; // primitives and everything else: stringify
+};
+
+const handleCommandError = (error: unknown, json: boolean) => { // Report a failed command, then exit with status 1.
+  const normalized = toJsonError(error);
+
+  if (json) { // --json: failure envelope on stdout via console.log
+    const payload: JsonEnvelope<null> = {
+      data: null,
+      error: normalized,
+      ok: false,
+      version: JSON_VERSION,
+    };
+    printJson(payload);
+  } else { // human mode: message only, through the logger
+    log.error(normalized.message);
+  }
+
+  process.exit(1); // same exit status in both output modes
+};
+
+const parseScore = (value: string) => { // Option parser for --score: `value` is the raw CLI string; returns it as a finite number.
+  const score = Number(value);
+  if (value.trim() === '' || !Number.isFinite(score)) { // Number('') and Number('   ') are 0, so blank input must be rejected explicitly
+    throw new InvalidArgumentError(`Invalid score: ${value}`); // thrown for blank, non-numeric and infinite input
+  }
+  return score;
+};
+
+const parseBoolean = (value: string) => { // Option parser: 1/true/yes -> true, 0/false/no -> false, anything else throws.
+  const normalized = value.trim().toLowerCase(); // surrounding whitespace and letter case are ignored
+  if (['1', 'true', 'yes'].includes(normalized)) return true;
+  if (['0', 'false', 'no'].includes(normalized)) return false;
+  throw new InvalidArgumentError(`Invalid boolean value: ${value}`);
+};
+
+const parseResultJson = (value: string) => { // Option parser for --result-json: must be valid JSON whose top level is an object.
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(value);
+  } catch { // the parser's own error is replaced by a message naming the flag
+    throw new InvalidArgumentError('Invalid JSON value for --result-json');
+  }
+
+  if (!isRecord(parsed) || Array.isArray(parsed)) { // isRecord alone would let arrays through
+    throw new InvalidArgumentError('--result-json must be a JSON object');
+  }
+
+  return parsed;
+};
+
+const parseRunStatus = (value: string) => { // Option parser for --status: only the exact strings 'completed' and 'external' pass.
+  if (value !== 'completed' && value !== 'external') {
+    throw new InvalidArgumentError("Only 'completed' and 'external' are supported");
+  }
+
+  return value as 'completed' | 'external';
+};
+
+const executeCommand = async ( // Run `action` and print its outcome; any throw is routed to handleCommandError.
+  options: JsonOption,
+  action: () => Promise<unknown>,
+  successMessage?: string,
+) => {
+  try {
+    const data = await action();
+    if (options.json) { // --json: always the versioned envelope, even when a successMessage was supplied
+      outputJsonSuccess(data);
+      return;
+    }
+
+    if (successMessage) { // human mode with a message: one-line confirmation instead of the payload
+      console.log(`${pc.green('OK')} ${successMessage}`);
+      return;
+    }
+
+    printJson(data); // human mode without a message: the bare payload, no envelope
+  } catch (error) {
+    handleCommandError(error, Boolean(options.json));
+  }
+};
+
+export function registerEvalCommand(program: Command) { // Attach the `eval` command tree to `program`; every leaf runs through executeCommand.
+  const evalCmd = program.command('eval').description('Manage external evaluation workflows');
+
+  const runCmd = evalCmd.command('run').description('Manage evaluation runs'); // kept in a variable because `run` has two subcommands
+
+  runCmd
+    .command('get')
+    .description('Get run information')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunGetOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.runGet.query({ runId: options.runId });
+      }),
+    );
+
+  runCmd
+    .command('set-status')
+    .description('Set run status (external API supports completed or external)')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunSetStatusOptions) =>
+      executeCommand(
+        options,
+        async () => {
+          const client = await getTrpcClient();
+          return client.agentEvalExternal.runSetStatus.mutate({
+            runId: options.runId,
+            status: options.status,
+          });
+        },
+        `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`, // printed only without --json
+      ),
+    );
+
+  evalCmd
+    .command('dataset')
+    .description('Manage evaluation datasets')
+    .command('get')
+    .description('Get dataset information')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: DatasetGetOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
+      }),
+    );
+
+  evalCmd
+    .command('run-topics')
+    .description('Manage run topics')
+    .command('list')
+    .description('List topics in a run')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .option('--only-external', 'Only return topics pending external evaluation')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunTopicsListOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.runTopicsList.query({
+          onlyExternal: Boolean(options.onlyExternal), // optional flag: Boolean() turns a missing value into an explicit false
+          runId: options.runId,
+        });
+      }),
+    );
+
+  evalCmd
+    .command('threads')
+    .description('Manage evaluation threads')
+    .command('list')
+    .description('List threads by topic')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: ThreadsListOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
+      }),
+    );
+
+  evalCmd
+    .command('messages')
+    .description('Manage evaluation messages')
+    .command('list')
+    .description('List messages by topic and optional thread')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--thread-id <id>', 'Thread ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: MessagesListOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.messagesList.query({
+          threadId: options.threadId, // optional option, forwarded as-is
+          topicId: options.topicId,
+        });
+      }),
+    );
+
+  evalCmd
+    .command('test-cases')
+    .description('Manage evaluation test cases')
+    .command('count')
+    .description('Count test cases by dataset')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: TestCasesCountOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
+      }),
+    );
+
+  evalCmd
+    .command('run-topic')
+    .description('Manage evaluation run-topic reporting')
+    .command('report-result')
+    .description('Report one evaluation result for a run topic')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--thread-id <id>', 'Thread ID (required for k > 1)')
+    .requiredOption('--score <score>', 'Evaluation score', parseScore)
+    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
+    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunTopicReportResultOptions) =>
+      executeCommand(
+        options,
+        async () => {
+          const client = await getTrpcClient();
+          return client.agentEvalExternal.runTopicReportResult.mutate({
+            correct: options.correct,
+            result: options.resultJson, // the --result-json value is sent under the key `result`
+            runId: options.runId,
+            score: options.score,
+            threadId: options.threadId,
+            topicId: options.topicId,
+          });
+        },
+        `Reported result for topic ${pc.bold(options.topicId)}`, // printed only without --json
+      ),
+    );
+}
--- a/apps/cli/src/index.ts
+++ b/apps/cli/src/index.ts
@@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc';
 import { registerFileCommand } from './commands/file';
 import { registerGenerateCommand } from './commands/generate';
 import { registerKbCommand } from './commands/kb';
+import { registerEvalCommand } from './commands/eval';
 import { registerLoginCommand } from './commands/login';
 import { registerLogoutCommand } from './commands/logout';
 import { registerMemoryCommand } from './commands/memory';
@@ -44,5 +45,6 @@ registerModelCommand(program);
 registerProviderCommand(program);
 registerPluginCommand(program);
 registerConfigCommand(program);
+registerEvalCommand(program);

 program.parse();