feat(eval): add external scoring mode (#12729)

* wip: add llm relevant & BrowseComp

* wip: add widesearch desc

* wip: dsqa, hle, widesearch

* wip: add dsqa

* wip: add awaiting eval status for runs

* wip: add awaiting status for run

* wip: adjust hle-verified

* 🐛 fix: browsecomp topics

* 📝 docs: add annotations

* wip: add awaiting status for pass@k

* wip: add complete status

* wip: update thread dots

* wip: update run status page

* wip: remove useless impl

* wip: update prompt

* feat: add external eval routes

* wip: add eval cli

* 🐛 fix: support authorization in non-browser environments

* wip: pass tests

* ♻️ refactor: remove tests

* ♻️ refactor: to camel case
This commit is contained in:
Rylan Cai
2026-03-10 09:53:26 +08:00
committed by GitHub
parent 255a1c21a8
commit ea329113be
34 changed files with 1655 additions and 40 deletions

View File

@@ -0,0 +1,285 @@
import { Command } from 'commander';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
// Hoisted so the mock object exists before the vi.mock factories below run:
// a single fake tRPC client covering every agentEvalExternal procedure the
// eval command can call.
const { mockTrpcClient } = vi.hoisted(() => ({
  mockTrpcClient: {
    agentEvalExternal: {
      datasetGet: { query: vi.fn() },
      messagesList: { query: vi.fn() },
      runGet: { query: vi.fn() },
      runSetStatus: { mutate: vi.fn() },
      runTopicReportResult: { mutate: vi.fn() },
      runTopicsList: { query: vi.fn() },
      testCasesCount: { query: vi.fn() },
      threadsList: { query: vi.fn() },
    },
  },
}));

// Hoisted mock for the client factory so each test controls what the
// command receives from getTrpcClient().
const { getTrpcClientMock } = vi.hoisted(() => ({
  getTrpcClientMock: vi.fn(),
}));

vi.mock('../api/client', () => ({
  getTrpcClient: getTrpcClientMock,
}));

// Silence CLI logging; individual tests assert on log.error directly.
vi.mock('../utils/logger', () => ({
  log: {
    debug: vi.fn(),
    error: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
  },
  setVerbose: vi.fn(),
}));
// eslint-disable-next-line import-x/first
import { log } from '../utils/logger';
// eslint-disable-next-line import-x/first
import { registerEvalCommand } from './eval';
// CLI-level tests: each case drives `program.parseAsync` with a real argv
// array and asserts on the mocked tRPC client plus console / process.exit
// side effects.
describe('eval command', () => {
  let exitSpy: ReturnType<typeof vi.spyOn>;
  let logSpy: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    getTrpcClientMock.mockResolvedValue(mockTrpcClient);
    // Commands call process.exit(1) on failure; stub it so the test keeps running.
    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
    // Capture console.log so JSON envelopes can be parsed and asserted.
    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
    // Reset every query/mutate fn on the shared mock client between tests.
    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
      for (const fn of Object.values(method)) {
        (fn as ReturnType<typeof vi.fn>).mockReset();
      }
    }
  });

  afterEach(() => {
    exitSpy.mockRestore();
    logSpy.mockRestore();
    vi.clearAllMocks();
  });

  // Fresh Command tree per test; exitOverride makes commander throw instead
  // of exiting the process on parse errors.
  const createProgram = () => {
    const program = new Command();
    program.exitOverride();
    registerEvalCommand(program);
    return program;
  };

  it('should call runGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
      config: { k: 1 },
      datasetId: 'dataset-1',
      id: 'run-1',
    });
    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);
    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });
    // First console.log call carries the printed JSON envelope.
    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: {
        config: { k: 1 },
        datasetId: 'dataset-1',
        id: 'run-1',
      },
      error: null,
      ok: true,
      version: 'v1',
    });
  });

  it('should call datasetGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
      id: 'dataset-1',
      metadata: { preset: 'deepsearchqa' },
    });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'dataset',
      'get',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should pass onlyExternal to runTopicsList', async () => {
    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topics',
      'list',
      '--run-id',
      'run-1',
      '--only-external',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
      onlyExternal: true,
      runId: 'run-1',
    });
  });

  it('should pass topicId and threadId to messagesList', async () => {
    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'messages',
      'list',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  // Verifies the argParsers: --score → number, --correct → boolean,
  // --result-json → parsed object.
  it('should parse and report run-topic result', async () => {
    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
      success: true,
    });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topic',
      'report-result',
      '--run-id',
      'run-1',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--score',
      '0.91',
      '--correct',
      'true',
      '--result-json',
      '{"grade":"A"}',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
      correct: true,
      result: { grade: 'A' },
      runId: 'run-1',
      score: 0.91,
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should update run status', async () => {
    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
      runId: 'run-1',
      status: 'completed',
      success: true,
    });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'set-status',
      '--run-id',
      'run-1',
      '--status',
      'completed',
    ]);
    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
      runId: 'run-1',
      status: 'completed',
    });
    // Without --json the command prints a human-readable success line.
    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
  });

  it('should output json error envelope when command fails', async () => {
    // Mimic a tRPC error: message on the Error, code under `data`.
    const error = Object.assign(new Error('Run not found'), {
      data: { code: 'NOT_FOUND' },
    });
    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'get',
      '--run-id',
      'run-404',
      '--json',
    ]);
    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: null,
      error: { code: 'NOT_FOUND', message: 'Run not found' },
      ok: false,
      version: 'v1',
    });
    expect(exitSpy).toHaveBeenCalledWith(1);
  });

  it('should query test case count', async () => {
    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });
    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'test-cases',
      'count',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);
    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should log plain error without --json', async () => {
    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));
    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']);
    expect(log.error).toHaveBeenCalledWith('boom');
    expect(exitSpy).toHaveBeenCalledWith(1);
  });
});

View File

@@ -0,0 +1,326 @@
import type { Command } from 'commander';
import { InvalidArgumentError } from 'commander';
import pc from 'picocolors';
import { getTrpcClient } from '../api/client';
import { log } from '../utils/logger';
// Envelope format version emitted with every `--json` payload; bump when
// the envelope shape changes so external consumers can detect it.
const JSON_VERSION = 'v1' as const;

/** Normalized error carried in a failed JSON envelope. */
interface JsonError {
  code?: string;
  message: string;
}

/** Success/failure wrapper printed when `--json` is passed. */
interface JsonEnvelope<T> {
  data: T | null;
  error: JsonError | null;
  ok: boolean;
  version: typeof JSON_VERSION;
}

/** Shared flag available on every subcommand: switch to JSON envelope output. */
interface JsonOption {
  json?: boolean;
}

/** Options for `eval run get`. */
interface RunGetOptions extends JsonOption {
  runId: string;
}

/** Options for `eval run set-status`; the external API allows two states only. */
interface RunSetStatusOptions extends JsonOption {
  runId: string;
  status: 'completed' | 'external';
}

/** Options for `eval dataset get`. */
interface DatasetGetOptions extends JsonOption {
  datasetId: string;
}

/** Options for `eval run-topics list`. */
interface RunTopicsListOptions extends JsonOption {
  onlyExternal?: boolean;
  runId: string;
}

/** Options for `eval threads list`. */
interface ThreadsListOptions extends JsonOption {
  topicId: string;
}

/** Options for `eval messages list`; threadId optionally narrows to one thread. */
interface MessagesListOptions extends JsonOption {
  threadId?: string;
  topicId: string;
}

/** Options for `eval test-cases count`. */
interface TestCasesCountOptions extends JsonOption {
  datasetId: string;
}

/** Options for `eval run-topic report-result` (one scored result submission). */
interface RunTopicReportResultOptions extends JsonOption {
  correct: boolean;
  resultJson: Record<string, unknown>;
  runId: string;
  score: number;
  threadId?: string;
  topicId: string;
}
/** Pretty-print any value as 2-space-indented JSON on stdout. */
const printJson = (data: unknown) => {
  const serialized = JSON.stringify(data, null, 2);
  console.log(serialized);
};
/** Print a successful JSON envelope wrapping `data`. */
const outputJsonSuccess = (data: unknown) => {
  printJson({
    data,
    error: null,
    ok: true,
    version: JSON_VERSION,
  } satisfies JsonEnvelope<unknown>);
};
/** Narrow an unknown value to a plain object record (excludes null; arrays pass). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === 'object';
}
/**
 * Convert an arbitrary thrown value into the JsonError shape.
 * Picks up a tRPC-style `data.code` when present on an Error instance.
 */
const toJsonError = (error: unknown): JsonError => {
  if (error instanceof Error) {
    const data = (error as Error & { data?: { code?: string } }).data;
    return {
      code: typeof data?.code === 'string' ? data.code : undefined,
      message: error.message,
    };
  }
  if (typeof error === 'object' && error !== null) {
    const record = error as Record<string, unknown>;
    return {
      code: typeof record.code === 'string' ? record.code : undefined,
      message: typeof record.message === 'string' ? record.message : 'Unknown error',
    };
  }
  return { message: String(error) };
};
/**
 * Report a command failure and terminate with exit code 1.
 * In `--json` mode a failure envelope is printed; otherwise the message is
 * logged plainly.
 */
const handleCommandError = (error: unknown, json: boolean) => {
  const normalized = toJsonError(error);
  if (json) {
    printJson({
      data: null,
      error: normalized,
      ok: false,
      version: JSON_VERSION,
    } satisfies JsonEnvelope<null>);
  } else {
    log.error(normalized.message);
  }
  process.exit(1);
};
/**
 * Commander argParser for `--score`.
 *
 * Accepts any finite decimal. Rejects empty/whitespace-only input
 * explicitly: `Number('')` and `Number('   ')` evaluate to 0, so without
 * this guard `--score ''` would be silently accepted as a score of 0.
 *
 * @throws InvalidArgumentError when the value is empty or not a finite number
 */
const parseScore = (value: string) => {
  if (value.trim() === '') {
    throw new InvalidArgumentError(`Invalid score: ${value}`);
  }
  const score = Number(value);
  if (!Number.isFinite(score)) {
    throw new InvalidArgumentError(`Invalid score: ${value}`);
  }
  return score;
};
/**
 * Commander argParser for `--correct`: case-insensitively accepts
 * 1/true/yes and 0/false/no, rejecting anything else.
 */
const parseBoolean = (value: string) => {
  switch (value.trim().toLowerCase()) {
    case '1':
    case 'true':
    case 'yes': {
      return true;
    }
    case '0':
    case 'false':
    case 'no': {
      return false;
    }
    default: {
      throw new InvalidArgumentError(`Invalid boolean value: ${value}`);
    }
  }
};
/**
 * Commander argParser for `--result-json`: the value must parse as a JSON
 * object literal (arrays and scalars are rejected).
 */
const parseResultJson = (value: string) => {
  let parsed: unknown;
  try {
    parsed = JSON.parse(value);
  } catch {
    throw new InvalidArgumentError('Invalid JSON value for --result-json');
  }
  const isPlainObject = typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed);
  if (!isPlainObject) {
    throw new InvalidArgumentError('--result-json must be a JSON object');
  }
  return parsed as Record<string, unknown>;
};
/**
 * Commander argParser for `--status`: the external API only accepts these
 * two terminal states. The literal comparisons narrow the type, so no cast
 * is needed.
 */
const parseRunStatus = (value: string): 'completed' | 'external' => {
  if (value === 'completed' || value === 'external') {
    return value;
  }
  throw new InvalidArgumentError("Only 'completed' and 'external' are supported");
};
/**
 * Shared action wrapper: runs `action`, then renders its result.
 * - with `--json`: prints a success envelope;
 * - otherwise: prints a green "OK" line when `successMessage` is given,
 *   or the raw result as JSON.
 * Failures are delegated to handleCommandError, which exits the process.
 */
const executeCommand = async (
  options: JsonOption,
  action: () => Promise<unknown>,
  successMessage?: string,
) => {
  try {
    const data = await action();
    if (options.json) {
      outputJsonSuccess(data);
    } else if (successMessage) {
      console.log(`${pc.green('OK')} ${successMessage}`);
    } else {
      printJson(data);
    }
  } catch (error) {
    handleCommandError(error, Boolean(options.json));
  }
};
/**
 * Register the `eval` command group: read-only queries plus status/result
 * reporting against the external-evaluation tRPC routes
 * (`agentEvalExternal.*`). Every subcommand supports `--json` to emit the
 * versioned machine-readable envelope.
 */
export function registerEvalCommand(program: Command) {
  const evalCmd = program.command('eval').description('Manage external evaluation workflows');

  // eval run get | eval run set-status
  const runCmd = evalCmd.command('run').description('Manage evaluation runs');
  runCmd
    .command('get')
    .description('Get run information')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunGetOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.runGet.query({ runId: options.runId });
      }),
    );
  runCmd
    .command('set-status')
    .description('Set run status (external API supports completed or external)')
    .requiredOption('--run-id <id>', 'Run ID')
    // parseRunStatus rejects anything but 'completed' | 'external' at parse time.
    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunSetStatusOptions) =>
      executeCommand(
        options,
        async () => {
          const client = await getTrpcClient();
          return client.agentEvalExternal.runSetStatus.mutate({
            runId: options.runId,
            status: options.status,
          });
        },
        `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`,
      ),
    );

  // eval dataset get
  evalCmd
    .command('dataset')
    .description('Manage evaluation datasets')
    .command('get')
    .description('Get dataset information')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: DatasetGetOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
      }),
    );

  // eval run-topics list
  evalCmd
    .command('run-topics')
    .description('Manage run topics')
    .command('list')
    .description('List topics in a run')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--only-external', 'Only return topics pending external evaluation')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicsListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.runTopicsList.query({
          // Normalize the optional flag to an explicit boolean for the API.
          onlyExternal: Boolean(options.onlyExternal),
          runId: options.runId,
        });
      }),
    );

  // eval threads list
  evalCmd
    .command('threads')
    .description('Manage evaluation threads')
    .command('list')
    .description('List threads by topic')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: ThreadsListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
      }),
    );

  // eval messages list
  evalCmd
    .command('messages')
    .description('Manage evaluation messages')
    .command('list')
    .description('List messages by topic and optional thread')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: MessagesListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.messagesList.query({
          threadId: options.threadId,
          topicId: options.topicId,
        });
      }),
    );

  // eval test-cases count
  evalCmd
    .command('test-cases')
    .description('Manage evaluation test cases')
    .command('count')
    .description('Count test cases by dataset')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: TestCasesCountOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
      }),
    );

  // eval run-topic report-result — submits one externally-computed score.
  evalCmd
    .command('run-topic')
    .description('Manage evaluation run-topic reporting')
    .command('report-result')
    .description('Report one evaluation result for a run topic')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID (required for k > 1)')
    .requiredOption('--score <score>', 'Evaluation score', parseScore)
    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicReportResultOptions) =>
      executeCommand(
        options,
        async () => {
          const client = await getTrpcClient();
          return client.agentEvalExternal.runTopicReportResult.mutate({
            correct: options.correct,
            result: options.resultJson,
            runId: options.runId,
            score: options.score,
            threadId: options.threadId,
            topicId: options.topicId,
          });
        },
        `Reported result for topic ${pc.bold(options.topicId)}`,
      ),
    );
}

View File

@@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc';
import { registerFileCommand } from './commands/file';
import { registerGenerateCommand } from './commands/generate';
import { registerKbCommand } from './commands/kb';
import { registerEvalCommand } from './commands/eval';
import { registerLoginCommand } from './commands/login';
import { registerLogoutCommand } from './commands/logout';
import { registerMemoryCommand } from './commands/memory';
@@ -44,5 +45,6 @@ registerModelCommand(program);
registerProviderCommand(program);
registerPluginCommand(program);
registerConfigCommand(program);
registerEvalCommand(program);
program.parse();

View File

@@ -157,13 +157,15 @@
"difficulty.easy": "Easy",
"difficulty.hard": "Hard",
"difficulty.medium": "Medium",
"evalMode.answer-relevance": "LLM Relevance",
"evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)",
"evalMode.contains": "Contains Match",
"evalMode.contains.desc": "Output must contain the expected text",
"evalMode.equals": "Exact Match",
"evalMode.equals.desc": "Output must be exactly the same as expected",
"evalMode.label": "Eval Mode",
"evalMode.llm-rubric": "LLM Judge",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality",
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)",
"evalMode.placeholder": "Select eval mode",
"evalMode.prompt.label": "Judge Prompt",
"evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge",
@@ -256,12 +258,16 @@
"run.running.hint": "Evaluation is running, results will appear shortly...",
"run.status.aborted": "Aborted",
"run.status.completed": "Completed",
"run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.",
"run.status.error": "Run Error",
"run.status.external": "External",
"run.status.external.tooltip": "This evaluation is waiting for external scoring. Results will be updated when scoring is complete.",
"run.status.failed": "Failed",
"run.status.idle": "Idle",
"run.status.pending": "Pending",
"run.status.running": "Running",
"run.status.timeout": "Timeout",
"sidebar": "Evaluation",
"sidebar.benchmarks": "Benchmarks",
"sidebar.dashboard": "Dashboard",
"sidebar.datasets": "Datasets",

View File

@@ -161,6 +161,8 @@
"evalMode.contains.desc": "输出中必须包含期望的文本",
"evalMode.equals": "精确匹配",
"evalMode.equals.desc": "输出必须与期望内容完全一致",
"evalMode.external": "外部评估",
"evalMode.external.desc": "智能体完成运行后,由外部系统提交评估结果",
"evalMode.label": "评估模式",
"evalMode.llm-rubric": "LLM 评判",
"evalMode.llm-rubric.desc": "使用 LLM 评估输出质量",
@@ -256,7 +258,10 @@
"run.running.hint": "评测进行中,结果即将呈现...",
"run.status.aborted": "已终止",
"run.status.completed": "已完成",
"run.status.completed.tooltip": "评测已完成运行,所有结果已评估。",
"run.status.error": "运行出错",
"run.status.external": "待外部评测",
"run.status.external.tooltip": "智能体已完成运行,等待外部系统提交评估结果。",
"run.status.failed": "失败",
"run.status.idle": "待开始",
"run.status.pending": "等待中",

View File

@@ -50,6 +50,8 @@ export class AgentEvalDatasetModel {
benchmarkId: agentEvalDatasets.benchmarkId,
createdAt: agentEvalDatasets.createdAt,
description: agentEvalDatasets.description,
evalConfig: agentEvalDatasets.evalConfig,
evalMode: agentEvalDatasets.evalMode,
id: agentEvalDatasets.id,
identifier: agentEvalDatasets.identifier,
metadata: agentEvalDatasets.metadata,

View File

@@ -31,7 +31,7 @@ export class AgentEvalRunModel {
datasetId?: string;
limit?: number;
offset?: number;
status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted';
status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted' | 'external';
}) => {
const conditions = [eq(agentEvalRuns.userId, this.userId)];

View File

@@ -43,6 +43,7 @@ const evalModes = [
'similar',
'levenshtein',
'rubric',
'external',
] as const;
// ============================================
@@ -181,7 +182,7 @@ export const agentEvalRuns = pgTable(
name: text('name'),
status: text('status', {
enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted'],
enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'],
})
.default('idle')
.notNull(),
@@ -228,7 +229,7 @@ export const agentEvalRunTopics = pgTable(
.notNull(),
status: text('status', {
enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout'],
enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout', 'external', 'completed'],
}),
score: real('score'),

View File

@@ -87,12 +87,20 @@ export const evaluate = async (
const candidates: string[] = JSON.parse(expected);
const results: MatchResult[] = [];
for (const c of candidates) {
results.push(await match({ actual: extracted, expected: c, rubric }, matchContext));
results.push(
await match(
{ input: testCase.input, actual: extracted, expected: c, rubric },
matchContext,
),
);
}
const best = results.reduce((a, b) => (a.score >= b.score ? a : b));
result = best;
} else {
result = await match({ actual: extracted, expected, rubric }, matchContext);
result = await match(
{ input: testCase.input, actual: extracted, expected, rubric },
matchContext,
);
}
rubricResults.push({

View File

@@ -0,0 +1,9 @@
import type { MatchResult } from './types';
/**
 * Placeholder matcher for the `external` eval mode: the agent run has
 * finished, but scoring happens out-of-band, so report a non-passing zero
 * score with an "awaiting evaluation" reason until an external system
 * submits the real result.
 */
export const matchExternal = async (): Promise<MatchResult> => ({
  passed: false,
  score: 0,
  reason: 'Waiting for external evaluation...',
});

View File

@@ -4,8 +4,10 @@ import { matchAnyOf } from './anyOf';
import { matchContains } from './contains';
import { matchEndsWith } from './endsWith';
import { matchEquals } from './equals';
import { matchExternal } from './external';
import { matchJsonSchema } from './jsonSchema';
import { matchLevenshtein } from './levenshtein';
import { matchLLMEq } from './llmEq';
import { matchLLMRubric } from './llmRubric';
import { matchNumeric } from './numeric';
import { matchRegex } from './regex';
@@ -18,10 +20,15 @@ export type { GenerateObjectPayload, MatchContext, MatchResult } from './types';
* Run a single rubric matcher against actual vs expected
*/
export const match = async (
params: { actual: string; expected: string | undefined; rubric: EvalBenchmarkRubric },
params: {
input: string;
actual: string;
expected: string | undefined;
rubric: EvalBenchmarkRubric;
},
context?: MatchContext,
): Promise<MatchResult> => {
const { actual, expected, rubric } = params;
const { actual, expected, rubric, input } = params;
const { type, config } = rubric;
switch (type) {
@@ -57,6 +64,10 @@ export const match = async (
return matchLevenshtein(actual, expected, config);
}
case 'answer-relevance': {
return matchLLMEq(input, actual, expected, rubric, context);
}
case 'llm-rubric': {
return matchLLMRubric(actual, expected, rubric, context);
}
@@ -65,6 +76,10 @@ export const match = async (
return matchJsonSchema(actual, config);
}
case 'external': {
return matchExternal();
}
default: {
return {
passed: false,

View File

@@ -0,0 +1,89 @@
import type { EvalBenchmarkRubric, RubricConfigLLM } from '@lobechat/types';
import type { MatchContext, MatchResult } from './types';
// Default binary-grading instructions for the relevance judge
// (BrowseComp / HLE style). This text is sent verbatim to the judge model,
// i.e. it is runtime behavior — do not reword casually.
const DEFAULT_SYSTEM_ROLE = [
  'You are an expert evaluation judge. Your task is to score how well an AI output meets the given criteria.',
  'Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.',
  'Your judgement must be in the format and criteria specified below:',
  "extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.",
  'reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.',
  'Scoring rules:',
  'score: Return 1 only when extracted_final_answer clearly and unambiguously matches [correct_answer], or is within a small margin of error for numerical problems.',
  'score: Return 0 when extracted_final_answer is incorrect, missing, ambiguous, non-equivalent, or when you are uncertain.',
  'Treat uncertainty as incorrect (score = 0).',
  'Respond with a JSON object containing ',
  '"score" (number: 0 or 1)',
  'and "reason" (brief explanation for the judgement).',
].join('\n');

// Structured-output schema the judge model must satisfy: a binary score
// plus a brief reason.
const JUDGE_SCORE_SCHEMA: Record<string, unknown> = {
  additionalProperties: false,
  properties: {
    score: {
      description: 'Binary score for judgement: 1=correct, 0=incorrect/uncertain',
      enum: [0, 1],
      type: 'number',
    },
    reason: { description: 'Brief explanation for the judgement', type: 'string' },
  },
  required: ['score', 'reason'],
  type: 'object',
};

/**
 * Assemble the judge's user prompt from labelled sections. The
 * [correct_answer] section is appended only when a reference answer exists.
 */
function buildJudgeUserPrompt(
  question: string,
  actual: string,
  expected: string | undefined,
): string {
  const sections = [`[question]\n${question}`, `[response]\n${actual}`];
  if (expected) {
    sections.push(`[correct_answer]\n${expected}`);
  }
  return sections.join('\n\n');
}

/**
 * Binary LLM judge for the `answer-relevance` eval mode.
 *
 * Asks a judge model whether `actual` correctly answers `question` against
 * the optional reference `expected`, and maps the structured output to a
 * 0/1 MatchResult. Every judge failure (no client, no model, thrown error)
 * is reported as a non-passing zero score rather than thrown.
 */
export const matchLLMEq = async (
  question: string,
  actual: string,
  expected: string | undefined,
  rubric: EvalBenchmarkRubric,
  context?: MatchContext,
): Promise<MatchResult> => {
  if (!context?.generateObject) {
    return { passed: false, reason: 'LLM judge not available', score: 0 };
  }
  const cfg = rubric.config as RubricConfigLLM;
  // Per-rubric model wins; otherwise fall back to the context-wide judge model.
  const model = cfg.model || context.judgeModel;
  if (!model) {
    return { passed: false, reason: 'No judge model configured', score: 0 };
  }
  try {
    const judgement = await context.generateObject({
      messages: [
        { content: cfg.systemRole || DEFAULT_SYSTEM_ROLE, role: 'system' },
        { content: buildJudgeUserPrompt(question, actual, expected), role: 'user' },
      ],
      model,
      provider: cfg.provider,
      schema: JUDGE_SCORE_SCHEMA,
    });
    // Anything other than an explicit numeric 1 counts as incorrect.
    const isCorrect = judgement?.score === 1;
    return {
      passed: isCorrect,
      reason: judgement?.reason,
      score: isCorrect ? 1 : 0,
    };
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return { passed: false, reason: `LLM judge failed: ${detail}`, score: 0 };
  }
};

View File

@@ -64,6 +64,10 @@ export const matchLLMRubric = async (
schema: JUDGE_SCORE_SCHEMA,
});
if (!result?.score) {
return { passed: false, reason: 'LLM judge did not return a score', score: 0 };
}
const score = Math.max(0, Math.min(1, result.score));
const threshold = rubric.threshold ?? 0.6;

View File

@@ -34,7 +34,7 @@ export interface EvalTestCaseMetadata {
/**
* Evaluation run status
*/
export type EvalRunStatus = 'aborted' | 'completed' | 'failed' | 'pending' | 'running';
export type EvalRunStatus = 'aborted' | 'completed' | 'external' | 'failed' | 'pending' | 'running';
/**
* Evaluation run configuration
@@ -96,6 +96,7 @@ export interface EvalRunMetrics {
cost?: number;
duration?: number;
errorCases?: number;
externalCases?: number;
failedCases: number;
llmCalls?: number;
passAllK?: number;
@@ -183,6 +184,8 @@ export interface EvalRunTopicResult {
completionReason?: string;
operationId?: string;
rubricScores?: EvalRubricScore[];
/** Set when evalMode is 'external' — agent finished, awaiting external scoring */
awaitingExternalEval?: boolean;
}
/*eslint-enable perfectionist/sort-interfaces */
@@ -194,14 +197,16 @@ export interface EvalThreadResult {
cost?: number;
duration?: number;
error?: string;
llmCalls?: number;
operationId?: string;
passed?: boolean;
rubricScores?: EvalRubricScore[];
score?: number;
status?: 'error' | 'failed' | 'passed' | 'running' | 'timeout';
status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout' | 'completed';
steps?: number;
threadId: string;
tokens?: number;
toolCalls?: number;
}
/**

View File

@@ -11,6 +11,7 @@ export type AgentEvalRunStatus =
| 'failed'
| 'idle'
| 'pending'
| 'external'
| 'running';
export interface AgentEvalRunTargetAgent {

View File

@@ -22,6 +22,8 @@ export type RubricType =
// Similarity
| 'similar'
| 'levenshtein'
// External evaluation
| 'external'
// Composite
| 'rubric';

View File

@@ -66,9 +66,18 @@ export const { POST } = serve<FinalizeRunPayload>(
log('Metrics: %O', metrics);
// Step 4: Update run status (failed if all cases errored/timed out)
// Step 4: Update run status
// external: any topic awaits external scoring → whole run waits too
// failed: all cases are non-success (error/timeout)
// completed: everything else
const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
const externalCount = metrics.externalCases || 0;
const runStatus =
externalCount > 0
? 'external'
: nonSuccessCases >= metrics.totalCases
? 'failed'
: 'completed';
await context.run('agent-eval-run:update-run', async () => {
const runModel = new AgentEvalRunModel(db, userId);

View File

@@ -173,9 +173,14 @@ export default {
'evalMode.contains.desc': 'Output must contain the expected text',
'evalMode.equals': 'Exact Match',
'evalMode.equals.desc': 'Output must be exactly the same as expected',
'evalMode.external': 'External Eval',
'evalMode.external.desc': 'Agent runs to completion; scoring is handled by an external system',
'evalMode.label': 'Eval Mode',
'evalMode.llm-rubric': 'LLM Judge',
'evalMode.llm-rubric.desc': 'Use LLM to evaluate output quality',
'evalMode.llm-rubric.desc':
'Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)',
'evalMode.answer-relevance': 'LLM Relevance',
'evalMode.answer-relevance.desc': 'Use LLM to evaluate answer relevance (yes or no)',
'evalMode.placeholder': 'Select eval mode',
'evalMode.prompt.label': 'Judge Prompt',
'evalMode.prompt.placeholder': 'Enter the evaluation criteria or prompt for LLM judge',
@@ -204,6 +209,8 @@ export default {
'run.idle.hint': 'Click Start to begin evaluation',
'run.pending.hint': 'Evaluation is queued, waiting to start...',
'run.running.hint': 'Evaluation is running, results will appear shortly...',
'run.external.hint':
'Running completed. Waiting for external system to submit evaluation results ...',
'run.filter.active': 'Active',
'run.filter.empty': 'No runs match the current filter.',
@@ -249,6 +256,9 @@ export default {
'run.detail.report': 'Evaluation Summary',
'run.detail.config': 'Evaluation Config',
'run.detail.configSnapshot': 'Configuration Snapshot',
'run.detail.copyRunId': 'Copy Run ID',
'run.detail.copyRunIdFailed': 'Failed to copy Run ID',
'run.detail.copyRunIdSuccess': 'Run ID copied',
'run.detail.dataset': 'Dataset',
'run.detail.model': 'Model',
'run.detail.overview': 'Overview',
@@ -279,7 +289,11 @@ export default {
'run.status.aborted': 'Aborted',
'run.status.completed': 'Completed',
'run.status.completed.tooltip': 'The run and external scoring are completed.',
'run.status.error': 'Run Error',
'run.status.external': 'Awaiting Eval',
'run.status.external.tooltip':
'The agent has finished running. Waiting for an external system to submit evaluation results.',
'run.status.failed': 'Failed',
'run.status.idle': 'Idle',
'run.status.pending': 'Pending',

View File

@@ -208,6 +208,7 @@ const DatasetDetail = memo(() => {
}}
>
<TestCaseTable
datasetEvalMode={dataset?.evalMode}
diffFilter={diffFilter}
pagination={pagination}
search={search}

View File

@@ -238,6 +238,7 @@ const DatasetCard = memo<DatasetCardProps>(
) : (
<TestCaseTable
readOnly
datasetEvalMode={dataset.evalMode}
diffFilter={diffFilter}
pagination={pagination}
search={search}

View File

@@ -83,6 +83,7 @@ const styles = createStaticStyles(({ css, cssVar }) => ({
}));
interface TestCaseTableProps {
datasetEvalMode?: string | null;
diffFilter: 'all' | 'easy' | 'medium' | 'hard';
onAddCase?: () => void;
onDelete?: (testCase: any) => void;
@@ -106,6 +107,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
total,
search,
diffFilter,
datasetEvalMode,
pagination,
onSearchChange,
onDiffFilterChange,
@@ -170,10 +172,18 @@ const TestCaseTable = memo<TestCaseTableProps>(
dataIndex: 'evalMode',
key: 'evalMode',
render: (text: string) => {
if (!text) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
const effective = text ?? datasetEvalMode;
if (!effective) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
const isInherited = !text && !!datasetEvalMode;
return (
<span style={{ color: cssVar.colorTextSecondary, fontSize: 12 }}>
{t(`evalMode.${text}` as any)}
<span
style={{
color: isInherited ? cssVar.colorTextQuaternary : cssVar.colorTextSecondary,
fontSize: 12,
fontStyle: isInherited ? 'italic' : 'normal',
}}
>
{t(`evalMode.${effective}` as any)}
</span>
);
},
@@ -238,7 +248,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
}
return base;
}, [pagination, readOnly, onEdit, onDelete, t]);
}, [pagination, readOnly, onEdit, onDelete, t, datasetEvalMode]);
return (
<>

View File

@@ -67,6 +67,8 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
const { t } = useTranslation('eval');
const status: string | null | undefined = record.status;
// return <div>{status}</div>;
if (!status || status === 'pending')
return <Badge status="default" text={<BadgeText>{t('run.status.pending')}</BadgeText>} />;
@@ -86,6 +88,17 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
if (status === 'timeout')
return <Badge color="orange" text={<BadgeText>{t('run.status.timeout')}</BadgeText>} />;
if (status === 'external') {
const badge = <Badge color="purple" text={<BadgeText>{t('run.status.external')}</BadgeText>} />;
return <Tooltip title={t('run.status.external.tooltip')}>{badge}</Tooltip>;
}
if (status === 'completed') {
// 'completed' means the run finished AND evaluation finished; it does not imply the result passed
const badge = <Badge color="blue" text={<BadgeText>{t('run.status.completed')}</BadgeText>} />;
return <Tooltip title={t('run.status.completed.tooltip')}>{badge}</Tooltip>;
}
return <Badge status="default" text={<BadgeText>{status}</BadgeText>} />;
});
@@ -99,15 +112,29 @@ const ThreadDots = memo<{ threads: EvalThreadResult[] }>(({ threads }) => (
if (thread.passed === true) {
color = cssVar.colorSuccess;
} else if (thread.passed === false) {
color = cssVar.colorError;
}
if (thread.status === 'external') {
color = cssVar.colorWarning;
}
if (thread.status === 'completed') {
color = cssVar.colorPrimary;
}
const label = thread.error
? 'error'
: thread.passed === true
? 'passed'
: thread.passed === false
: thread.passed === false && thread.status !== 'completed'
? 'failed'
: 'pending';
: thread.status === 'external'
? 'Awaiting for external evaluation'
: thread.status === 'completed'
? 'completed'
: 'pending';
return (
<Tooltip key={thread.threadId} title={label}>
@@ -406,6 +433,8 @@ const CaseResultsTable = memo<CaseResultsTableProps>(
{ label: t('table.filter.error'), value: 'error' },
{ label: t('table.filter.running'), value: 'running' },
{ label: t('run.status.pending'), value: 'pending' },
{ label: t('run.status.external'), value: 'external' },
{ label: t('run.status.completed'), value: 'completed' },
]}
onChange={setStatusFilter}
/>

View File

@@ -96,7 +96,7 @@ const useStyles = createStyles(({ css, token }) => ({
`,
}));
const PendingState = memo(() => {
const PendingState = memo(({ hint }: { hint?: string }) => {
const { t } = useTranslation('eval');
const { cx, styles } = useStyles();
@@ -119,7 +119,7 @@ const PendingState = memo(() => {
<Icon icon={Clock} size={18} />
</div>
</div>
<div className={styles.hint}>{t('run.pending.hint')}</div>
<div className={styles.hint}>{hint}</div>
</div>
);
});

View File

@@ -2,10 +2,19 @@
import { AGENT_PROFILE_URL } from '@lobechat/const';
import type { AgentEvalRunDetail } from '@lobechat/types';
import { ActionIcon, Avatar, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
import { ActionIcon, Avatar, copyToClipboard, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
import { App, Button, Card, Tag, Typography } from 'antd';
import { createStyles } from 'antd-style';
import { ArrowLeft, ChevronDown, ChevronUp, Pencil, Play, Square, Trash2 } from 'lucide-react';
import {
ArrowLeft,
ChevronDown,
ChevronUp,
Copy,
Pencil,
Play,
Square,
Trash2,
} from 'lucide-react';
import { memo, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { Link, useNavigate } from 'react-router-dom';
@@ -170,6 +179,14 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank');
}
};
const handleCopyRunId = async () => {
try {
await copyToClipboard(run.id);
message.success(t('run.detail.copyRunIdSuccess'));
} catch {
message.error(t('run.detail.copyRunIdFailed'));
}
};
const formatDate = (date?: Date | string) => {
if (!date) return '';
@@ -194,6 +211,12 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
<Typography.Title level={4} style={{ margin: 0 }}>
{run.name || run.id.slice(0, 8)}
</Typography.Title>
<ActionIcon
icon={Copy}
size="small"
title={t('run.detail.copyRunId')}
onClick={handleCopyRunId}
/>
<StatusBadge status={run.status} />
</Flexbox>
{/* Meta info row */}

View File

@@ -104,7 +104,9 @@ const RunDetail = memo(() => {
{runDetail.status === 'running' ? (
<RunningState />
) : runDetail.status === 'pending' ? (
<PendingState />
<PendingState hint={t('run.pending.hint')} />
) : runDetail.status === 'external' ? (
<PendingState hint={t('run.external.hint')} />
) : (
<IdleState run={runDetail} />
)}

View File

@@ -36,6 +36,26 @@ export interface DatasetPreset {
}
export const DATASET_PRESETS: Record<string, DatasetPreset> = {
'browsecomp': {
id: 'browsecomp',
category: 'research',
name: 'BrowseComp',
description: 'Measuring the ability for agents to browse the web, comprises 1,266 questions.',
icon: Globe,
formatDescription: 'format: Topic (category/tags), Question (input), Answer (expected)',
requiredFields: ['question', 'answer', 'problem_topic', 'canary'],
optionalFields: [],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['problem_topic'],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
// === Deep Research / QA Category ===
'browsecomp-zh': {
id: 'browsecomp-zh',
@@ -58,6 +78,129 @@ export const DATASET_PRESETS: Record<string, DatasetPreset> = {
},
},
'widesearch': {
id: 'widesearch',
category: 'research',
name: 'WideSearch',
description:
'Evaluating the capabilities of agents in broad information-seeking tasks, consisting of 200 questions.',
icon: Globe,
formatDescription: 'format: instance_id, query (input), evaluation (expected), language',
requiredFields: ['instance_id', 'query', 'evaluation', 'language'],
optionalFields: [],
fieldInference: {
input: ['query'],
expected: ['evaluation'],
choices: [],
category: ['language'],
sortOrder: [],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
'hle-text': {
id: 'hle-text',
category: 'research',
name: "Humanity's Last Exam, HLE (Text Only)",
description:
"Humanity's Last Exam (HLE) is a multi-modal benchmark at the frontier of human knowledge, consisting of 2150 questions.",
icon: Globe,
formatDescription:
'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category',
requiredFields: [
'id',
'question',
'answer',
'answer_type',
'rationale',
'raw_subject',
'category',
],
optionalFields: ['canary'],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['category'],
},
},
'hle-verified': {
id: 'hle-verified',
category: 'research',
name: "Humanity's Last Exam, HLE (Verified Answers)",
description:
"A subset of Humanity's Last Exam (HLE) with verified answers, designed to evaluate the ability to produce correct answers rather than just plausible ones.",
icon: Globe,
formatDescription:
'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category, Verified_Classes',
requiredFields: [
'id',
'question',
'answer',
'answer_type',
'rationale',
'raw_subject',
'category',
'Verified_Classes',
],
optionalFields: ['canary'],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['category'],
},
},
'deepsearchqa': {
id: 'deepsearchqa',
category: 'research',
name: 'DeepSearchQA',
description:
'A 900-prompt factuality benchmark from Google DeepMind, designed to evaluate agents on difficult multi-step information-seeking tasks across 17 different fields.',
icon: Globe,
formatDescription: 'problem, problem_category, answer, answer_type',
requiredFields: ['problem', 'answer', 'problem_category', 'answer_type'],
optionalFields: [],
fieldInference: {
input: ['problem'],
expected: ['answer'],
choices: [],
category: ['problem_category'],
sortOrder: [],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
'sealqa': {
id: 'sealqa',
category: 'research',
name: 'SealQA',
description:
'SealQA is a new challenge benchmark for evaluating SEarch- Augmented Language models on fact-seeking questions where web search yields conflicting, noisy, or unhelpful results.',
icon: Globe,
formatDescription: 'format: question (input), answer (expected), topic (category)',
requiredFields: ['question', 'answer', 'topic', 'canary'],
optionalFields: [],
fieldInference: {
input: ['question'],
expected: ['answer'],
choices: [],
category: ['topic'],
},
validation: {
requireExpected: true,
expectedFormat: 'string',
},
},
'xbench': {
id: 'xbench',
category: 'research',

View File

@@ -157,6 +157,7 @@ const DatasetCreateModal = memo<DatasetCreateModalProps>(
{ label: t('evalMode.equals'), value: 'equals' },
{ label: t('evalMode.contains'), value: 'contains' },
{ label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
{ label: t('evalMode.external'), value: 'external' },
]}
/>
</Form.Item>

View File

@@ -131,14 +131,30 @@ const DatasetEditModal = memo<DatasetEditModalProps>(({ open, onCancel, dataset,
{ label: t('evalMode.equals'), value: 'equals' },
{ label: t('evalMode.contains'), value: 'contains' },
{ label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
{ label: t('evalMode.answer-relevance'), value: 'answer-relevance' },
{ label: t('evalMode.external'), value: 'external' },
]}
/>
</Form.Item>
{evalModeValue === 'llm-rubric' && (
<Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
<TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
</Form.Item>
{(evalModeValue === 'llm-rubric' || evalModeValue === 'answer-relevance') && (
<>
<Form.Item initialValue="aihubmix" label={'Provider'} name={['evalConfig', 'provider']}>
<TextArea placeholder={'LLM provider (e.g. openai, azure)'} rows={1} />
</Form.Item>
<Form.Item initialValue="gpt-5-nano" label={'Model'} name={['evalConfig', 'model']}>
<TextArea placeholder={'LLM model to use for evaluation (e.g. gpt-4)'} rows={1} />
</Form.Item>
<Form.Item label={'System Prompt'} name={['evalConfig', 'systemRole']}>
<TextArea placeholder={'Optional system prompt for the LLM judge'} rows={3} />
</Form.Item>
<Form.Item label={'Eval Prompt'} name={['evalConfig', 'criteria']}>
<TextArea placeholder={'Prompt template for the LLM judge'} rows={3} />
</Form.Item>
<Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
<TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
</Form.Item>
</>
)}
<Form.Item label={t('dataset.create.preset.label')} style={{ marginBottom: 0 }}>

View File

@@ -92,6 +92,14 @@ const autoInferMapping = (
? new Set(preset.fieldInference.sortOrder.map((s) => s.toLowerCase()))
: SORT_ORDER_CANDIDATES;
const requiredCandidates = new Set<string>(
preset ? preset.requiredFields.map((s) => s.toLowerCase()) : [],
);
const optionalCandidates = new Set<string>(
preset ? preset.optionalFields.map((s) => s.toLowerCase()) : [],
);
for (const h of headers) {
const lower = h.toLowerCase().trim();
if (!inputFound && inputCandidates.has(lower)) {
@@ -109,6 +117,10 @@ const autoInferMapping = (
} else if (!sortOrderFound && sortOrderCandidates.has(lower)) {
result[h] = 'sortOrder';
sortOrderFound = true;
} else if (requiredCandidates.has(lower) || optionalCandidates.has(lower)) {
// If the field was claimed by the config but not matched by any candidate,
// assign it to metadata to ensure it's not missed
result[h] = 'metadata';
} else {
result[h] = 'ignore';
}

View File

@@ -2,13 +2,14 @@
import { Icon } from '@lobehub/ui';
import { createStaticStyles } from 'antd-style';
import { Activity, CheckCircle2, Clock, Pause, XCircle } from 'lucide-react';
import { Activity, CheckCircle2, Clock, Hourglass, Pause, XCircle } from 'lucide-react';
import { memo } from 'react';
import { useTranslation } from 'react-i18next';
const statusConfig: Record<string, { cls: string; icon: any }> = {
aborted: { cls: 'default', icon: Pause },
completed: { cls: 'success', icon: CheckCircle2 },
external: { cls: 'warning', icon: Hourglass },
failed: { cls: 'error', icon: XCircle },
idle: { cls: 'default', icon: Clock },
pending: { cls: 'warning', icon: Clock },

View File

@@ -33,6 +33,7 @@ const rubricTypeSchema = z.enum([
'similar',
'levenshtein',
'rubric',
'external',
]);
const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passthrough();
@@ -621,7 +622,9 @@ export const agentEvalRouter = router({
z.object({
benchmarkId: z.string().optional(),
datasetId: z.string().optional(),
status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']).optional(),
status: z
.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'])
.optional(),
limit: z.number().min(1).max(100).default(50).optional(),
offset: z.number().min(0).default(0).optional(),
}),
@@ -871,7 +874,15 @@ export const agentEvalRouter = router({
.input(
z.object({
id: z.string(),
status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']),
status: z.enum([
'idle',
'pending',
'running',
'completed',
'failed',
'aborted',
'external',
]),
}),
)
.mutation(async ({ input, ctx }) => {

View File

@@ -0,0 +1,514 @@
import type { EvalRunTopicResult, EvalThreadResult } from '@lobechat/types';
import { TRPCError } from '@trpc/server';
import { and, asc, eq, isNull } from 'drizzle-orm';
import { z } from 'zod';
import {
AgentEvalDatasetModel,
AgentEvalRunModel,
AgentEvalRunTopicModel,
AgentEvalTestCaseModel,
} from '@/database/models/agentEval';
import { ThreadModel } from '@/database/models/thread';
import { messages } from '@/database/schemas';
import { authedProcedure, router } from '@/libs/trpc/lambda';
import { serverDatabase } from '@/libs/trpc/lambda/middleware';
import { AgentEvalRunService } from '@/server/services/agentEvalRun';
// Full lifecycle states an eval run can be in. 'external' marks a run whose
// agent execution has finished and which is awaiting scores from an external
// evaluation system.
const runStatusSchema = z.enum([
  'idle',
  'pending',
  'running',
  'completed',
  'failed',
  'aborted',
  'external',
]);
// One external scoring result for a single run topic. `threadId` is only
// meaningful (and required by applyReportResult) when the run was executed
// with k > 1 attempts per topic.
const reportResultItemSchema = z.object({
  correct: z.boolean(),
  result: z.record(z.unknown()).optional(),
  score: z.number(),
  threadId: z.string().optional(),
  topicId: z.string(),
});
/**
 * Serialize an optional Date to an ISO-8601 string.
 * Nullish input (null/undefined) passes through as undefined.
 */
const toIsoString = (value?: Date | null) => {
  if (!value) return undefined;
  return value.toISOString();
};
// Base procedure for every external-eval endpoint: requires an authenticated
// user, attaches a server DB connection, and injects the per-user models and
// services the routes below operate on.
const agentEvalExternalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
  const { ctx } = opts;
  return opts.next({
    ctx: {
      datasetModel: new AgentEvalDatasetModel(ctx.serverDB, ctx.userId),
      runModel: new AgentEvalRunModel(ctx.serverDB, ctx.userId),
      runService: new AgentEvalRunService(ctx.serverDB, ctx.userId),
      runTopicModel: new AgentEvalRunTopicModel(ctx.serverDB, ctx.userId),
      testCaseModel: new AgentEvalTestCaseModel(ctx.serverDB, ctx.userId),
      threadModel: new ThreadModel(ctx.serverDB, ctx.userId),
    },
  });
});
// Payload accepted by applyReportResult: one reported item plus the run it belongs to.
type ReportResultInput = z.infer<typeof reportResultItemSchema> & { runId: string };

/**
 * Re-derive a run's aggregate metrics and status from its current topics.
 *
 * Reads the run and all of its run-topics fresh from the DB, recomputes
 * metrics via runService.evaluateAndFinalizeRun, then decides the run status:
 * - 'external' if any topic is still awaiting external evaluation
 *   (status === 'external' or evalResult.awaitingExternalEval === true);
 * - 'failed' if every case ended in error or timeout;
 * - 'completed' otherwise.
 *
 * Returns the new status, or undefined when the run no longer exists.
 */
const recomputeRunAggregation = async (
  ctx: {
    runModel: AgentEvalRunModel;
    runService: AgentEvalRunService;
    runTopicModel: AgentEvalRunTopicModel;
  },
  runId: string,
) => {
  const refreshedRun = await ctx.runModel.findById(runId);
  if (!refreshedRun) return undefined;

  const refreshedTopics = await ctx.runTopicModel.findByRunId(runId);

  const metrics = await ctx.runService.evaluateAndFinalizeRun({
    run: {
      config: refreshedRun.config,
      id: refreshedRun.id,
      metrics: refreshedRun.metrics,
      startedAt: refreshedRun.startedAt,
    },
    runTopics: refreshedTopics,
  });

  // A topic may flag pending external scoring either via its status column or
  // via the awaitingExternalEval marker inside its stored evalResult.
  const hasAwaitingExternal = refreshedTopics.some(
    (topic) =>
      topic.status === 'external' ||
      (topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true,
  );

  const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
  const status = hasAwaitingExternal
    ? 'external'
    : nonSuccessCases >= metrics.totalCases
      ? 'failed'
      : 'completed';

  await ctx.runModel.update(runId, { metrics, status });
  return status;
};
/**
 * Persist one external scoring result for a run topic.
 *
 * Two modes, chosen by the run's configured k (attempts per topic):
 * - k > 1: the score applies to a single thread (threadId required). The
 *   topic stays 'external' until every thread has a completed score, at which
 *   point the topic is finalized with the average score, pass@k (any thread
 *   passed) and pass^k (all threads passed).
 * - k === 1: the score finalizes the topic directly.
 *
 * Repeated identical reports are detected and marked `idempotent` instead of
 * re-applied. When `recomputeRun` is true the run-level aggregation is
 * refreshed afterwards via recomputeRunAggregation (batch callers pass false
 * and recompute once at the end).
 *
 * @throws TRPCError NOT_FOUND when the run, topic, or thread does not exist;
 *         BAD_REQUEST when k > 1 and threadId is missing, or the topic has no threads.
 */
const applyReportResult = async (
  ctx: {
    runModel: AgentEvalRunModel;
    runTopicModel: AgentEvalRunTopicModel;
    runService: AgentEvalRunService;
    threadModel: ThreadModel;
  },
  input: ReportResultInput,
  recomputeRun: boolean,
) => {
  const run = await ctx.runModel.findById(input.runId);
  if (!run) {
    throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
  }
  const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
  const runTopic = runTopics.find((item) => item.topicId === input.topicId);
  if (!runTopic) {
    throw new TRPCError({ code: 'NOT_FOUND', message: 'Run topic not found' });
  }

  const runK = run.config?.k ?? 1;
  // External scores are stored as a single synthetic rubric entry.
  const rubricScores = [{ rubricId: 'external', score: input.score }];
  const existingEvalResult = (runTopic.evalResult ?? {}) as EvalRunTopicResult &
    Record<string, unknown>;
  const externalResult = input.result ?? {};

  let idempotent = false;
  let reportedThreads: number;
  let totalThreads: number;
  let topicFinalized: boolean;

  if (runK > 1) {
    if (!input.threadId) {
      throw new TRPCError({
        code: 'BAD_REQUEST',
        message: 'threadId is required when k > 1',
      });
    }

    // Prefer eval-type threads; fall back to all threads if none are tagged 'eval'.
    const allThreads = await ctx.threadModel.queryByTopicId(input.topicId);
    const evalThreads = allThreads.filter((thread) => thread.type === 'eval');
    const sourceThreads = evalThreads.length > 0 ? evalThreads : allThreads;
    if (sourceThreads.length === 0) {
      throw new TRPCError({
        code: 'BAD_REQUEST',
        message: 'No threads found for this topic',
      });
    }

    // Start from previously stored thread results, or seed every known thread
    // as 'external' (awaiting a score) on the first report.
    const threads: EvalThreadResult[] =
      (existingEvalResult.threads as EvalThreadResult[] | undefined)?.map((thread) => ({
        ...thread,
      })) ??
      sourceThreads.map((thread) => ({
        status: 'external',
        threadId: thread.id,
      }));

    let targetIndex = threads.findIndex((thread) => thread.threadId === input.threadId);
    if (targetIndex < 0) {
      // Thread not in the stored list yet — accept it only if it truly belongs
      // to this topic, then append it.
      const existsInTopic = sourceThreads.some((thread) => thread.id === input.threadId);
      if (!existsInTopic) {
        throw new TRPCError({
          code: 'NOT_FOUND',
          message: 'Thread not found for this topic',
        });
      }
      threads.push({ status: 'external', threadId: input.threadId });
      targetIndex = threads.length - 1;
    }

    totalThreads = threads.length;
    const targetThread = threads[targetIndex];
    // Idempotency: an identical (status/score/passed) report is a no-op.
    const alreadyReported =
      targetThread.status === 'completed' &&
      targetThread.score === input.score &&
      targetThread.passed === input.correct;

    if (alreadyReported) {
      idempotent = true;
    } else {
      threads[targetIndex] = {
        ...targetThread,
        passed: input.correct,
        rubricScores,
        score: input.score,
        status: 'completed',
      };

      const existingThreadResults = (existingEvalResult.externalThreadResults ?? {}) as Record<
        string,
        unknown
      >;
      const nextEvalResult = {
        ...existingEvalResult,
        awaitingExternalEval: true,
        externalThreadResults: {
          ...existingThreadResults,
          [input.threadId]: externalResult,
        },
        threads,
      } satisfies EvalRunTopicResult & Record<string, unknown>;

      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
        evalResult: nextEvalResult,
        status: 'external',
      });
    }

    reportedThreads = threads.filter(
      (thread) => thread.status === 'completed' && typeof thread.score === 'number',
    ).length;
    topicFinalized = reportedThreads >= totalThreads;

    if (topicFinalized) {
      // All threads scored: finalize the topic with average score and
      // pass@k / pass^k aggregates.
      // NOTE(review): this branch also runs on idempotent replays, re-writing
      // externalThreadResults[threadId] with the replayed payload — confirm intended.
      const finalThreads = threads;
      const totalScore = finalThreads.reduce((acc, thread) => acc + (thread.score ?? 0), 0);
      const avgScore = totalScore / finalThreads.length;
      const passAtK = finalThreads.some((thread) => thread.passed === true);
      const passAllK = finalThreads.every((thread) => thread.passed === true);

      const existingThreadResults = (existingEvalResult.externalThreadResults ?? {}) as Record<
        string,
        unknown
      >;
      const nextEvalResult = {
        ...existingEvalResult,
        awaitingExternalEval: false,
        externalThreadResults: {
          ...existingThreadResults,
          [input.threadId]: externalResult,
        },
        passAllK,
        passAtK,
        rubricScores: [{ rubricId: 'external', score: avgScore }],
        threads: finalThreads,
      } satisfies EvalRunTopicResult & Record<string, unknown>;

      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
        evalResult: nextEvalResult,
        passed: passAtK,
        score: avgScore,
        status: passAtK ? 'passed' : 'failed',
      });
    }
  } else {
    // k === 1: a single report finalizes the topic immediately.
    const alreadyReported =
      runTopic.status === (input.correct ? 'passed' : 'failed') &&
      runTopic.score === input.score &&
      runTopic.passed === input.correct;

    if (alreadyReported) {
      idempotent = true;
    } else {
      const nextEvalResult = {
        ...existingEvalResult,
        awaitingExternalEval: false,
        externalResult,
        rubricScores,
      } satisfies EvalRunTopicResult & Record<string, unknown>;

      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
        evalResult: nextEvalResult,
        passed: input.correct,
        score: input.score,
        status: input.correct ? 'passed' : 'failed',
      });
    }

    reportedThreads = 1;
    totalThreads = 1;
    topicFinalized = true;
  }

  let runStatus: string | undefined;
  if (recomputeRun) {
    runStatus = await recomputeRunAggregation(ctx, input.runId);
  }

  return {
    idempotent,
    reportedThreads,
    runId: input.runId,
    runStatus,
    success: true,
    threadId: input.threadId,
    topicFinalized,
    topicId: input.topicId,
    totalThreads,
  };
};
/**
 * tRPC router exposing the external-scoring API: read access to a run's
 * dataset, topics, threads and messages, plus mutations for an external
 * system to report per-topic scores and finalize the run status.
 */
export const agentEvalExternalRouter = router({
  // Fetch dataset identity + metadata for a given dataset id.
  datasetGet: agentEvalExternalProcedure
    .input(z.object({ datasetId: z.string() }))
    .query(async ({ ctx, input }) => {
      const dataset = await ctx.datasetModel.findById(input.datasetId);
      if (!dataset) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Dataset not found' });
      }
      const metadata = (dataset.metadata ?? {}) as Record<string, unknown>;
      return {
        benchmarkId: dataset.benchmarkId,
        id: dataset.id,
        identifier: dataset.identifier,
        metadata,
        name: dataset.name,
      };
    }),

  // List a topic's messages (optionally scoped to one thread) in creation
  // order; only the caller's own top-level messages (no messageGroupId).
  messagesList: agentEvalExternalProcedure
    .input(z.object({ threadId: z.string().optional(), topicId: z.string() }))
    .query(async ({ ctx, input }) => {
      const conditions = [
        eq(messages.userId, ctx.userId),
        eq(messages.topicId, input.topicId),
        isNull(messages.messageGroupId),
      ];
      if (input.threadId) conditions.push(eq(messages.threadId, input.threadId));
      const rows = await ctx.serverDB
        .select({
          content: messages.content,
          createdAt: messages.createdAt,
          id: messages.id,
          role: messages.role,
          threadId: messages.threadId,
          topicId: messages.topicId,
        })
        .from(messages)
        .where(and(...conditions))
        .orderBy(asc(messages.createdAt));
      return rows.map((row) => ({
        content: row.content,
        createdAt: toIsoString(row.createdAt),
        id: row.id,
        role: row.role,
        threadId: row.threadId,
        topicId: row.topicId,
      }));
    }),

  // Report a single score and recompute run aggregation immediately.
  // (Same input/behavior as runTopicReportResult below.)
  reportResult: agentEvalExternalProcedure
    .input(
      z.object({
        correct: z.boolean(),
        result: z.record(z.unknown()).optional(),
        runId: z.string(),
        score: z.number(),
        threadId: z.string().optional(),
        topicId: z.string(),
      }),
    )
    .mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),

  // Report many scores for one run; aggregation is recomputed once at the end
  // (each item is applied with recomputeRun = false).
  reportResultsBatch: agentEvalExternalProcedure
    .input(z.object({ items: z.array(reportResultItemSchema).min(1), runId: z.string() }))
    .mutation(async ({ ctx, input }) => {
      const receipts = [];
      for (const item of input.items) {
        receipts.push(await applyReportResult(ctx, { ...item, runId: input.runId }, false));
      }
      const runStatus = await recomputeRunAggregation(ctx, input.runId);
      return {
        items: receipts,
        runId: input.runId,
        runStatus,
        success: true,
      };
    }),

  // Fetch a run's core fields; config.k defaults to 1 when unset.
  runGet: agentEvalExternalProcedure
    .input(z.object({ runId: z.string() }))
    .query(async ({ ctx, input }) => {
      const run = await ctx.runModel.findById(input.runId);
      if (!run) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
      }
      const config = { ...run.config, k: run.config?.k ?? 1 };
      return {
        config,
        createdAt: run.createdAt,
        datasetId: run.datasetId,
        id: run.id,
        metrics: run.metrics ?? undefined,
        name: run.name,
        startedAt: run.startedAt,
        status: run.status,
        targetAgentId: run.targetAgentId,
      };
    }),

  // Transition a run between 'external' and 'completed'. Completing requires
  // that no topic is still awaiting external evaluation, and re-finalizes
  // metrics before updating.
  runSetStatus: agentEvalExternalProcedure
    .input(z.object({ runId: z.string(), status: runStatusSchema }))
    .mutation(async ({ ctx, input }) => {
      const run = await ctx.runModel.findById(input.runId);
      if (!run) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
      }
      // Only the two external-eval states may be set through this endpoint.
      if (input.status !== 'completed' && input.status !== 'external') {
        throw new TRPCError({
          code: 'BAD_REQUEST',
          message: 'External endpoint only supports setting status to completed or external',
        });
      }
      // And only runs already in (or past) external scoring may be touched.
      if (run.status !== 'external' && run.status !== 'completed') {
        throw new TRPCError({
          code: 'BAD_REQUEST',
          message: `Only external runs can be finalized via this endpoint. current=${run.status}`,
        });
      }
      if (input.status === 'completed') {
        const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
        const hasAwaitingExternal = runTopics.some(
          (topic) =>
            topic.status === 'external' ||
            (topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true,
        );
        if (hasAwaitingExternal) {
          throw new TRPCError({
            code: 'BAD_REQUEST',
            message: 'Cannot set run to completed while external evaluation is pending',
          });
        }
        const metrics = await ctx.runService.evaluateAndFinalizeRun({
          run: { config: run.config, id: run.id, metrics: run.metrics, startedAt: run.startedAt },
          runTopics,
        });
        const updated = await ctx.runModel.update(input.runId, { metrics, status: 'completed' });
        return {
          metrics,
          runId: input.runId,
          status: updated?.status ?? 'completed',
          success: true,
        };
      }
      // input.status === 'external': simple status write, no metric refresh.
      const updated = await ctx.runModel.update(input.runId, { status: 'external' });
      return {
        runId: input.runId,
        status: updated?.status ?? 'external',
        success: true,
      };
    }),

  // Alias of reportResult: report one score and recompute aggregation.
  runTopicReportResult: agentEvalExternalProcedure
    .input(
      z.object({
        correct: z.boolean(),
        result: z.record(z.unknown()).optional(),
        runId: z.string(),
        score: z.number(),
        threadId: z.string().optional(),
        topicId: z.string(),
      }),
    )
    .mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),

  // List a run's topics with eval state; onlyExternal filters to topics still
  // awaiting external scoring.
  runTopicsList: agentEvalExternalProcedure
    .input(z.object({ onlyExternal: z.boolean().default(false).optional(), runId: z.string() }))
    .query(async ({ ctx, input }) => {
      const run = await ctx.runModel.findById(input.runId);
      if (!run) {
        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
      }
      const allRunTopics = await ctx.runTopicModel.findByRunId(input.runId);
      const runTopics = input.onlyExternal
        ? allRunTopics.filter((topic) => topic.status === 'external')
        : allRunTopics;
      return runTopics.map((topic) => {
        const testCase = topic.testCase;
        return {
          createdAt: topic.createdAt,
          evalResult: topic.evalResult,
          passed: topic.passed,
          runId: topic.runId,
          score: topic.score,
          status: topic.status,
          testCase,
          testCaseId: topic.testCaseId,
          topic: topic.topic,
          topicId: topic.topicId,
        };
      });
    }),

  // Count the test cases in a dataset.
  testCasesCount: agentEvalExternalProcedure
    .input(z.object({ datasetId: z.string() }))
    .query(async ({ ctx, input }) => {
      const count = await ctx.testCaseModel.countByDatasetId(input.datasetId);
      return { count };
    }),

  // List a topic's threads (id/topicId/type only).
  threadsList: agentEvalExternalProcedure
    .input(z.object({ topicId: z.string() }))
    .query(async ({ ctx, input }) => {
      const threads = await ctx.threadModel.queryByTopicId(input.topicId);
      return threads.map((thread) => ({
        id: thread.id,
        topicId: thread.topicId,
        type: thread.type,
      }));
    }),
});

View File

@@ -12,6 +12,7 @@ import { agentRouter } from './agent';
import { agentBotProviderRouter } from './agentBotProvider';
import { agentCronJobRouter } from './agentCronJob';
import { agentEvalRouter } from './agentEval';
import { agentEvalExternalRouter } from './agentEvalExternal';
import { agentGroupRouter } from './agentGroup';
import { agentSkillsRouter } from './agentSkills';
import { aiAgentRouter } from './aiAgent';
@@ -57,6 +58,7 @@ export const lambdaRouter = router({
agentBotProvider: agentBotProviderRouter,
agentCronJob: agentCronJobRouter,
agentEval: agentEvalRouter,
agentEvalExternal: agentEvalExternalRouter,
agentSkills: agentSkillsRouter,
aiAgent: aiAgentRouter,
aiChat: aiChatRouter,

View File

@@ -512,6 +512,7 @@ export class AgentEvalRunService {
const passedCases = allTopics.filter((t) => t.status === 'passed').length;
const failedCases = allTopics.filter((t) => t.status === 'failed').length;
const errorCases = allTopics.filter((t) => t.status === 'error').length;
const externalCasesRT = allTopics.filter((t) => t.status === 'external').length;
const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
let sumCost = 0;
@@ -556,6 +557,7 @@ export class AgentEvalRunService {
completedCases: completedCount,
cost: sumCost ? roundCost(sumCost) : undefined,
errorCases,
externalCases: externalCasesRT || undefined,
failedCases,
llmCalls: sumLlmCalls || undefined,
passedCases,
@@ -667,6 +669,17 @@ export class AgentEvalRunService {
const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
// ── External eval mode: agent finished, hand off to external scorer ──
if (evalMode === 'external') {
return {
...baseMeta,
awaitingExternalEval: true,
passed: undefined,
score: undefined,
status: 'external',
};
}
let effectiveRubrics: EvalBenchmarkRubric[];
if (evalMode) {
effectiveRubrics = [
@@ -722,6 +735,7 @@ export class AgentEvalRunService {
passed?: boolean;
rubricScores?: Array<{ reason?: string; rubricId: string; score: number }>;
score?: number;
status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout';
steps?: number;
threadId: string;
tokens?: number;
@@ -737,6 +751,14 @@ export class AgentEvalRunService {
passed: meta.passed as boolean | undefined,
rubricScores: meta.rubricScores as any,
score: meta.score as number | undefined,
status: meta.status as
| 'error'
| 'external'
| 'failed'
| 'passed'
| 'running'
| 'timeout'
| undefined,
steps: meta.steps as number | undefined,
threadId: t.id,
tokens: meta.tokens as number | undefined,
@@ -744,6 +766,20 @@ export class AgentEvalRunService {
};
});
// ── External eval mode: if all threads await external scoring, propagate that status ──
const allExternal = threadResults.every((t) => t.status === 'external');
if (allExternal) {
await this.runTopicModel.updateByRunAndTopic(runId, topicId, {
evalResult: {
awaitingExternalEval: true,
completionReason: 'external',
threads: threadResults,
} satisfies EvalRunTopicResult,
status: 'external',
});
return;
}
// pass@k: at least one thread passed
const anyPassed = threadResults.some((t) => t.passed === true);
// pass^k: all threads passed
@@ -888,7 +924,7 @@ export class AgentEvalRunService {
if (runTopic) {
// Skip if topic is already in a terminal state (e.g. timeout marked by checkAndHandleRunTimeout).
// The interrupted agent still fires the completion webhook, but we must not overwrite the result.
const terminalStates = ['passed', 'failed', 'error', 'timeout'];
const terminalStates = ['passed', 'failed', 'error', 'timeout', 'external'];
if (runTopic.status && terminalStates.includes(runTopic.status)) {
// Fall through to progress tracking below without modifying this topic
} else {
@@ -945,11 +981,15 @@ export class AgentEvalRunService {
// Aggregate real-time metrics from all RunTopics
const allTopics = await this.runTopicModel.findByRunId(runId);
const completedCount = allTopics.filter(
(t) => (t.evalResult && 'completionReason' in t.evalResult) || t.status === 'timeout',
(t) =>
(t.evalResult && 'completionReason' in t.evalResult) ||
t.status === 'timeout' ||
t.status === 'external',
).length;
const passedCases = allTopics.filter((t) => t.status === 'passed').length;
const failedCases = allTopics.filter((t) => t.status === 'failed').length;
const errorCases = allTopics.filter((t) => t.status === 'error').length;
const externalCasesTraj = allTopics.filter((t) => t.status === 'external').length;
const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
let sumCost = 0;
@@ -995,6 +1035,7 @@ export class AgentEvalRunService {
completedCases: completedCount,
cost: sumCost ? roundCost(sumCost) : undefined,
errorCases,
externalCases: externalCasesTraj || undefined,
failedCases,
llmCalls: sumLlmCalls || undefined,
passedCases,
@@ -1048,6 +1089,7 @@ export class AgentEvalRunService {
let passedCases = 0;
let failedCases = 0;
let errorCases = 0;
let externalCases = 0;
let timeoutCases = 0;
let totalScore = 0;
// Sum of per-case averages (for per-case display)
@@ -1088,19 +1130,27 @@ export class AgentEvalRunService {
failedCases++;
} else if (runTopic.status === 'error') {
errorCases++;
} else if (runTopic.status === 'external') {
externalCases++;
} else if (runTopic.status === 'timeout') {
timeoutCases++;
}
// Only accumulate scores for evaluated (non-error, non-timeout) cases
if (runTopic.status !== 'error' && runTopic.status !== 'timeout' && runTopic.score != null) {
totalScore += runTopic.score;
}
// Accumulate per-rubric scores from existing evalResult (exclude error/timeout cases)
// Only accumulate scores for evaluated (non-error, non-timeout, non-external) cases
if (
runTopic.status !== 'error' &&
runTopic.status !== 'timeout' &&
runTopic.status !== 'external' &&
runTopic.score != null
) {
totalScore += runTopic.score;
}
// Accumulate per-rubric scores from existing evalResult (exclude error/timeout/external cases)
if (
runTopic.status !== 'error' &&
runTopic.status !== 'timeout' &&
runTopic.status !== 'external' &&
existingResult?.rubricScores
) {
for (const rs of existingResult.rubricScores) {
@@ -1138,6 +1188,7 @@ export class AgentEvalRunService {
cost: sumCost ? roundCost(sumCost) : undefined,
duration: wallClockDuration || undefined,
errorCases,
externalCases: externalCases || undefined,
failedCases,
llmCalls: sumLlmCalls || undefined,
passRate: totalCases > 0 ? passedCases / totalCases : 0,
@@ -1216,6 +1267,15 @@ export class AgentEvalRunService {
const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
// ── External eval mode: agent finished, hand off to external scorer ──
if (evalMode === 'external') {
await this.runTopicModel.updateByRunAndTopic(runTopic.runId, runTopic.topicId, {
evalResult: { ...existingResult, awaitingExternalEval: true },
status: 'external',
});
return;
}
let effectiveRubrics: EvalBenchmarkRubric[];
if (evalMode) {
effectiveRubrics = [
@@ -1324,7 +1384,13 @@ export class AgentEvalRunService {
});
const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
const externalCount = metrics.externalCases || 0;
const runStatus =
externalCount > 0
? 'external'
: nonSuccessCases >= metrics.totalCases
? 'failed'
: 'completed';
await this.runModel.update(run.id, { metrics, status: runStatus });
} else {