From ea329113be24c64d5e9dbaf98140ea61e2cd9ad4 Mon Sep 17 00:00:00 2001
From: Rylan Cai <67412196+cy948@users.noreply.github.com>
Date: Tue, 10 Mar 2026 09:53:26 +0800
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(eval):=20add=20external=20scor?=
 =?UTF-8?q?ing=20mode=20(#12729)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* wip: add llm relevant & BrowseComp

* wip: add widesearch desc

* wip: dsqa, hle, widesearch

* wip: add dsqa

* wip: add awaiting eval status for runs

* wip: add awaiting status for run

* wip: adjust hle-verified

* :bug: fix: browsecomp topics

* :memo: docs: add annotations

* wip: add awaiting status for pass@k

* wip: add complete status

* wip: update thread dots

* wip: update run status page

* wip: remove useless impl

* wip: update prompt

* :sparkles: feat: add external eval routes

* wip: add eval cli

* :bug: fix: support authorization in non-browser environments

* wip: pass tests

* :recycle: refactor: remove tests

* :recycle: refactor: mo camel case
---
 apps/cli/src/commands/eval.test.ts            | 285 ++++++++++
 apps/cli/src/commands/eval.ts                 | 326 +++++++++++
 apps/cli/src/index.ts                         |   2 +
 locales/en-US/eval.json                       |   8 +-
 locales/zh-CN/eval.json                       |   5 +
 .../database/src/models/agentEval/dataset.ts  |   2 +
 packages/database/src/models/agentEval/run.ts |   2 +-
 packages/database/src/schemas/agentEvals.ts   |   5 +-
 packages/eval-rubric/src/evaluate.ts          |  12 +-
 packages/eval-rubric/src/matchers/external.ts |   9 +
 packages/eval-rubric/src/matchers/index.ts    |  19 +-
 packages/eval-rubric/src/matchers/llmEq.ts    |  89 +++
 .../eval-rubric/src/matchers/llmRubric.ts     |   4 +
 packages/types/src/eval/agentEval.ts          |   9 +-
 packages/types/src/eval/agentEvalRun.ts       |   1 +
 packages/types/src/eval/rubric.ts             |   2 +
 .../agent-eval-run/finalize-run/route.ts      |  13 +-
 src/locales/default/eval.ts                   |  16 +-
 .../datasets/[datasetId]/index.tsx            |   1 +
 .../features/DatasetsTab/DatasetCard.tsx      |   1 +
 .../features/DatasetsTab/TestCaseTable.tsx    |  18 +-
 .../features/CaseResultsTable/index.tsx       |  33 +-
 .../[runId]/features/PendingState/index.tsx   |   4 +-
 .../runs/[runId]/features/RunHeader/index.tsx |  27 +-
 .../[benchmarkId]/runs/[runId]/index.tsx      |   4 +-
 .../(main)/eval/config/datasetPresets.ts      | 143 +++++
 .../features/DatasetCreateModal/index.tsx     |   1 +
 .../eval/features/DatasetEditModal/index.tsx  |  24 +-
 .../DatasetImportModal/MappingStep.tsx        |  12 +
 .../(main)/eval/features/StatusBadge.tsx      |   3 +-
 src/server/routers/lambda/agentEval.ts        |  15 +-
 .../routers/lambda/agentEvalExternal.ts       | 514 ++++++++++++++++++
 src/server/routers/lambda/index.ts            |   2 +
 src/server/services/agentEvalRun/index.ts     |  84 ++-
 34 files changed, 1655 insertions(+), 40 deletions(-)
 create mode 100644 apps/cli/src/commands/eval.test.ts
 create mode 100644 apps/cli/src/commands/eval.ts
 create mode 100644 packages/eval-rubric/src/matchers/external.ts
 create mode 100644 packages/eval-rubric/src/matchers/llmEq.ts
 create mode 100644 src/server/routers/lambda/agentEvalExternal.ts

diff --git a/apps/cli/src/commands/eval.test.ts b/apps/cli/src/commands/eval.test.ts
new file mode 100644
index 0000000000..f402567ba0
--- /dev/null
+++ b/apps/cli/src/commands/eval.test.ts
@@ -0,0 +1,285 @@
+import { Command } from 'commander';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+const { mockTrpcClient } = vi.hoisted(() => ({
+  mockTrpcClient: {
+    agentEvalExternal: {
+      datasetGet: { query: vi.fn() },
+      messagesList: { query: vi.fn() },
+      runGet: { query: vi.fn() },
+      runSetStatus: { mutate: vi.fn() },
+      runTopicReportResult: { mutate: vi.fn() },
+      runTopicsList: { query: vi.fn() },
+      testCasesCount: { query: vi.fn() },
+      threadsList: { query: vi.fn() },
+    },
+  },
+}));
+
+const { getTrpcClientMock } = vi.hoisted(() => ({
+  getTrpcClientMock: vi.fn(),
+}));
+
+vi.mock('../api/client', () => ({
+  getTrpcClient: getTrpcClientMock,
+}));
+
+vi.mock('../utils/logger', () => ({
+  log: {
+    debug: vi.fn(),
+    error: vi.fn(),
+    info: vi.fn(),
+    warn: vi.fn(),
+  },
+  setVerbose: vi.fn(),
+}));
+
+// eslint-disable-next-line import-x/first
+import { log } from '../utils/logger';
+// eslint-disable-next-line import-x/first
+import { registerEvalCommand } from './eval';
+
+describe('eval command', () => {
+  let exitSpy: ReturnType<typeof vi.spyOn>;
+  let logSpy: ReturnType<typeof vi.spyOn>;
+
+  beforeEach(() => {
+    getTrpcClientMock.mockResolvedValue(mockTrpcClient);
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
+    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+
+    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
+      for (const fn of Object.values(method)) {
+        (fn as ReturnType<typeof vi.fn>).mockReset();
+      }
+    }
+  });
+
+  afterEach(() => {
+    exitSpy.mockRestore();
+    logSpy.mockRestore();
+    vi.clearAllMocks();
+  });
+
+  const createProgram = () => {
+    const program = new Command();
+    program.exitOverride();
+    registerEvalCommand(program);
+    return program;
+  };
+
+  it('should call runGet and output json envelope', async () => {
+    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
+      config: { k: 1 },
+      datasetId: 'dataset-1',
+      id: 'run-1',
+    });
+
+    const program = createProgram();
+    await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);
+
+    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });
+
+    const payload = JSON.parse(logSpy.mock.calls[0][0]);
+    expect(payload).toEqual({
+      data: {
+        config: { k: 1 },
+        datasetId: 'dataset-1',
+        id: 'run-1',
+      },
+      error: null,
+      ok: true,
+      version: 'v1',
+    });
+  });
+
+  it('should call datasetGet and output json envelope', async () => {
+    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
+      id: 'dataset-1',
+      metadata: { preset: 'deepsearchqa' },
+    });
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'dataset',
+      'get',
+      '--dataset-id',
+      'dataset-1',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
+      datasetId: 'dataset-1',
+    });
+  });
+
+  it('should pass onlyExternal to runTopicsList', async () => {
+    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'run-topics',
+      'list',
+      '--run-id',
+      'run-1',
+      '--only-external',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
+      onlyExternal: true,
+      runId: 'run-1',
+    });
+  });
+
+  it('should pass topicId and threadId to messagesList', async () => {
+    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'messages',
+      'list',
+      '--topic-id',
+      'topic-1',
+      '--thread-id',
+      'thread-1',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
+      threadId: 'thread-1',
+      topicId: 'topic-1',
+    });
+  });
+
+  it('should parse and report run-topic result', async () => {
+    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
+      success: true,
+    });
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'run-topic',
+      'report-result',
+      '--run-id',
+      'run-1',
+      '--topic-id',
+      'topic-1',
+      '--thread-id',
+      'thread-1',
+      '--score',
+      '0.91',
+      '--correct',
+      'true',
+      '--result-json',
+      '{"grade":"A"}',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
+      correct: true,
+      result: { grade: 'A' },
+      runId: 'run-1',
+      score: 0.91,
+      threadId: 'thread-1',
+      topicId: 'topic-1',
+    });
+  });
+
+  it('should update run status', async () => {
+    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
+      runId: 'run-1',
+      status: 'completed',
+      success: true,
+    });
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'run',
+      'set-status',
+      '--run-id',
+      'run-1',
+      '--status',
+      'completed',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
+      runId: 'run-1',
+      status: 'completed',
+    });
+    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
+  });
+
+  it('should output json error envelope when command fails', async () => {
+    const error = Object.assign(new Error('Run not found'), {
+      data: { code: 'NOT_FOUND' },
+    });
+    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'run',
+      'get',
+      '--run-id',
+      'run-404',
+      '--json',
+    ]);
+
+    const payload = JSON.parse(logSpy.mock.calls[0][0]);
+    expect(payload).toEqual({
+      data: null,
+      error: { code: 'NOT_FOUND', message: 'Run not found' },
+      ok: false,
+      version: 'v1',
+    });
+    expect(exitSpy).toHaveBeenCalledWith(1);
+  });
+
+  it('should query test case count', async () => {
+    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });
+
+    const program = createProgram();
+    await program.parseAsync([
+      'node',
+      'test',
+      'eval',
+      'test-cases',
+      'count',
+      '--dataset-id',
+      'dataset-1',
+      '--json',
+    ]);
+
+    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
+      datasetId: 'dataset-1',
+    });
+  });
+
+  it('should log plain error without --json', async () => {
+    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));
+
+    const program = createProgram();
+    await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']);
+
+    expect(log.error).toHaveBeenCalledWith('boom');
+    expect(exitSpy).toHaveBeenCalledWith(1);
+  });
+});
diff --git a/apps/cli/src/commands/eval.ts b/apps/cli/src/commands/eval.ts
new file mode 100644
index 0000000000..9ef6b2a4a6
--- /dev/null
+++ b/apps/cli/src/commands/eval.ts
@@ -0,0 +1,326 @@
+import type { Command } from 'commander';
+import { InvalidArgumentError } from 'commander';
+import pc from 'picocolors';
+
+import { getTrpcClient } from '../api/client';
+import { log } from '../utils/logger';
+
+const JSON_VERSION = 'v1' as const;
+
+interface JsonError {
+  code?: string;
+  message: string;
+}
+
+interface JsonEnvelope<T> {
+  data: T | null;
+  error: JsonError | null;
+  ok: boolean;
+  version: typeof JSON_VERSION;
+}
+
+interface JsonOption {
+  json?: boolean;
+}
+
+interface RunGetOptions extends JsonOption {
+  runId: string;
+}
+
+interface RunSetStatusOptions extends JsonOption {
+  runId: string;
+  status: 'completed' | 'external';
+}
+
+interface DatasetGetOptions extends JsonOption {
+  datasetId: string;
+}
+
+interface RunTopicsListOptions extends JsonOption {
+  onlyExternal?: boolean;
+  runId: string;
+}
+
+interface ThreadsListOptions extends JsonOption {
+  topicId: string;
+}
+
+interface MessagesListOptions extends JsonOption {
+  threadId?: string;
+  topicId: string;
+}
+
+interface TestCasesCountOptions extends JsonOption {
+  datasetId: string;
+}
+
+interface RunTopicReportResultOptions extends JsonOption {
+  correct: boolean;
+  resultJson: Record<string, unknown>;
+  runId: string;
+  score: number;
+  threadId?: string;
+  topicId: string;
+}
+
+const printJson = (data: unknown) => {
+  console.log(JSON.stringify(data, null, 2));
+};
+
+const outputJsonSuccess = (data: unknown) => {
+  const payload: JsonEnvelope<unknown> = {
+    data,
+    error: null,
+    ok: true,
+    version: JSON_VERSION,
+  };
+  printJson(payload);
+};
+
+const isRecord = (value: unknown): value is Record<string, unknown> =>
+  typeof value === 'object' && value !== null;
+
+const toJsonError = (error: unknown): JsonError => {
+  if (error instanceof Error) {
+    const maybeData = (error as Error & { data?: { code?: string } }).data;
+    const code = maybeData?.code;
+
+    return {
+      code: typeof code === 'string' ? code : undefined,
+      message: error.message,
+    };
+  }
+
+  if (isRecord(error)) {
+    const code = typeof error.code === 'string' ? error.code : undefined;
+    const message = typeof error.message === 'string' ? error.message : 'Unknown error';
+    return { code, message };
+  }
+
+  return { message: String(error) };
+};
+
+const handleCommandError = (error: unknown, json: boolean) => {
+  const normalized = toJsonError(error);
+
+  if (json) {
+    const payload: JsonEnvelope<null> = {
+      data: null,
+      error: normalized,
+      ok: false,
+      version: JSON_VERSION,
+    };
+    printJson(payload);
+  } else {
+    log.error(normalized.message);
+  }
+
+  process.exit(1);
+};
+
+// Parse --score into a finite number. Blank input is rejected explicitly because
+// Number('') and Number('  ') coerce to 0, which would silently report a zero score.
+const parseScore = (value: string) => {
+  const score = value.trim() === '' ? Number.NaN : Number(value);
+  if (!Number.isFinite(score)) throw new InvalidArgumentError(`Invalid score: ${value}`);
+  return score;
+};
+
+const parseBoolean = (value: string) => {
+  const normalized = value.trim().toLowerCase();
+  if (['1', 'true', 'yes'].includes(normalized)) return true;
+  if (['0', 'false', 'no'].includes(normalized)) return false;
+  throw new InvalidArgumentError(`Invalid boolean value: ${value}`);
+};
+
+const parseResultJson = (value: string) => {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(value);
+  } catch {
+    throw new InvalidArgumentError('Invalid JSON value for --result-json');
+  }
+
+  if (!isRecord(parsed) || Array.isArray(parsed)) {
+    throw new InvalidArgumentError('--result-json must be a JSON object');
+  }
+
+  return parsed;
+};
+
+// Validate the --status value for `eval run set-status`.
+// Only the literals 'completed' and 'external' are accepted; anything else is a usage error.
+const parseRunStatus = (value: string) => {
+  if (value === 'completed' || value === 'external') return value as 'completed' | 'external';
+
+  throw new InvalidArgumentError("Only 'completed' and 'external' are supported");
+};
+
+const executeCommand = async (
+  options: JsonOption,
+  action: () => Promise<unknown>,
+  successMessage?: string,
+) => {
+  try {
+    const data = await action();
+    if (options.json) {
+      outputJsonSuccess(data);
+      return;
+    }
+
+    if (successMessage) {
+      console.log(`${pc.green('OK')} ${successMessage}`);
+      return;
+    }
+
+    printJson(data);
+  } catch (error) {
+    handleCommandError(error, Boolean(options.json));
+  }
+};
+
+export function registerEvalCommand(program: Command) {
+  const evalCmd = program.command('eval').description('Manage external evaluation workflows');
+
+  const runCmd = evalCmd.command('run').description('Manage evaluation runs');
+
+  runCmd
+    .command('get')
+    .description('Get run information')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunGetOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.runGet.query({ runId: options.runId });
+      }),
+    );
+
+  runCmd
+    .command('set-status')
+    .description('Set run status (external API supports completed or external)')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunSetStatusOptions) =>
+      executeCommand(
+        options,
+        async () => {
+          const client = await getTrpcClient();
+          return client.agentEvalExternal.runSetStatus.mutate({
+            runId: options.runId,
+            status: options.status,
+          });
+        },
+        `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`,
+      ),
+    );
+
+  evalCmd
+    .command('dataset')
+    .description('Manage evaluation datasets')
+    .command('get')
+    .description('Get dataset information')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: DatasetGetOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
+      }),
+    );
+
+  evalCmd
+    .command('run-topics')
+    .description('Manage run topics')
+    .command('list')
+    .description('List topics in a run')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .option('--only-external', 'Only return topics pending external evaluation')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunTopicsListOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.runTopicsList.query({
+          onlyExternal: Boolean(options.onlyExternal),
+          runId: options.runId,
+        });
+      }),
+    );
+
+  evalCmd
+    .command('threads')
+    .description('Manage evaluation threads')
+    .command('list')
+    .description('List threads by topic')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: ThreadsListOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
+      }),
+    );
+
+  evalCmd
+    .command('messages')
+    .description('Manage evaluation messages')
+    .command('list')
+    .description('List messages by topic and optional thread')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--thread-id <id>', 'Thread ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: MessagesListOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.messagesList.query({
+          threadId: options.threadId,
+          topicId: options.topicId,
+        });
+      }),
+    );
+
+  evalCmd
+    .command('test-cases')
+    .description('Manage evaluation test cases')
+    .command('count')
+    .description('Count test cases by dataset')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: TestCasesCountOptions) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
+      }),
+    );
+
+  evalCmd
+    .command('run-topic')
+    .description('Manage evaluation run-topic reporting')
+    .command('report-result')
+    .description('Report one evaluation result for a run topic')
+    .requiredOption('--run-id <id>', 'Run ID')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--thread-id <id>', 'Thread ID (required for k > 1)')
+    .requiredOption('--score <score>', 'Evaluation score', parseScore)
+    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
+    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: RunTopicReportResultOptions) =>
+      executeCommand(
+        options,
+        async () => {
+          const client = await getTrpcClient();
+          return client.agentEvalExternal.runTopicReportResult.mutate({
+            correct: options.correct,
+            result: options.resultJson,
+            runId: options.runId,
+            score: options.score,
+            threadId: options.threadId,
+            topicId: options.topicId,
+          });
+        },
+        `Reported result for topic ${pc.bold(options.topicId)}`,
+      ),
+    );
+}
diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts
index 0340d01859..9ef4df98db 100644
--- a/apps/cli/src/index.ts
+++ b/apps/cli/src/index.ts
@@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc';
 import { registerFileCommand } from './commands/file';
 import { registerGenerateCommand } from './commands/generate';
 import { registerKbCommand } from './commands/kb';
+import { registerEvalCommand } from './commands/eval';
 import { registerLoginCommand } from './commands/login';
 import { registerLogoutCommand } from './commands/logout';
 import { registerMemoryCommand } from './commands/memory';
@@ -44,5 +45,6 @@ registerModelCommand(program);
 registerProviderCommand(program);
 registerPluginCommand(program);
 registerConfigCommand(program);
+registerEvalCommand(program);
 
 program.parse();
diff --git a/locales/en-US/eval.json b/locales/en-US/eval.json
index e24ef71cc7..a2c7564561 100644
--- a/locales/en-US/eval.json
+++ b/locales/en-US/eval.json
@@ -157,13 +157,15 @@
   "difficulty.easy": "Easy",
   "difficulty.hard": "Hard",
   "difficulty.medium": "Medium",
+  "evalMode.answer-relevance": "LLM Relevance",
+  "evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)",
   "evalMode.contains": "Contains Match",
   "evalMode.contains.desc": "Output must contain the expected text",
   "evalMode.equals": "Exact Match",
   "evalMode.equals.desc": "Output must be exactly the same as expected",
   "evalMode.label": "Eval Mode",
   "evalMode.llm-rubric": "LLM Judge",
-  "evalMode.llm-rubric.desc": "Use LLM to evaluate output quality",
+  "evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)",
   "evalMode.placeholder": "Select eval mode",
   "evalMode.prompt.label": "Judge Prompt",
   "evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge",
@@ -256,12 +258,16 @@
   "run.running.hint": "Evaluation is running, results will appear shortly...",
   "run.status.aborted": "Aborted",
   "run.status.completed": "Completed",
+  "run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.",
   "run.status.error": "Run Error",
+  "run.status.external": "External",
+  "run.status.external.tooltip": "This evaluation is waiting for external scoring. Results will be updated when scoring is complete.",
   "run.status.failed": "Failed",
   "run.status.idle": "Idle",
   "run.status.pending": "Pending",
   "run.status.running": "Running",
   "run.status.timeout": "Timeout",
+  "sidebar": "Evaluation",
   "sidebar.benchmarks": "Benchmarks",
   "sidebar.dashboard": "Dashboard",
   "sidebar.datasets": "Datasets",
diff --git a/locales/zh-CN/eval.json b/locales/zh-CN/eval.json
index f4502aeb92..8896b4bf8d 100644
--- a/locales/zh-CN/eval.json
+++ b/locales/zh-CN/eval.json
@@ -161,6 +161,8 @@
   "evalMode.contains.desc": "输出中必须包含期望的文本",
   "evalMode.equals": "精确匹配",
   "evalMode.equals.desc": "输出必须与期望内容完全一致",
+  "evalMode.external": "外部评估",
+  "evalMode.external.desc": "智能体完成运行后，由外部系统提交评估结果",
   "evalMode.label": "评估模式",
   "evalMode.llm-rubric": "LLM 评判",
   "evalMode.llm-rubric.desc": "使用 LLM 评估输出质量",
@@ -256,7 +258,10 @@
   "run.running.hint": "评测进行中，结果即将呈现...",
   "run.status.aborted": "已终止",
   "run.status.completed": "已完成",
+  "run.status.completed.tooltip": "评测已完成运行，所有结果已评估。",
   "run.status.error": "运行出错",
+  "run.status.external": "待外部评测",
+  "run.status.external.tooltip": "智能体已完成运行，等待外部系统提交评估结果。",
   "run.status.failed": "失败",
   "run.status.idle": "待开始",
   "run.status.pending": "等待中",
diff --git a/packages/database/src/models/agentEval/dataset.ts b/packages/database/src/models/agentEval/dataset.ts
index 8413acc43d..f4a33256ec 100644
--- a/packages/database/src/models/agentEval/dataset.ts
+++ b/packages/database/src/models/agentEval/dataset.ts
@@ -50,6 +50,8 @@ export class AgentEvalDatasetModel {
         benchmarkId: agentEvalDatasets.benchmarkId,
         createdAt: agentEvalDatasets.createdAt,
         description: agentEvalDatasets.description,
+        evalConfig: agentEvalDatasets.evalConfig,
+        evalMode: agentEvalDatasets.evalMode,
         id: agentEvalDatasets.id,
         identifier: agentEvalDatasets.identifier,
         metadata: agentEvalDatasets.metadata,
diff --git a/packages/database/src/models/agentEval/run.ts b/packages/database/src/models/agentEval/run.ts
index 0cc6dc89b5..4642b7c9da 100644
--- a/packages/database/src/models/agentEval/run.ts
+++ b/packages/database/src/models/agentEval/run.ts
@@ -31,7 +31,7 @@ export class AgentEvalRunModel {
     datasetId?: string;
     limit?: number;
     offset?: number;
-    status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted';
+    status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted' | 'external';
   }) => {
     const conditions = [eq(agentEvalRuns.userId, this.userId)];
 
diff --git a/packages/database/src/schemas/agentEvals.ts b/packages/database/src/schemas/agentEvals.ts
index 027e2eabea..8dc5bc2341 100644
--- a/packages/database/src/schemas/agentEvals.ts
+++ b/packages/database/src/schemas/agentEvals.ts
@@ -43,6 +43,7 @@ const evalModes = [
   'similar',
   'levenshtein',
   'rubric',
+  'external',
 ] as const;
 
 // ============================================
@@ -181,7 +182,7 @@ export const agentEvalRuns = pgTable(
     name: text('name'),
 
     status: text('status', {
-      enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted'],
+      enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'],
     })
       .default('idle')
       .notNull(),
@@ -228,7 +229,7 @@ export const agentEvalRunTopics = pgTable(
       .notNull(),
 
     status: text('status', {
-      enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout'],
+      enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout', 'external', 'completed'],
     }),
 
     score: real('score'),
diff --git a/packages/eval-rubric/src/evaluate.ts b/packages/eval-rubric/src/evaluate.ts
index 63262c8178..14c717f2da 100644
--- a/packages/eval-rubric/src/evaluate.ts
+++ b/packages/eval-rubric/src/evaluate.ts
@@ -87,12 +87,20 @@ export const evaluate = async (
       const candidates: string[] = JSON.parse(expected);
       const results: MatchResult[] = [];
       for (const c of candidates) {
-        results.push(await match({ actual: extracted, expected: c, rubric }, matchContext));
+        results.push(
+          await match(
+            { input: testCase.input, actual: extracted, expected: c, rubric },
+            matchContext,
+          ),
+        );
       }
       const best = results.reduce((a, b) => (a.score >= b.score ? a : b));
       result = best;
     } else {
-      result = await match({ actual: extracted, expected, rubric }, matchContext);
+      result = await match(
+        { input: testCase.input, actual: extracted, expected, rubric },
+        matchContext,
+      );
     }
 
     rubricResults.push({
diff --git a/packages/eval-rubric/src/matchers/external.ts b/packages/eval-rubric/src/matchers/external.ts
new file mode 100644
index 0000000000..1390ed5edb
--- /dev/null
+++ b/packages/eval-rubric/src/matchers/external.ts
@@ -0,0 +1,9 @@
+import type { MatchResult } from './types';
+
+export const matchExternal = async (): Promise<MatchResult> => {
+  return {
+    passed: false,
+    score: 0,
+    reason: 'Waiting for external evaluation...',
+  };
+};
diff --git a/packages/eval-rubric/src/matchers/index.ts b/packages/eval-rubric/src/matchers/index.ts
index fa89733daa..aa17a339c9 100644
--- a/packages/eval-rubric/src/matchers/index.ts
+++ b/packages/eval-rubric/src/matchers/index.ts
@@ -4,8 +4,10 @@ import { matchAnyOf } from './anyOf';
 import { matchContains } from './contains';
 import { matchEndsWith } from './endsWith';
 import { matchEquals } from './equals';
+import { matchExternal } from './external';
 import { matchJsonSchema } from './jsonSchema';
 import { matchLevenshtein } from './levenshtein';
+import { matchLLMEq } from './llmEq';
 import { matchLLMRubric } from './llmRubric';
 import { matchNumeric } from './numeric';
 import { matchRegex } from './regex';
@@ -18,10 +20,15 @@ export type { GenerateObjectPayload, MatchContext, MatchResult } from './types';
  * Run a single rubric matcher against actual vs expected
  */
 export const match = async (
-  params: { actual: string; expected: string | undefined; rubric: EvalBenchmarkRubric },
+  params: {
+    input: string;
+    actual: string;
+    expected: string | undefined;
+    rubric: EvalBenchmarkRubric;
+  },
   context?: MatchContext,
 ): Promise<MatchResult> => {
-  const { actual, expected, rubric } = params;
+  const { actual, expected, rubric, input } = params;
   const { type, config } = rubric;
 
   switch (type) {
@@ -57,6 +64,10 @@ export const match = async (
       return matchLevenshtein(actual, expected, config);
     }
 
+    case 'answer-relevance': {
+      return matchLLMEq(input, actual, expected, rubric, context);
+    }
+
     case 'llm-rubric': {
       return matchLLMRubric(actual, expected, rubric, context);
     }
@@ -65,6 +76,10 @@ export const match = async (
       return matchJsonSchema(actual, config);
     }
 
+    case 'external': {
+      return matchExternal();
+    }
+
     default: {
       return {
         passed: false,
diff --git a/packages/eval-rubric/src/matchers/llmEq.ts b/packages/eval-rubric/src/matchers/llmEq.ts
new file mode 100644
index 0000000000..671d5296ef
--- /dev/null
+++ b/packages/eval-rubric/src/matchers/llmEq.ts
@@ -0,0 +1,127 @@
+import type { EvalBenchmarkRubric, RubricConfigLLM } from '@lobechat/types';
+
+import type { MatchContext, MatchResult } from './types';
+
+/**
+ * Default judge instructions for the binary answer-equivalence check.
+ * Sent as the system message whenever the rubric config has no (or an empty) `systemRole`.
+ */
+const DEFAULT_SYSTEM_ROLE = [
+  'You are an expert evaluation judge. Your task is to score how well an AI output meets the given criteria.',
+  'Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.',
+  'Your judgement must be in the format and criteria specified below:',
+  "extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.",
+  'reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.',
+  'Scoring rules:',
+  'score: Return 1 only when extracted_final_answer clearly and unambiguously matches [correct_answer], or is within a small margin of error for numerical problems.',
+  'score: Return 0 when extracted_final_answer is incorrect, missing, ambiguous, non-equivalent, or when you are uncertain.',
+  'Treat uncertainty as incorrect (score = 0).',
+  'Respond with a JSON object containing ',
+  '"score" (number: 0 or 1)',
+  'and "reason" (brief explanation for the judgement).',
+].join('\n');
+
+/**
+ * JSON schema handed to `generateObject`: a binary `score` (0 or 1) plus a
+ * short `reason`; both are required and no other properties are allowed.
+ */
+const JUDGE_SCORE_SCHEMA: Record<string, unknown> = {
+  additionalProperties: false,
+  properties: {
+    score: {
+      description: 'Binary score for judgement: 1=correct, 0=incorrect/uncertain',
+      enum: [0, 1],
+      type: 'number',
+    },
+    reason: { description: 'Brief explanation for the judgement', type: 'string' },
+  },
+  required: ['score', 'reason'],
+  type: 'object',
+};
+
+/**
+ * Assemble the user message for the judge from tagged sections.
+ *
+ * @param question - Text placed under the `[question]` tag.
+ * @param actual - Text placed under the `[response]` tag.
+ * @param expected - Text placed under `[correct_answer]`; the section is
+ *   omitted entirely when this is undefined or an empty string.
+ * @returns The sections joined by a blank line.
+ */
+function buildJudgeUserPrompt(
+  question: string,
+  actual: string,
+  expected: string | undefined,
+): string {
+  const parts = [`[question]\n${question}`, `[response]\n${actual}`];
+  if (expected) {
+    parts.push(`[correct_answer]\n${expected}`);
+  }
+  return parts.join('\n\n');
+}
+
+/**
+ * Ask an LLM judge whether `actual` answers `question` equivalently to `expected`.
+ *
+ * Judge problems are reported in the result, not thrown: a missing judge, a missing
+ * model, a malformed reply and a failed call all resolve to `passed: false` /
+ * `score: 0`, each with its own `reason`.
+ *
+ * @param question - The prompt that was given to the agent.
+ * @param actual - The agent's output to be judged.
+ * @param expected - The reference answer, if any.
+ * @param rubric - Rubric whose `config` may carry `model`, `provider` and `systemRole`.
+ * @param context - Supplies `generateObject` and the fallback `judgeModel`.
+ * @returns `passed: true` / `score: 1` only when the judge returns a score of exactly 1.
+ */
+export const matchLLMEq = async (
+  question: string,
+  actual: string,
+  expected: string | undefined,
+  rubric: EvalBenchmarkRubric,
+  context?: MatchContext,
+): Promise<MatchResult> => {
+  if (!context?.generateObject) {
+    return { passed: false, reason: 'LLM judge not available', score: 0 };
+  }
+
+  // Tolerate a rubric without a config object: fall back to the context's judge
+  // model and the default prompt instead of throwing a TypeError before the try.
+  const cfg = rubric.config as RubricConfigLLM | undefined;
+  const model = cfg?.model || context.judgeModel;
+
+  if (!model) {
+    return { passed: false, reason: 'No judge model configured', score: 0 };
+  }
+
+  try {
+    const result = await context.generateObject({
+      messages: [
+        { content: cfg?.systemRole || DEFAULT_SYSTEM_ROLE, role: 'system' },
+        { content: buildJudgeUserPrompt(question, actual, expected), role: 'user' },
+      ],
+      model,
+      provider: cfg?.provider,
+      schema: JUDGE_SCORE_SCHEMA,
+    });
+
+    // Anything other than an exact 0 or 1 is a malformed reply, not a verdict;
+    // say so instead of returning a failure with no usable explanation.
+    const verdict: unknown = result?.score;
+    if (verdict !== 0 && verdict !== 1) {
+      return { passed: false, reason: 'LLM judge did not return a valid score', score: 0 };
+    }
+
+    return {
+      passed: verdict === 1,
+      reason: result?.reason,
+      score: verdict === 1 ? 1 : 0,
+    };
+  } catch (error) {
+    return {
+      passed: false,
+      reason: `LLM judge failed: ${error instanceof Error ? error.message : String(error)}`,
+      score: 0,
+    };
+  }
+};
diff --git a/packages/eval-rubric/src/matchers/llmRubric.ts b/packages/eval-rubric/src/matchers/llmRubric.ts
index 6c5a4212f8..48758ed4e7 100644
--- a/packages/eval-rubric/src/matchers/llmRubric.ts
+++ b/packages/eval-rubric/src/matchers/llmRubric.ts
@@ -64,6 +64,11 @@ export const matchLLMRubric = async (
       schema: JUDGE_SCORE_SCHEMA,
     });
 
+    // A score of 0 is a valid verdict; only a missing or non-numeric score is an error.
+    if (typeof result?.score !== 'number') {
+      return { passed: false, reason: 'LLM judge did not return a score', score: 0 };
+    }
+
     const score = Math.max(0, Math.min(1, result.score));
     const threshold = rubric.threshold ?? 0.6;
 
diff --git a/packages/types/src/eval/agentEval.ts b/packages/types/src/eval/agentEval.ts
index 83be7d75fd..6cf994bcc4 100644
--- a/packages/types/src/eval/agentEval.ts
+++ b/packages/types/src/eval/agentEval.ts
@@ -34,7 +34,7 @@ export interface EvalTestCaseMetadata {
 /**
  * Evaluation run status
  */
-export type EvalRunStatus = 'aborted' | 'completed' | 'failed' | 'pending' | 'running';
+export type EvalRunStatus = 'aborted' | 'completed' | 'external' | 'failed' | 'pending' | 'running';
 
 /**
  * Evaluation run configuration
@@ -96,6 +96,7 @@ export interface EvalRunMetrics {
   cost?: number;
   duration?: number;
   errorCases?: number;
+  externalCases?: number;
   failedCases: number;
   llmCalls?: number;
   passAllK?: number;
@@ -183,6 +184,8 @@ export interface EvalRunTopicResult {
   completionReason?: string;
   operationId?: string;
   rubricScores?: EvalRubricScore[];
+  /** Set when evalMode is 'external' — agent finished, awaiting external scoring */
+  awaitingExternalEval?: boolean;
 }
 /*eslint-enable perfectionist/sort-interfaces */
 
@@ -194,14 +197,16 @@ export interface EvalThreadResult {
   cost?: number;
   duration?: number;
   error?: string;
+  llmCalls?: number;
   operationId?: string;
   passed?: boolean;
   rubricScores?: EvalRubricScore[];
   score?: number;
-  status?: 'error' | 'failed' | 'passed' | 'running' | 'timeout';
+  status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout' | 'completed';
   steps?: number;
   threadId: string;
   tokens?: number;
+  toolCalls?: number;
 }
 
 /**
diff --git a/packages/types/src/eval/agentEvalRun.ts b/packages/types/src/eval/agentEvalRun.ts
index 2e609c0461..24506c6ed1 100644
--- a/packages/types/src/eval/agentEvalRun.ts
+++ b/packages/types/src/eval/agentEvalRun.ts
@@ -11,6 +11,7 @@ export type AgentEvalRunStatus =
   | 'failed'
   | 'idle'
   | 'pending'
+  | 'external'
   | 'running';
 
 export interface AgentEvalRunTargetAgent {
diff --git a/packages/types/src/eval/rubric.ts b/packages/types/src/eval/rubric.ts
index 5c9721988d..758d469ebd 100644
--- a/packages/types/src/eval/rubric.ts
+++ b/packages/types/src/eval/rubric.ts
@@ -22,6 +22,8 @@ export type RubricType =
   // Similarity
   | 'similar'
   | 'levenshtein'
+  // External evaluation
+  | 'external'
   // Composite
   | 'rubric';
 
diff --git a/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts
index 63a92bc8db..23d8e9a9be 100644
--- a/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts
+++ b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts
@@ -66,9 +66,18 @@ export const { POST } = serve<FinalizeRunPayload>(
 
     log('Metrics: %O', metrics);
 
-    // Step 4: Update run status (failed if all cases errored/timed out)
+    // Step 4: Update run status
+    // external: any topic awaits external scoring → whole run waits too
+    // failed: all cases are non-success (error/timeout)
+    // completed: everything else
     const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
-    const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
+    const externalCount = metrics.externalCases || 0;
+    const runStatus =
+      externalCount > 0
+        ? 'external'
+        : nonSuccessCases >= metrics.totalCases
+          ? 'failed'
+          : 'completed';
 
     await context.run('agent-eval-run:update-run', async () => {
       const runModel = new AgentEvalRunModel(db, userId);
diff --git a/src/locales/default/eval.ts b/src/locales/default/eval.ts
index bca7b5d7cf..99823eb2c6 100644
--- a/src/locales/default/eval.ts
+++ b/src/locales/default/eval.ts
@@ -173,9 +173,14 @@ export default {
   'evalMode.contains.desc': 'Output must contain the expected text',
   'evalMode.equals': 'Exact Match',
   'evalMode.equals.desc': 'Output must be exactly the same as expected',
+  'evalMode.external': 'External Eval',
+  'evalMode.external.desc': 'Agent runs to completion; scoring is handled by an external system',
   'evalMode.label': 'Eval Mode',
   'evalMode.llm-rubric': 'LLM Judge',
-  'evalMode.llm-rubric.desc': 'Use LLM to evaluate output quality',
+  'evalMode.llm-rubric.desc':
+    'Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)',
+  'evalMode.answer-relevance': 'LLM Relevance',
+  'evalMode.answer-relevance.desc': 'Use LLM to evaluate answer relevance (yes or no)',
   'evalMode.placeholder': 'Select eval mode',
   'evalMode.prompt.label': 'Judge Prompt',
   'evalMode.prompt.placeholder': 'Enter the evaluation criteria or prompt for LLM judge',
@@ -204,6 +209,8 @@ export default {
   'run.idle.hint': 'Click Start to begin evaluation',
   'run.pending.hint': 'Evaluation is queued, waiting to start...',
   'run.running.hint': 'Evaluation is running, results will appear shortly...',
+  'run.external.hint':
+    'Running completed. Waiting for external system to submit evaluation results ...',
 
   'run.filter.active': 'Active',
   'run.filter.empty': 'No runs match the current filter.',
@@ -249,6 +256,9 @@ export default {
   'run.detail.report': 'Evaluation Summary',
   'run.detail.config': 'Evaluation Config',
   'run.detail.configSnapshot': 'Configuration Snapshot',
+  'run.detail.copyRunId': 'Copy Run ID',
+  'run.detail.copyRunIdFailed': 'Failed to copy Run ID',
+  'run.detail.copyRunIdSuccess': 'Run ID copied',
   'run.detail.dataset': 'Dataset',
   'run.detail.model': 'Model',
   'run.detail.overview': 'Overview',
@@ -279,7 +289,11 @@ export default {
 
   'run.status.aborted': 'Aborted',
   'run.status.completed': 'Completed',
+  'run.status.completed.tooltip': 'The run and external scoring are completed.',
   'run.status.error': 'Run Error',
+  'run.status.external': 'Awaiting Eval',
+  'run.status.external.tooltip':
+    'The agent has finished running. Waiting for an external system to submit evaluation results.',
   'run.status.failed': 'Failed',
   'run.status.idle': 'Idle',
   'run.status.pending': 'Pending',
diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx
index 2f55f35012..c56e5c4d99 100644
--- a/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx
+++ b/src/routes/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx
@@ -208,6 +208,7 @@ const DatasetDetail = memo(() => {
               }}
             >
               <TestCaseTable
+                datasetEvalMode={dataset?.evalMode}
                 diffFilter={diffFilter}
                 pagination={pagination}
                 search={search}
diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/DatasetCard.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/DatasetCard.tsx
index d5ea0a2f6c..0c406fcdd0 100644
--- a/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/DatasetCard.tsx
+++ b/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/DatasetCard.tsx
@@ -238,6 +238,7 @@ const DatasetCard = memo<DatasetCardProps>(
             ) : (
               <TestCaseTable
                 readOnly
+                datasetEvalMode={dataset.evalMode}
                 diffFilter={diffFilter}
                 pagination={pagination}
                 search={search}
diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx
index 1e72a910f9..09f293d7dd 100644
--- a/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx
+++ b/src/routes/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx
@@ -83,6 +83,7 @@ const styles = createStaticStyles(({ css, cssVar }) => ({
 }));
 
 interface TestCaseTableProps {
+  datasetEvalMode?: string | null;
   diffFilter: 'all' | 'easy' | 'medium' | 'hard';
   onAddCase?: () => void;
   onDelete?: (testCase: any) => void;
@@ -106,6 +107,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
     total,
     search,
     diffFilter,
+    datasetEvalMode,
     pagination,
     onSearchChange,
     onDiffFilterChange,
@@ -170,10 +172,18 @@ const TestCaseTable = memo<TestCaseTableProps>(
           dataIndex: 'evalMode',
           key: 'evalMode',
           render: (text: string) => {
-            if (!text) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
+            const effective = text || datasetEvalMode; // '' inherits too, matching isInherited
+            if (!effective) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
+            const isInherited = !text && !!datasetEvalMode;
             return (
-              <span style={{ color: cssVar.colorTextSecondary, fontSize: 12 }}>
-                {t(`evalMode.${text}` as any)}
+              <span
+                style={{
+                  color: isInherited ? cssVar.colorTextQuaternary : cssVar.colorTextSecondary,
+                  fontSize: 12,
+                  fontStyle: isInherited ? 'italic' : 'normal',
+                }}
+              >
+                {t(`evalMode.${effective}` as any)}
               </span>
             );
           },
@@ -238,7 +248,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
       }
 
       return base;
-    }, [pagination, readOnly, onEdit, onDelete, t]);
+    }, [pagination, readOnly, onEdit, onDelete, t, datasetEvalMode]);
 
     return (
       <>
diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx
index 002bb2a9af..87a97ee372 100644
--- a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx
+++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx
@@ -67,6 +67,8 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
   const { t } = useTranslation('eval');
   const status: string | null | undefined = record.status;
 
+  // Pick the badge for this case's status; unrecognised values fall through to a plain badge.
+
   if (!status || status === 'pending')
     return <Badge status="default" text={<BadgeText>{t('run.status.pending')}</BadgeText>} />;
 
@@ -86,6 +88,17 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
   if (status === 'timeout')
     return <Badge color="orange" text={<BadgeText>{t('run.status.timeout')}</BadgeText>} />;
 
+  if (status === 'external') {
+    const badge = <Badge color="purple" text={<BadgeText>{t('run.status.external')}</BadgeText>} />;
+    return <Tooltip title={t('run.status.external.tooltip')}>{badge}</Tooltip>;
+  }
+
+  if (status === 'completed') {
+    // "completed" means the run and its evaluation have both finished; it does not imply a pass
+    const badge = <Badge color="blue" text={<BadgeText>{t('run.status.completed')}</BadgeText>} />;
+    return <Tooltip title={t('run.status.completed.tooltip')}>{badge}</Tooltip>;
+  }
+
   return <Badge status="default" text={<BadgeText>{status}</BadgeText>} />;
 });
 
@@ -99,15 +112,29 @@ const ThreadDots = memo<{ threads: EvalThreadResult[] }>(({ threads }) => (
 
       if (thread.passed === true) {
         color = cssVar.colorSuccess;
+      } else if (thread.passed === false) {
+        color = cssVar.colorError;
+      }
+
+      if (thread.status === 'external') {
+        color = cssVar.colorWarning;
+      }
+
+      if (thread.status === 'completed') {
+        color = cssVar.colorPrimary;
       }
 
       const label = thread.error
         ? 'error'
         : thread.passed === true
           ? 'passed'
-          : thread.passed === false
+          : thread.passed === false && thread.status !== 'completed'
             ? 'failed'
-            : 'pending';
+            : thread.status === 'external'
+              ? 'Awaiting for external evaluation'
+              : thread.status === 'completed'
+                ? 'completed'
+                : 'pending';
 
       return (
         <Tooltip key={thread.threadId} title={label}>
@@ -406,6 +433,8 @@ const CaseResultsTable = memo<CaseResultsTableProps>(
               { label: t('table.filter.error'), value: 'error' },
               { label: t('table.filter.running'), value: 'running' },
               { label: t('run.status.pending'), value: 'pending' },
+              { label: t('run.status.external'), value: 'external' },
+              { label: t('run.status.completed'), value: 'completed' },
             ]}
             onChange={setStatusFilter}
           />
diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx
index 31dcfd1d72..88cd7f9766 100644
--- a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx
+++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx
@@ -96,7 +96,7 @@ const useStyles = createStyles(({ css, token }) => ({
   `,
 }));
 
-const PendingState = memo(() => {
+const PendingState = memo(({ hint }: { hint?: string }) => {
   const { t } = useTranslation('eval');
   const { cx, styles } = useStyles();
 
@@ -119,7 +119,7 @@ const PendingState = memo(() => {
           <Icon icon={Clock} size={18} />
         </div>
       </div>
-      <div className={styles.hint}>{t('run.pending.hint')}</div>
+      <div className={styles.hint}>{hint}</div>
     </div>
   );
 });
diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx
index dc811ee957..5d9dc12e00 100644
--- a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx
+++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx
@@ -2,10 +2,19 @@
 
 import { AGENT_PROFILE_URL } from '@lobechat/const';
 import type { AgentEvalRunDetail } from '@lobechat/types';
-import { ActionIcon, Avatar, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
+import { ActionIcon, Avatar, copyToClipboard, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
 import { App, Button, Card, Tag, Typography } from 'antd';
 import { createStyles } from 'antd-style';
-import { ArrowLeft, ChevronDown, ChevronUp, Pencil, Play, Square, Trash2 } from 'lucide-react';
+import {
+  ArrowLeft,
+  ChevronDown,
+  ChevronUp,
+  Copy,
+  Pencil,
+  Play,
+  Square,
+  Trash2,
+} from 'lucide-react';
 import { memo, useState } from 'react';
 import { useTranslation } from 'react-i18next';
 import { Link, useNavigate } from 'react-router-dom';
@@ -170,6 +179,14 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
       window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank');
     }
   };
+  const handleCopyRunId = async () => {
+    try {
+      await copyToClipboard(run.id);
+      message.success(t('run.detail.copyRunIdSuccess'));
+    } catch {
+      message.error(t('run.detail.copyRunIdFailed'));
+    }
+  };
 
   const formatDate = (date?: Date | string) => {
     if (!date) return '';
@@ -194,6 +211,12 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
               <Typography.Title level={4} style={{ margin: 0 }}>
                 {run.name || run.id.slice(0, 8)}
               </Typography.Title>
+              <ActionIcon
+                icon={Copy}
+                size="small"
+                title={t('run.detail.copyRunId')}
+                onClick={handleCopyRunId}
+              />
               <StatusBadge status={run.status} />
             </Flexbox>
             {/* Meta info row */}
diff --git a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx
index b34e9cadf1..e9842d722c 100644
--- a/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx
+++ b/src/routes/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx
@@ -104,7 +104,9 @@ const RunDetail = memo(() => {
           {runDetail.status === 'running' ? (
             <RunningState />
           ) : runDetail.status === 'pending' ? (
-            <PendingState />
+            <PendingState hint={t('run.pending.hint')} />
+          ) : runDetail.status === 'external' ? (
+            <PendingState hint={t('run.external.hint')} />
           ) : (
             <IdleState run={runDetail} />
           )}
diff --git a/src/routes/(main)/eval/config/datasetPresets.ts b/src/routes/(main)/eval/config/datasetPresets.ts
index 842ff1fda0..fd1d742211 100644
--- a/src/routes/(main)/eval/config/datasetPresets.ts
+++ b/src/routes/(main)/eval/config/datasetPresets.ts
@@ -36,6 +36,26 @@ export interface DatasetPreset {
 }
 
 export const DATASET_PRESETS: Record<string, DatasetPreset> = {
+  'browsecomp': {
+    id: 'browsecomp',
+    category: 'research',
+    name: 'BrowseComp',
+    description: 'Measuring the ability for agents to browse the web, comprises 1,266 questions.',
+    icon: Globe,
+    formatDescription: 'format: Topic (category/tags), Question (input), Answer (expected)',
+    requiredFields: ['question', 'answer', 'problem_topic', 'canary'],
+    optionalFields: [],
+    fieldInference: {
+      input: ['question'],
+      expected: ['answer'],
+      choices: [],
+      category: ['problem_topic'],
+    },
+    validation: {
+      requireExpected: true,
+      expectedFormat: 'string',
+    },
+  },
   // === Deep Research / QA Category ===
   'browsecomp-zh': {
     id: 'browsecomp-zh',
@@ -58,6 +78,129 @@ export const DATASET_PRESETS: Record<string, DatasetPreset> = {
     },
   },
 
+  'widesearch': {
+    id: 'widesearch',
+    category: 'research',
+    name: 'WideSearch',
+    description:
+      'Evaluating the capabilities of agents in broad information-seeking tasks, consisting of 200 questions.',
+    icon: Globe,
+    formatDescription: 'format: instance_id, query (input), evaluation (expected), language',
+    requiredFields: ['instance_id', 'query', 'evaluation', 'language'],
+    optionalFields: [],
+    fieldInference: {
+      input: ['query'],
+      expected: ['evaluation'],
+      choices: [],
+      category: ['language'],
+      sortOrder: [],
+    },
+    validation: {
+      requireExpected: true,
+      expectedFormat: 'string',
+    },
+  },
+
+  'hle-text': {
+    id: 'hle-text',
+    category: 'research',
+    name: "Humanity's Last Exam, HLE (Text Only)",
+    description:
+      "Humanity's Last Exam (HLE) is a multi-modal benchmark at the frontier of human knowledge, consisting of 2150 questions.",
+    icon: Globe,
+    formatDescription:
+      'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category',
+    requiredFields: [
+      'id',
+      'question',
+      'answer',
+      'answer_type',
+      'rationale',
+      'raw_subject',
+      'category',
+    ],
+    optionalFields: ['canary'],
+    fieldInference: {
+      input: ['question'],
+      expected: ['answer'],
+      choices: [],
+      category: ['category'],
+    },
+  },
+
+  'hle-verified': {
+    id: 'hle-verified',
+    category: 'research',
+    name: "Humanity's Last Exam, HLE (Verified Answers)",
+    description:
+      "A subset of Humanity's Last Exam (HLE) with verified answers, designed to evaluate the ability to produce correct answers rather than just plausible ones.",
+    icon: Globe,
+    formatDescription:
+      'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category, Verified_Classes',
+    requiredFields: [
+      'id',
+      'question',
+      'answer',
+      'answer_type',
+      'rationale',
+      'raw_subject',
+      'category',
+      'Verified_Classes',
+    ],
+    optionalFields: ['canary'],
+    fieldInference: {
+      input: ['question'],
+      expected: ['answer'],
+      choices: [],
+      category: ['category'],
+    },
+  },
+
+  'deepsearchqa': {
+    id: 'deepsearchqa',
+    category: 'research',
+    name: 'DeepSearchQA',
+    description:
+      'A 900-prompt factuality benchmark from Google DeepMind, designed to evaluate agents on difficult multi-step information-seeking tasks across 17 different fields.',
+    icon: Globe,
+    formatDescription: 'problem, problem_category, answer, answer_type',
+    requiredFields: ['problem', 'answer', 'problem_category', 'answer_type'],
+    optionalFields: [],
+    fieldInference: {
+      input: ['problem'],
+      expected: ['answer'],
+      choices: [],
+      category: ['problem_category'],
+      sortOrder: [],
+    },
+    validation: {
+      requireExpected: true,
+      expectedFormat: 'string',
+    },
+  },
+
+  'sealqa': {
+    id: 'sealqa',
+    category: 'research',
+    name: 'SealQA',
+    description:
+      'SealQA is a new challenge benchmark for evaluating SEarch- Augmented Language models on fact-seeking questions where web search yields conflicting, noisy, or unhelpful results.',
+    icon: Globe,
+    formatDescription: 'format: question (input), answer (expected), topic (category)',
+    requiredFields: ['question', 'answer', 'topic', 'canary'],
+    optionalFields: [],
+    fieldInference: {
+      input: ['question'],
+      expected: ['answer'],
+      choices: [],
+      category: ['topic'],
+    },
+    validation: {
+      requireExpected: true,
+      expectedFormat: 'string',
+    },
+  },
+
   'xbench': {
     id: 'xbench',
     category: 'research',
diff --git a/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx b/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx
index 8fc8f58a15..76dadff60c 100644
--- a/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx
+++ b/src/routes/(main)/eval/features/DatasetCreateModal/index.tsx
@@ -157,6 +157,7 @@ const DatasetCreateModal = memo<DatasetCreateModalProps>(
                 { label: t('evalMode.equals'), value: 'equals' },
                 { label: t('evalMode.contains'), value: 'contains' },
                 { label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
+                { label: t('evalMode.external'), value: 'external' },
               ]}
             />
           </Form.Item>
diff --git a/src/routes/(main)/eval/features/DatasetEditModal/index.tsx b/src/routes/(main)/eval/features/DatasetEditModal/index.tsx
index c4871dac34..e86fb1b14c 100644
--- a/src/routes/(main)/eval/features/DatasetEditModal/index.tsx
+++ b/src/routes/(main)/eval/features/DatasetEditModal/index.tsx
@@ -131,14 +131,30 @@ const DatasetEditModal = memo<DatasetEditModalProps>(({ open, onCancel, dataset,
               { label: t('evalMode.equals'), value: 'equals' },
               { label: t('evalMode.contains'), value: 'contains' },
               { label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
+              { label: t('evalMode.answer-relevance'), value: 'answer-relevance' },
+              { label: t('evalMode.external'), value: 'external' },
             ]}
           />
         </Form.Item>
 
-        {evalModeValue === 'llm-rubric' && (
-          <Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
-            <TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
-          </Form.Item>
+        {(evalModeValue === 'llm-rubric' || evalModeValue === 'answer-relevance') && (
+          <>
+            <Form.Item initialValue="aihubmix" label={'Provider'} name={['evalConfig', 'provider']}>
+              <TextArea placeholder={'LLM provider (e.g. openai, azure)'} rows={1} />
+            </Form.Item>
+            <Form.Item initialValue="gpt-5-nano" label={'Model'} name={['evalConfig', 'model']}>
+              <TextArea placeholder={'LLM model to use for evaluation (e.g. gpt-4)'} rows={1} />
+            </Form.Item>
+            <Form.Item label={'System Prompt'} name={['evalConfig', 'systemRole']}>
+              <TextArea placeholder={'Optional system prompt for the LLM judge'} rows={3} />
+            </Form.Item>
+            <Form.Item label={'Eval Prompt'} name={['evalConfig', 'criteria']}>
+              <TextArea placeholder={'Prompt template for the LLM judge'} rows={3} />
+            </Form.Item>
+            <Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
+              <TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
+            </Form.Item>
+          </>
         )}
 
         <Form.Item label={t('dataset.create.preset.label')} style={{ marginBottom: 0 }}>
diff --git a/src/routes/(main)/eval/features/DatasetImportModal/MappingStep.tsx b/src/routes/(main)/eval/features/DatasetImportModal/MappingStep.tsx
index 520b83d0e1..bbf1621676 100644
--- a/src/routes/(main)/eval/features/DatasetImportModal/MappingStep.tsx
+++ b/src/routes/(main)/eval/features/DatasetImportModal/MappingStep.tsx
@@ -92,6 +92,14 @@ const autoInferMapping = (
     ? new Set(preset.fieldInference.sortOrder.map((s) => s.toLowerCase()))
     : SORT_ORDER_CANDIDATES;
 
+  const requiredCandidates = new Set<string>(
+    preset ? preset.requiredFields.map((s) => s.toLowerCase()) : [],
+  );
+
+  const optionalCandidates = new Set<string>(
+    preset ? preset.optionalFields.map((s) => s.toLowerCase()) : [],
+  );
+
   for (const h of headers) {
     const lower = h.toLowerCase().trim();
     if (!inputFound && inputCandidates.has(lower)) {
@@ -109,6 +117,10 @@ const autoInferMapping = (
     } else if (!sortOrderFound && sortOrderCandidates.has(lower)) {
       result[h] = 'sortOrder';
       sortOrderFound = true;
+    } else if (requiredCandidates.has(lower) || optionalCandidates.has(lower)) {
+      // If the field was claimed by the config but not matched by any candidate,
+      // assign it to metadata to ensure it's not missed
+      result[h] = 'metadata';
     } else {
       result[h] = 'ignore';
     }
diff --git a/src/routes/(main)/eval/features/StatusBadge.tsx b/src/routes/(main)/eval/features/StatusBadge.tsx
index 64350ccca8..f35d16fb77 100644
--- a/src/routes/(main)/eval/features/StatusBadge.tsx
+++ b/src/routes/(main)/eval/features/StatusBadge.tsx
@@ -2,13 +2,14 @@
 
 import { Icon } from '@lobehub/ui';
 import { createStaticStyles } from 'antd-style';
-import { Activity, CheckCircle2, Clock, Pause, XCircle } from 'lucide-react';
+import { Activity, CheckCircle2, Clock, Hourglass, Pause, XCircle } from 'lucide-react';
 import { memo } from 'react';
 import { useTranslation } from 'react-i18next';
 
 const statusConfig: Record<string, { cls: string; icon: any }> = {
   aborted: { cls: 'default', icon: Pause },
   completed: { cls: 'success', icon: CheckCircle2 },
+  external: { cls: 'warning', icon: Hourglass },
   failed: { cls: 'error', icon: XCircle },
   idle: { cls: 'default', icon: Clock },
   pending: { cls: 'warning', icon: Clock },
diff --git a/src/server/routers/lambda/agentEval.ts b/src/server/routers/lambda/agentEval.ts
index 85ca2f7044..aa2199253c 100644
--- a/src/server/routers/lambda/agentEval.ts
+++ b/src/server/routers/lambda/agentEval.ts
@@ -33,6 +33,7 @@ const rubricTypeSchema = z.enum([
   'similar',
   'levenshtein',
   'rubric',
+  'external',
 ]);
 
 const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passthrough();
@@ -621,7 +622,9 @@ export const agentEvalRouter = router({
       z.object({
         benchmarkId: z.string().optional(),
         datasetId: z.string().optional(),
-        status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']).optional(),
+        status: z
+          .enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'])
+          .optional(),
         limit: z.number().min(1).max(100).default(50).optional(),
         offset: z.number().min(0).default(0).optional(),
       }),
@@ -871,7 +874,15 @@ export const agentEvalRouter = router({
     .input(
       z.object({
         id: z.string(),
-        status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']),
+        status: z.enum([
+          'idle',
+          'pending',
+          'running',
+          'completed',
+          'failed',
+          'aborted',
+          'external',
+        ]),
       }),
     )
     .mutation(async ({ input, ctx }) => {
diff --git a/src/server/routers/lambda/agentEvalExternal.ts b/src/server/routers/lambda/agentEvalExternal.ts
new file mode 100644
index 0000000000..907cabe6e6
--- /dev/null
+++ b/src/server/routers/lambda/agentEvalExternal.ts
@@ -0,0 +1,514 @@
+import type { EvalRunTopicResult, EvalThreadResult } from '@lobechat/types';
+import { TRPCError } from '@trpc/server';
+import { and, asc, eq, isNull } from 'drizzle-orm';
+import { z } from 'zod';
+
+import {
+  AgentEvalDatasetModel,
+  AgentEvalRunModel,
+  AgentEvalRunTopicModel,
+  AgentEvalTestCaseModel,
+} from '@/database/models/agentEval';
+import { ThreadModel } from '@/database/models/thread';
+import { messages } from '@/database/schemas';
+import { authedProcedure, router } from '@/libs/trpc/lambda';
+import { serverDatabase } from '@/libs/trpc/lambda/middleware';
+import { AgentEvalRunService } from '@/server/services/agentEvalRun';
+// Run statuses accepted by `runSetStatus`; `external` marks a run awaiting outside scoring.
+const runStatusSchema = z.enum([
+  'idle',
+  'pending',
+  'running',
+  'completed',
+  'failed',
+  'aborted',
+  'external',
+]);
+// One externally produced score for a run topic (or for one of its threads when k > 1).
+const reportResultItemSchema = z.object({
+  correct: z.boolean(), // pass/fail verdict supplied by the external scorer
+  result: z.record(z.unknown()).optional(), // free-form scorer payload kept in evalResult
+  score: z.number().finite(), // finite only: +/-Infinity would poison averaged scores
+  threadId: z.string().optional(), // the report handler requires it when the run has k > 1
+  topicId: z.string(),
+});
+// Serialize an optional Date to an ISO-8601 string; null and undefined both yield undefined.
+const toIsoString = (value?: Date | null) => value?.toISOString();
+
+// Authenticated base procedure for the external-eval routes. Extends the context with the
+// eval models, the run service and a thread model, each built from `serverDB` + `userId`.
+const agentEvalExternalProcedure = authedProcedure.use(serverDatabase).use(async ({ ctx, next }) =>
+  next({
+    ctx: {
+      datasetModel: new AgentEvalDatasetModel(ctx.serverDB, ctx.userId),
+      runModel: new AgentEvalRunModel(ctx.serverDB, ctx.userId),
+      runService: new AgentEvalRunService(ctx.serverDB, ctx.userId),
+      runTopicModel: new AgentEvalRunTopicModel(ctx.serverDB, ctx.userId),
+      testCaseModel: new AgentEvalTestCaseModel(ctx.serverDB, ctx.userId),
+      threadModel: new ThreadModel(ctx.serverDB, ctx.userId),
+    },
+  }),
+);
+// A single report item bound to its run: the payload shape `applyReportResult` consumes.
+type ReportResultInput = z.infer<typeof reportResultItemSchema> & { runId: string };
+
+/**
+ * Recompute a run's aggregate metrics from its topics and persist them with a fresh status:
+ * `external` while any topic awaits external scoring, else `failed` when error + timeout
+ * cases cover every case, else `completed`. Resolves to that status (`undefined` if no run).
+ */
+const recomputeRunAggregation = async (
+  ctx: {
+    runModel: AgentEvalRunModel;
+    runService: AgentEvalRunService;
+    runTopicModel: AgentEvalRunTopicModel;
+  },
+  runId: string,
+) => {
+  const run = await ctx.runModel.findById(runId);
+  if (!run) return undefined;
+
+  const topics = await ctx.runTopicModel.findByRunId(runId);
+  const { config, id, metrics: previousMetrics, startedAt } = run;
+  const metrics = await ctx.runService.evaluateAndFinalizeRun({
+    run: { config, id, metrics: previousMetrics, startedAt },
+    runTopics: topics,
+  });
+
+  const awaitsExternal = (topic: (typeof topics)[number]) =>
+    topic.status === 'external' ||
+    (topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true;
+  const unsuccessful = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
+
+  let status: 'completed' | 'external' | 'failed' = 'completed';
+  if (topics.some(awaitsExternal)) {
+    status = 'external';
+  } else if (unsuccessful >= metrics.totalCases) {
+    status = 'failed';
+  }
+
+  await ctx.runModel.update(runId, { metrics, status });
+  return status;
+};
+
+/**
+ * Store one externally produced score for a run topic and report what changed.
+ * k = 1 finalizes the topic at once. k > 1 scores the thread named by `input.threadId`
+ * and keeps the topic `external` until every thread is scored, then finalizes it with
+ * the average score and pass@k. A report matching the stored score and verdict is
+ * `idempotent`; it never overwrites the scorer payload saved by the first report.
+ * With `recomputeRun`, run-level metrics and status are refreshed after the write.
+ * @throws TRPCError NOT_FOUND (run, run topic, thread) or BAD_REQUEST (k > 1 input).
+ */
+const applyReportResult = async (
+  ctx: {
+    runModel: AgentEvalRunModel;
+    runTopicModel: AgentEvalRunTopicModel;
+    runService: AgentEvalRunService;
+    threadModel: ThreadModel;
+  },
+  input: ReportResultInput,
+  recomputeRun: boolean,
+) => {
+  const run = await ctx.runModel.findById(input.runId);
+  if (!run) {
+    throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
+  }
+
+  const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
+  const runTopic = runTopics.find((item) => item.topicId === input.topicId);
+  if (!runTopic) {
+    throw new TRPCError({ code: 'NOT_FOUND', message: 'Run topic not found' });
+  }
+
+  const runK = run.config?.k ?? 1;
+  const rubricScores = [{ rubricId: 'external', score: input.score }];
+  const existingEvalResult = (runTopic.evalResult ?? {}) as EvalRunTopicResult &
+    Record<string, unknown>;
+  const externalResult = input.result ?? {};
+
+  let idempotent = false;
+  let reportedThreads: number;
+  let totalThreads: number;
+  let topicFinalized: boolean;
+
+  if (runK > 1) {
+    if (!input.threadId) {
+      throw new TRPCError({
+        code: 'BAD_REQUEST',
+        message: 'threadId is required when k > 1',
+      });
+    }
+
+    const allThreads = await ctx.threadModel.queryByTopicId(input.topicId);
+    const evalThreads = allThreads.filter((thread) => thread.type === 'eval');
+    const sourceThreads = evalThreads.length > 0 ? evalThreads : allThreads;
+    if (sourceThreads.length === 0) {
+      throw new TRPCError({
+        code: 'BAD_REQUEST',
+        message: 'No threads found for this topic',
+      });
+    }
+
+    const threads: EvalThreadResult[] =
+      (existingEvalResult.threads as EvalThreadResult[] | undefined)?.map((thread) => ({
+        ...thread,
+      })) ??
+      sourceThreads.map((thread) => ({
+        status: 'external',
+        threadId: thread.id,
+      }));
+
+    let targetIndex = threads.findIndex((thread) => thread.threadId === input.threadId);
+    if (targetIndex < 0) {
+      const existsInTopic = sourceThreads.some((thread) => thread.id === input.threadId);
+      if (!existsInTopic) {
+        throw new TRPCError({
+          code: 'NOT_FOUND',
+          message: 'Thread not found for this topic',
+        });
+      }
+
+      threads.push({ status: 'external', threadId: input.threadId });
+      targetIndex = threads.length - 1;
+    }
+
+    totalThreads = threads.length;
+    const targetThread = threads[targetIndex];
+    idempotent =
+      targetThread.status === 'completed' &&
+      targetThread.score === input.score &&
+      targetThread.passed === input.correct;
+    if (!idempotent) {
+      threads[targetIndex] = {
+        ...targetThread,
+        passed: input.correct,
+        rubricScores,
+        score: input.score,
+        status: 'completed',
+      };
+    }
+
+    reportedThreads = threads.filter(
+      (thread) => thread.status === 'completed' && typeof thread.score === 'number',
+    ).length;
+    topicFinalized = reportedThreads >= totalThreads;
+
+    const storedThreadResults = (existingEvalResult.externalThreadResults ?? {}) as Record<
+      string,
+      unknown
+    >;
+    // A replay keeps the payload saved by the first report instead of overwriting it.
+    const threadPayload = idempotent
+      ? (storedThreadResults[input.threadId] ?? externalResult)
+      : externalResult;
+    const externalThreadResults = { ...storedThreadResults, [input.threadId]: threadPayload };
+
+    if (topicFinalized) {
+      const totalScore = threads.reduce((acc, thread) => acc + (thread.score ?? 0), 0);
+      const avgScore = totalScore / threads.length;
+      const passAtK = threads.some((thread) => thread.passed === true);
+      const passAllK = threads.every((thread) => thread.passed === true);
+      const nextEvalResult = {
+        ...existingEvalResult,
+        awaitingExternalEval: false,
+        externalThreadResults,
+        passAllK,
+        passAtK,
+        rubricScores: [{ rubricId: 'external', score: avgScore }],
+        threads,
+      } satisfies EvalRunTopicResult & Record<string, unknown>;
+
+      // Written even on a replay, so a topic left un-finalized earlier is still closed out.
+      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
+        evalResult: nextEvalResult,
+        passed: passAtK,
+        score: avgScore,
+        status: passAtK ? 'passed' : 'failed',
+      });
+    } else if (!idempotent) {
+      const nextEvalResult = {
+        ...existingEvalResult,
+        awaitingExternalEval: true,
+        externalThreadResults,
+        threads,
+      } satisfies EvalRunTopicResult & Record<string, unknown>;
+
+      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
+        evalResult: nextEvalResult,
+        status: 'external',
+      });
+    }
+  } else {
+    idempotent =
+      runTopic.status === (input.correct ? 'passed' : 'failed') &&
+      runTopic.score === input.score &&
+      runTopic.passed === input.correct;
+    if (!idempotent) {
+      const nextEvalResult = {
+        ...existingEvalResult,
+        awaitingExternalEval: false,
+        externalResult,
+        rubricScores,
+      } satisfies EvalRunTopicResult & Record<string, unknown>;
+
+      await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
+        evalResult: nextEvalResult,
+        passed: input.correct,
+        score: input.score,
+        status: input.correct ? 'passed' : 'failed',
+      });
+    }
+
+    reportedThreads = 1;
+    totalThreads = 1;
+    topicFinalized = true;
+  }
+
+  let runStatus: string | undefined;
+  if (recomputeRun) {
+    runStatus = await recomputeRunAggregation(ctx, input.runId);
+  }
+
+  return {
+    idempotent,
+    reportedThreads,
+    runId: input.runId,
+    runStatus,
+    success: true,
+    threadId: input.threadId,
+    topicFinalized,
+    topicId: input.topicId,
+    totalThreads,
+  };
+};
+
+// tRPC routes for an external scorer: read a run's dataset, topics, threads and messages,
+// then report scores back. Every route is built on `agentEvalExternalProcedure`, so the
+// models it uses are constructed with the caller's `userId`.
+export const agentEvalExternalRouter = router({
+  // Dataset summary incl. metadata (`{}` when unset); NOT_FOUND when the id is unknown.
+  datasetGet: agentEvalExternalProcedure
+    .input(z.object({ datasetId: z.string() }))
+    .query(async ({ ctx, input }) => {
+      const dataset = await ctx.datasetModel.findById(input.datasetId);
+      if (!dataset) {
+        throw new TRPCError({ code: 'NOT_FOUND', message: 'Dataset not found' });
+      }
+
+      const metadata = (dataset.metadata ?? {}) as Record<string, unknown>;
+
+      return {
+        benchmarkId: dataset.benchmarkId,
+        id: dataset.id,
+        identifier: dataset.identifier,
+        metadata,
+        name: dataset.name,
+      };
+    }),
+
+  // Messages of a topic, oldest first, limited to the caller's rows with no messageGroupId;
+  // `threadId` narrows the result to a single thread.
+  messagesList: agentEvalExternalProcedure
+    .input(z.object({ threadId: z.string().optional(), topicId: z.string() }))
+    .query(async ({ ctx, input }) => {
+      const conditions = [
+        eq(messages.userId, ctx.userId),
+        eq(messages.topicId, input.topicId),
+        isNull(messages.messageGroupId),
+      ];
+      if (input.threadId) conditions.push(eq(messages.threadId, input.threadId));
+
+      const rows = await ctx.serverDB
+        .select({
+          content: messages.content,
+          createdAt: messages.createdAt,
+          id: messages.id,
+          role: messages.role,
+          threadId: messages.threadId,
+          topicId: messages.topicId,
+        })
+        .from(messages)
+        .where(and(...conditions))
+        .orderBy(asc(messages.createdAt));
+
+      return rows.map((row) => ({
+        content: row.content,
+        createdAt: toIsoString(row.createdAt),
+        id: row.id,
+        role: row.role,
+        threadId: row.threadId,
+        topicId: row.topicId,
+      }));
+    }),
+
+  // Report one score and recompute the run's metrics/status right away.
+  // Builds on `reportResultItemSchema`, like the batch route, so validation cannot drift.
+  reportResult: agentEvalExternalProcedure
+    .input(reportResultItemSchema.extend({ runId: z.string() }))
+    .mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),
+
+  // Apply items one by one, in order, then recompute the run once at the end.
+  // NOTE(review): if an item throws, earlier items stay written and the run is not recomputed.
+  reportResultsBatch: agentEvalExternalProcedure
+    .input(z.object({ items: z.array(reportResultItemSchema).min(1), runId: z.string() }))
+    .mutation(async ({ ctx, input }) => {
+      const receipts = [];
+
+      for (const item of input.items) {
+        receipts.push(await applyReportResult(ctx, { ...item, runId: input.runId }, false));
+      }
+
+      const runStatus = await recomputeRunAggregation(ctx, input.runId);
+
+      return {
+        items: receipts,
+        runId: input.runId,
+        runStatus,
+        success: true,
+      };
+    }),
+
+  // Run summary; `config.k` is always present in the response (defaults to 1).
+  runGet: agentEvalExternalProcedure
+    .input(z.object({ runId: z.string() }))
+    .query(async ({ ctx, input }) => {
+      const run = await ctx.runModel.findById(input.runId);
+      if (!run) {
+        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
+      }
+      const config = { ...run.config, k: run.config?.k ?? 1 };
+
+      return {
+        config,
+        createdAt: run.createdAt,
+        datasetId: run.datasetId,
+        id: run.id,
+        metrics: run.metrics ?? undefined,
+        name: run.name,
+        startedAt: run.startedAt,
+        status: run.status,
+        targetAgentId: run.targetAgentId,
+      };
+    }),
+
+  // Move a run between `external` and `completed`. Any other target status, or a run that
+  // is in neither state, is rejected. Completing also requires that no topic still awaits
+  // external scoring; metrics are then re-evaluated and stored with the new status.
+  runSetStatus: agentEvalExternalProcedure
+    .input(z.object({ runId: z.string(), status: runStatusSchema }))
+    .mutation(async ({ ctx, input }) => {
+      const run = await ctx.runModel.findById(input.runId);
+      if (!run) {
+        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
+      }
+
+      if (input.status !== 'completed' && input.status !== 'external') {
+        throw new TRPCError({
+          code: 'BAD_REQUEST',
+          message: 'External endpoint only supports setting status to completed or external',
+        });
+      }
+
+      if (run.status !== 'external' && run.status !== 'completed') {
+        throw new TRPCError({
+          code: 'BAD_REQUEST',
+          message: `Only external runs can be finalized via this endpoint. current=${run.status}`,
+        });
+      }
+
+      if (input.status === 'completed') {
+        const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
+        const hasAwaitingExternal = runTopics.some(
+          (topic) =>
+            topic.status === 'external' ||
+            (topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true,
+        );
+        if (hasAwaitingExternal) {
+          throw new TRPCError({
+            code: 'BAD_REQUEST',
+            message: 'Cannot set run to completed while external evaluation is pending',
+          });
+        }
+
+        const metrics = await ctx.runService.evaluateAndFinalizeRun({
+          run: { config: run.config, id: run.id, metrics: run.metrics, startedAt: run.startedAt },
+          runTopics,
+        });
+        const updated = await ctx.runModel.update(input.runId, { metrics, status: 'completed' });
+
+        return {
+          metrics,
+          runId: input.runId,
+          status: updated?.status ?? 'completed',
+          success: true,
+        };
+      }
+
+      const updated = await ctx.runModel.update(input.runId, { status: 'external' });
+
+      return {
+        runId: input.runId,
+        status: updated?.status ?? 'external',
+        success: true,
+      };
+    }),
+
+  // Same input and behaviour as `reportResult`, exposed under a second route name.
+  runTopicReportResult: agentEvalExternalProcedure
+    .input(reportResultItemSchema.extend({ runId: z.string() }))
+    .mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),
+
+  // Topics of a run with their test case; `onlyExternal` keeps only status === 'external'.
+  runTopicsList: agentEvalExternalProcedure
+    .input(z.object({ onlyExternal: z.boolean().default(false).optional(), runId: z.string() }))
+    .query(async ({ ctx, input }) => {
+      const run = await ctx.runModel.findById(input.runId);
+      if (!run) {
+        throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
+      }
+
+      const allRunTopics = await ctx.runTopicModel.findByRunId(input.runId);
+      const runTopics = input.onlyExternal
+        ? allRunTopics.filter((topic) => topic.status === 'external')
+        : allRunTopics;
+
+      return runTopics.map((topic) => {
+        const testCase = topic.testCase;
+
+        return {
+          createdAt: topic.createdAt,
+          evalResult: topic.evalResult,
+          passed: topic.passed,
+          runId: topic.runId,
+          score: topic.score,
+          status: topic.status,
+          testCase,
+          testCaseId: topic.testCaseId,
+          topic: topic.topic,
+          topicId: topic.topicId,
+        };
+      });
+    }),
+
+  // Number of test cases in a dataset, as reported by `countByDatasetId`.
+  testCasesCount: agentEvalExternalProcedure
+    .input(z.object({ datasetId: z.string() }))
+    .query(async ({ ctx, input }) => {
+      const count = await ctx.testCaseModel.countByDatasetId(input.datasetId);
+      return { count };
+    }),
+
+  // Threads of a topic, reduced to id, topicId and type.
+  threadsList: agentEvalExternalProcedure
+    .input(z.object({ topicId: z.string() }))
+    .query(async ({ ctx, input }) => {
+      const threads = await ctx.threadModel.queryByTopicId(input.topicId);
+
+      return threads.map((thread) => ({
+        id: thread.id,
+        topicId: thread.topicId,
+        type: thread.type,
+      }));
+    }),
+});
diff --git a/src/server/routers/lambda/index.ts b/src/server/routers/lambda/index.ts
index 285b0f9030..c8e31896b1 100644
--- a/src/server/routers/lambda/index.ts
+++ b/src/server/routers/lambda/index.ts
@@ -12,6 +12,7 @@ import { agentRouter } from './agent';
 import { agentBotProviderRouter } from './agentBotProvider';
 import { agentCronJobRouter } from './agentCronJob';
 import { agentEvalRouter } from './agentEval';
+import { agentEvalExternalRouter } from './agentEvalExternal';
 import { agentGroupRouter } from './agentGroup';
 import { agentSkillsRouter } from './agentSkills';
 import { aiAgentRouter } from './aiAgent';
@@ -57,6 +58,7 @@ export const lambdaRouter = router({
   agentBotProvider: agentBotProviderRouter,
   agentCronJob: agentCronJobRouter,
   agentEval: agentEvalRouter,
+  agentEvalExternal: agentEvalExternalRouter,
   agentSkills: agentSkillsRouter,
   aiAgent: aiAgentRouter,
   aiChat: aiChatRouter,
diff --git a/src/server/services/agentEvalRun/index.ts b/src/server/services/agentEvalRun/index.ts
index 865fdf3955..3e80b4a072 100644
--- a/src/server/services/agentEvalRun/index.ts
+++ b/src/server/services/agentEvalRun/index.ts
@@ -512,6 +512,7 @@ export class AgentEvalRunService {
     const passedCases = allTopics.filter((t) => t.status === 'passed').length;
     const failedCases = allTopics.filter((t) => t.status === 'failed').length;
     const errorCases = allTopics.filter((t) => t.status === 'error').length;
+    const externalCasesRT = allTopics.filter((t) => t.status === 'external').length;
     const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
 
     let sumCost = 0;
@@ -556,6 +557,7 @@ export class AgentEvalRunService {
         completedCases: completedCount,
         cost: sumCost ? roundCost(sumCost) : undefined,
         errorCases,
+        externalCases: externalCasesRT || undefined,
         failedCases,
         llmCalls: sumLlmCalls || undefined,
         passedCases,
@@ -667,6 +669,17 @@ export class AgentEvalRunService {
     const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
     const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
 
+    // ── External eval mode: agent finished, hand off to external scorer ──
+    if (evalMode === 'external') {
+      return {
+        ...baseMeta,
+        awaitingExternalEval: true,
+        passed: undefined,
+        score: undefined,
+        status: 'external',
+      };
+    }
+
     let effectiveRubrics: EvalBenchmarkRubric[];
     if (evalMode) {
       effectiveRubrics = [
@@ -722,6 +735,7 @@ export class AgentEvalRunService {
       passed?: boolean;
       rubricScores?: Array<{ reason?: string; rubricId: string; score: number }>;
       score?: number;
+      status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout';
       steps?: number;
       threadId: string;
       tokens?: number;
@@ -737,6 +751,14 @@ export class AgentEvalRunService {
         passed: meta.passed as boolean | undefined,
         rubricScores: meta.rubricScores as any,
         score: meta.score as number | undefined,
+        status: meta.status as
+          | 'error'
+          | 'external'
+          | 'failed'
+          | 'passed'
+          | 'running'
+          | 'timeout'
+          | undefined,
         steps: meta.steps as number | undefined,
         threadId: t.id,
         tokens: meta.tokens as number | undefined,
@@ -744,6 +766,20 @@ export class AgentEvalRunService {
       };
     });
 
+    // ── External eval mode: if all threads await external scoring, propagate that status ──
+    const allExternal = threadResults.every((t) => t.status === 'external');
+    if (allExternal) {
+      await this.runTopicModel.updateByRunAndTopic(runId, topicId, {
+        evalResult: {
+          awaitingExternalEval: true,
+          completionReason: 'external',
+          threads: threadResults,
+        } satisfies EvalRunTopicResult,
+        status: 'external',
+      });
+      return;
+    }
+
     // pass@k: at least one thread passed
     const anyPassed = threadResults.some((t) => t.passed === true);
     // pass^k: all threads passed
@@ -888,7 +924,7 @@ export class AgentEvalRunService {
     if (runTopic) {
       // Skip if topic is already in a terminal state (e.g. timeout marked by checkAndHandleRunTimeout).
       // The interrupted agent still fires the completion webhook, but we must not overwrite the result.
-      const terminalStates = ['passed', 'failed', 'error', 'timeout'];
+      const terminalStates = ['passed', 'failed', 'error', 'timeout', 'external'];
       if (runTopic.status && terminalStates.includes(runTopic.status)) {
         // Fall through to progress tracking below without modifying this topic
       } else {
@@ -945,11 +981,15 @@ export class AgentEvalRunService {
     // Aggregate real-time metrics from all RunTopics
     const allTopics = await this.runTopicModel.findByRunId(runId);
     const completedCount = allTopics.filter(
-      (t) => (t.evalResult && 'completionReason' in t.evalResult) || t.status === 'timeout',
+      (t) =>
+        (t.evalResult && 'completionReason' in t.evalResult) ||
+        t.status === 'timeout' ||
+        t.status === 'external',
     ).length;
     const passedCases = allTopics.filter((t) => t.status === 'passed').length;
     const failedCases = allTopics.filter((t) => t.status === 'failed').length;
     const errorCases = allTopics.filter((t) => t.status === 'error').length;
+    const externalCasesTraj = allTopics.filter((t) => t.status === 'external').length;
     const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
 
     let sumCost = 0;
@@ -995,6 +1035,7 @@ export class AgentEvalRunService {
         completedCases: completedCount,
         cost: sumCost ? roundCost(sumCost) : undefined,
         errorCases,
+        externalCases: externalCasesTraj || undefined,
         failedCases,
         llmCalls: sumLlmCalls || undefined,
         passedCases,
@@ -1048,6 +1089,7 @@ export class AgentEvalRunService {
     let passedCases = 0;
     let failedCases = 0;
     let errorCases = 0;
+    let externalCases = 0;
     let timeoutCases = 0;
     let totalScore = 0;
     // Sum of per-case averages (for per-case display)
@@ -1088,19 +1130,27 @@ export class AgentEvalRunService {
         failedCases++;
       } else if (runTopic.status === 'error') {
         errorCases++;
+      } else if (runTopic.status === 'external') {
+        externalCases++;
       } else if (runTopic.status === 'timeout') {
         timeoutCases++;
       }
 
-      // Only accumulate scores for evaluated (non-error, non-timeout) cases
-      if (runTopic.status !== 'error' && runTopic.status !== 'timeout' && runTopic.score != null) {
-        totalScore += runTopic.score;
-      }
-
-      // Accumulate per-rubric scores from existing evalResult (exclude error/timeout cases)
+      // Only accumulate scores for evaluated (non-error, non-timeout, non-external) cases
       if (
         runTopic.status !== 'error' &&
         runTopic.status !== 'timeout' &&
+        runTopic.status !== 'external' &&
+        runTopic.score != null
+      ) {
+        totalScore += runTopic.score;
+      }
+
+      // Accumulate per-rubric scores from existing evalResult (exclude error/timeout/external cases)
+      if (
+        runTopic.status !== 'error' &&
+        runTopic.status !== 'timeout' &&
+        runTopic.status !== 'external' &&
         existingResult?.rubricScores
       ) {
         for (const rs of existingResult.rubricScores) {
@@ -1138,6 +1188,7 @@ export class AgentEvalRunService {
       cost: sumCost ? roundCost(sumCost) : undefined,
       duration: wallClockDuration || undefined,
       errorCases,
+      externalCases: externalCases || undefined,
       failedCases,
       llmCalls: sumLlmCalls || undefined,
       passRate: totalCases > 0 ? passedCases / totalCases : 0,
@@ -1216,6 +1267,15 @@ export class AgentEvalRunService {
     const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
     const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
 
+    // ── External eval mode: agent finished, hand off to external scorer ──
+    if (evalMode === 'external') {
+      await this.runTopicModel.updateByRunAndTopic(runTopic.runId, runTopic.topicId, {
+        evalResult: { ...existingResult, awaitingExternalEval: true },
+        status: 'external',
+      });
+      return;
+    }
+
     let effectiveRubrics: EvalBenchmarkRubric[];
     if (evalMode) {
       effectiveRubrics = [
@@ -1324,7 +1384,13 @@ export class AgentEvalRunService {
       });
 
       const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
-      const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
+      const externalCount = metrics.externalCases || 0;
+      const runStatus =
+        externalCount > 0
+          ? 'external'
+          : nonSuccessCases >= metrics.totalCases
+            ? 'failed'
+            : 'completed';
 
       await this.runModel.update(run.id, { metrics, status: runStatus });
     } else {