mirror of
https://github.com/lobehub/lobehub.git
synced 2026-03-27 13:29:15 +07:00
✨ feat(eval): add external scoring mode (#12729)
* wip: add llm relevant & BrowseComp * wip: add widesearch desc * wip: dsqa, hle, widesearch * wip: add dsqa * wip: add awaiting eval status for runs * wip: add awaiting status for run * wip: adjust hle-verified * 🐛 fix: browsecomp topics * 📝 docs: add annotations * wip: add awaiting status for pass@k * wip: add complete status * wip: update theard dots * wip: update run status page * wip: remove useless impl * wip: update prompt * ✨ feat: add external eval routes * wip: add eval cli * 🐛 fix: support authoritize in no browser environment * wip: pass tests * ♻️ refactor: remove tests * ♻️ refactor: mo camel case
This commit is contained in:
285
apps/cli/src/commands/eval.test.ts
Normal file
285
apps/cli/src/commands/eval.test.ts
Normal file
@@ -0,0 +1,285 @@
|
||||
import { Command } from 'commander';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

// Mocked tRPC client surface for the agentEvalExternal router. Declared via
// vi.hoisted so the vi.mock factories below may reference it safely.
const { mockTrpcClient } = vi.hoisted(() => ({
  mockTrpcClient: {
    agentEvalExternal: {
      datasetGet: { query: vi.fn() },
      messagesList: { query: vi.fn() },
      runGet: { query: vi.fn() },
      runSetStatus: { mutate: vi.fn() },
      runTopicReportResult: { mutate: vi.fn() },
      runTopicsList: { query: vi.fn() },
      testCasesCount: { query: vi.fn() },
      threadsList: { query: vi.fn() },
    },
  },
}));

const { getTrpcClientMock } = vi.hoisted(() => ({
  getTrpcClientMock: vi.fn(),
}));

vi.mock('../api/client', () => ({
  getTrpcClient: getTrpcClientMock,
}));

vi.mock('../utils/logger', () => ({
  log: {
    debug: vi.fn(),
    error: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
  },
  setVerbose: vi.fn(),
}));

// Imported after the mocks above so the command module resolves the stubs.
// eslint-disable-next-line import-x/first
import { log } from '../utils/logger';
// eslint-disable-next-line import-x/first
import { registerEvalCommand } from './eval';

describe('eval command', () => {
  let exitSpy: ReturnType<typeof vi.spyOn>;
  let logSpy: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    getTrpcClientMock.mockResolvedValue(mockTrpcClient);
    // process.exit is stubbed so failing commands don't kill the test runner.
    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

    // Reset every mocked procedure so call assertions are isolated per test.
    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
      for (const fn of Object.values(method)) {
        (fn as ReturnType<typeof vi.fn>).mockReset();
      }
    }
  });

  afterEach(() => {
    exitSpy.mockRestore();
    logSpy.mockRestore();
    vi.clearAllMocks();
  });

  // Fresh commander program per test; exitOverride turns parse exits into throws.
  const createProgram = () => {
    const program = new Command();
    program.exitOverride();
    registerEvalCommand(program);
    return program;
  };

  it('should call runGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
      config: { k: 1 },
      datasetId: 'dataset-1',
      id: 'run-1',
    });

    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);

    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });

    // --json mode prints a versioned success envelope on stdout.
    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: {
        config: { k: 1 },
        datasetId: 'dataset-1',
        id: 'run-1',
      },
      error: null,
      ok: true,
      version: 'v1',
    });
  });

  it('should call datasetGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
      id: 'dataset-1',
      metadata: { preset: 'deepsearchqa' },
    });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'dataset',
      'get',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should pass onlyExternal to runTopicsList', async () => {
    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topics',
      'list',
      '--run-id',
      'run-1',
      '--only-external',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
      onlyExternal: true,
      runId: 'run-1',
    });
  });

  it('should pass topicId and threadId to messagesList', async () => {
    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'messages',
      'list',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should parse and report run-topic result', async () => {
    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
      success: true,
    });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run-topic',
      'report-result',
      '--run-id',
      'run-1',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--score',
      '0.91',
      '--correct',
      'true',
      '--result-json',
      '{"grade":"A"}',
      '--json',
    ]);

    // Flag values are coerced by the custom option parsers (number, boolean, JSON).
    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
      correct: true,
      result: { grade: 'A' },
      runId: 'run-1',
      score: 0.91,
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should update run status', async () => {
    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
      runId: 'run-1',
      status: 'completed',
      success: true,
    });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'set-status',
      '--run-id',
      'run-1',
      '--status',
      'completed',
    ]);

    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
      runId: 'run-1',
      status: 'completed',
    });
    // Without --json the command prints a human-readable success line.
    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
  });

  it('should output json error envelope when command fails', async () => {
    // Mirrors a tRPC client error: message plus a data.code payload.
    const error = Object.assign(new Error('Run not found'), {
      data: { code: 'NOT_FOUND' },
    });
    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'run',
      'get',
      '--run-id',
      'run-404',
      '--json',
    ]);

    const payload = JSON.parse(logSpy.mock.calls[0][0]);
    expect(payload).toEqual({
      data: null,
      error: { code: 'NOT_FOUND', message: 'Run not found' },
      ok: false,
      version: 'v1',
    });
    expect(exitSpy).toHaveBeenCalledWith(1);
  });

  it('should query test case count', async () => {
    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });

    const program = createProgram();
    await program.parseAsync([
      'node',
      'test',
      'eval',
      'test-cases',
      'count',
      '--dataset-id',
      'dataset-1',
      '--json',
    ]);

    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should log plain error without --json', async () => {
    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));

    const program = createProgram();
    await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']);

    expect(log.error).toHaveBeenCalledWith('boom');
    expect(exitSpy).toHaveBeenCalledWith(1);
  });
});
|
||||
326
apps/cli/src/commands/eval.ts
Normal file
326
apps/cli/src/commands/eval.ts
Normal file
@@ -0,0 +1,326 @@
|
||||
import type { Command } from 'commander';
|
||||
import { InvalidArgumentError } from 'commander';
|
||||
import pc from 'picocolors';
|
||||
|
||||
import { getTrpcClient } from '../api/client';
|
||||
import { log } from '../utils/logger';
|
||||
|
||||
/** Version tag stamped on every JSON envelope this CLI emits. */
const JSON_VERSION = 'v1' as const;

/** Error payload embedded in a failed JSON envelope. */
interface JsonError {
  /** Optional machine-readable code (e.g. a tRPC code such as NOT_FOUND). */
  code?: string;
  /** Human-readable description of the failure. */
  message: string;
}

/** Uniform stdout envelope for `--json` output; exactly one of data/error is non-null. */
interface JsonEnvelope<T> {
  data: T | null;
  error: JsonError | null;
  ok: boolean;
  version: typeof JSON_VERSION;
}

/** Shared `--json` flag available on every subcommand. */
interface JsonOption {
  json?: boolean;
}

/** Options for `eval run get`. */
interface RunGetOptions extends JsonOption {
  runId: string;
}

/** Options for `eval run set-status`; the external API accepts only these two states. */
interface RunSetStatusOptions extends JsonOption {
  runId: string;
  status: 'completed' | 'external';
}

/** Options for `eval dataset get`. */
interface DatasetGetOptions extends JsonOption {
  datasetId: string;
}

/** Options for `eval run-topics list`. */
interface RunTopicsListOptions extends JsonOption {
  /** When set, only topics pending external evaluation are returned. */
  onlyExternal?: boolean;
  runId: string;
}

/** Options for `eval threads list`. */
interface ThreadsListOptions extends JsonOption {
  topicId: string;
}

/** Options for `eval messages list`. */
interface MessagesListOptions extends JsonOption {
  /** Optional thread filter; omit to list messages across the whole topic. */
  threadId?: string;
  topicId: string;
}

/** Options for `eval test-cases count`. */
interface TestCasesCountOptions extends JsonOption {
  datasetId: string;
}

/** Options for `eval run-topic report-result`. */
interface RunTopicReportResultOptions extends JsonOption {
  /** Whether the external scorer judged the answer correct (parsed from --correct). */
  correct: boolean;
  /** Raw evaluation result object (parsed from --result-json). */
  resultJson: Record<string, unknown>;
  runId: string;
  /** Numeric score (parsed from --score). */
  score: number;
  /** Thread ID; required when the run's k > 1. */
  threadId?: string;
  topicId: string;
}
|
||||
|
||||
const printJson = (data: unknown) => {
|
||||
console.log(JSON.stringify(data, null, 2));
|
||||
};
|
||||
|
||||
const outputJsonSuccess = (data: unknown) => {
|
||||
const payload: JsonEnvelope<unknown> = {
|
||||
data,
|
||||
error: null,
|
||||
ok: true,
|
||||
version: JSON_VERSION,
|
||||
};
|
||||
printJson(payload);
|
||||
};
|
||||
|
||||
const isRecord = (value: unknown): value is Record<string, unknown> =>
|
||||
typeof value === 'object' && value !== null;
|
||||
|
||||
const toJsonError = (error: unknown): JsonError => {
|
||||
if (error instanceof Error) {
|
||||
const maybeData = (error as Error & { data?: { code?: string } }).data;
|
||||
const code = maybeData?.code;
|
||||
|
||||
return {
|
||||
code: typeof code === 'string' ? code : undefined,
|
||||
message: error.message,
|
||||
};
|
||||
}
|
||||
|
||||
if (isRecord(error)) {
|
||||
const code = typeof error.code === 'string' ? error.code : undefined;
|
||||
const message = typeof error.message === 'string' ? error.message : 'Unknown error';
|
||||
return { code, message };
|
||||
}
|
||||
|
||||
return { message: String(error) };
|
||||
};
|
||||
|
||||
const handleCommandError = (error: unknown, json: boolean) => {
|
||||
const normalized = toJsonError(error);
|
||||
|
||||
if (json) {
|
||||
const payload: JsonEnvelope<null> = {
|
||||
data: null,
|
||||
error: normalized,
|
||||
ok: false,
|
||||
version: JSON_VERSION,
|
||||
};
|
||||
printJson(payload);
|
||||
} else {
|
||||
log.error(normalized.message);
|
||||
}
|
||||
|
||||
process.exit(1);
|
||||
};
|
||||
|
||||
const parseScore = (value: string) => {
|
||||
const score = Number(value);
|
||||
if (!Number.isFinite(score)) {
|
||||
throw new InvalidArgumentError(`Invalid score: ${value}`);
|
||||
}
|
||||
return score;
|
||||
};
|
||||
|
||||
const parseBoolean = (value: string) => {
|
||||
const normalized = value.trim().toLowerCase();
|
||||
if (['1', 'true', 'yes'].includes(normalized)) return true;
|
||||
if (['0', 'false', 'no'].includes(normalized)) return false;
|
||||
throw new InvalidArgumentError(`Invalid boolean value: ${value}`);
|
||||
};
|
||||
|
||||
const parseResultJson = (value: string) => {
|
||||
let parsed: unknown;
|
||||
try {
|
||||
parsed = JSON.parse(value);
|
||||
} catch {
|
||||
throw new InvalidArgumentError('Invalid JSON value for --result-json');
|
||||
}
|
||||
|
||||
if (!isRecord(parsed) || Array.isArray(parsed)) {
|
||||
throw new InvalidArgumentError('--result-json must be a JSON object');
|
||||
}
|
||||
|
||||
return parsed;
|
||||
};
|
||||
|
||||
const parseRunStatus = (value: string) => {
|
||||
if (value !== 'completed' && value !== 'external') {
|
||||
throw new InvalidArgumentError("Only 'completed' and 'external' are supported");
|
||||
}
|
||||
|
||||
return value as 'completed' | 'external';
|
||||
};
|
||||
|
||||
const executeCommand = async (
|
||||
options: JsonOption,
|
||||
action: () => Promise<unknown>,
|
||||
successMessage?: string,
|
||||
) => {
|
||||
try {
|
||||
const data = await action();
|
||||
if (options.json) {
|
||||
outputJsonSuccess(data);
|
||||
return;
|
||||
}
|
||||
|
||||
if (successMessage) {
|
||||
console.log(`${pc.green('OK')} ${successMessage}`);
|
||||
return;
|
||||
}
|
||||
|
||||
printJson(data);
|
||||
} catch (error) {
|
||||
handleCommandError(error, Boolean(options.json));
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Register the `eval` command tree for external evaluation workflows.
 *
 * Every leaf subcommand funnels through `executeCommand`, so each one gains
 * the shared `--json` envelope output and unified error handling/exit codes.
 */
export function registerEvalCommand(program: Command) {
  const evalCmd = program.command('eval').description('Manage external evaluation workflows');

  const runCmd = evalCmd.command('run').description('Manage evaluation runs');

  // eval run get --run-id <id>
  runCmd
    .command('get')
    .description('Get run information')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunGetOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.runGet.query({ runId: options.runId });
      }),
    );

  // eval run set-status --run-id <id> --status <completed|external>
  runCmd
    .command('set-status')
    .description('Set run status (external API supports completed or external)')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunSetStatusOptions) =>
      executeCommand(
        options,
        async () => {
          const client = await getTrpcClient();
          return client.agentEvalExternal.runSetStatus.mutate({
            runId: options.runId,
            status: options.status,
          });
        },
        // Human-readable confirmation used when --json is not set.
        `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`,
      ),
    );

  // eval dataset get --dataset-id <id>
  evalCmd
    .command('dataset')
    .description('Manage evaluation datasets')
    .command('get')
    .description('Get dataset information')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: DatasetGetOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
      }),
    );

  // eval run-topics list --run-id <id> [--only-external]
  evalCmd
    .command('run-topics')
    .description('Manage run topics')
    .command('list')
    .description('List topics in a run')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--only-external', 'Only return topics pending external evaluation')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicsListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.runTopicsList.query({
          // Always send an explicit boolean; the flag is undefined when omitted.
          onlyExternal: Boolean(options.onlyExternal),
          runId: options.runId,
        });
      }),
    );

  // eval threads list --topic-id <id>
  evalCmd
    .command('threads')
    .description('Manage evaluation threads')
    .command('list')
    .description('List threads by topic')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: ThreadsListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
      }),
    );

  // eval messages list --topic-id <id> [--thread-id <id>]
  evalCmd
    .command('messages')
    .description('Manage evaluation messages')
    .command('list')
    .description('List messages by topic and optional thread')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: MessagesListOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.messagesList.query({
          threadId: options.threadId,
          topicId: options.topicId,
        });
      }),
    );

  // eval test-cases count --dataset-id <id>
  evalCmd
    .command('test-cases')
    .description('Manage evaluation test cases')
    .command('count')
    .description('Count test cases by dataset')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: TestCasesCountOptions) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
      }),
    );

  // eval run-topic report-result — submit one externally-scored result.
  evalCmd
    .command('run-topic')
    .description('Manage evaluation run-topic reporting')
    .command('report-result')
    .description('Report one evaluation result for a run topic')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID (required for k > 1)')
    .requiredOption('--score <score>', 'Evaluation score', parseScore)
    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicReportResultOptions) =>
      executeCommand(
        options,
        async () => {
          const client = await getTrpcClient();
          return client.agentEvalExternal.runTopicReportResult.mutate({
            correct: options.correct,
            result: options.resultJson,
            runId: options.runId,
            score: options.score,
            threadId: options.threadId,
            topicId: options.topicId,
          });
        },
        `Reported result for topic ${pc.bold(options.topicId)}`,
      ),
    );
}
|
||||
@@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc';
|
||||
import { registerFileCommand } from './commands/file';
|
||||
import { registerGenerateCommand } from './commands/generate';
|
||||
import { registerKbCommand } from './commands/kb';
|
||||
import { registerEvalCommand } from './commands/eval';
|
||||
import { registerLoginCommand } from './commands/login';
|
||||
import { registerLogoutCommand } from './commands/logout';
|
||||
import { registerMemoryCommand } from './commands/memory';
|
||||
@@ -44,5 +45,6 @@ registerModelCommand(program);
|
||||
registerProviderCommand(program);
|
||||
registerPluginCommand(program);
|
||||
registerConfigCommand(program);
|
||||
registerEvalCommand(program);
|
||||
|
||||
program.parse();
|
||||
|
||||
@@ -157,13 +157,15 @@
|
||||
"difficulty.easy": "Easy",
|
||||
"difficulty.hard": "Hard",
|
||||
"difficulty.medium": "Medium",
|
||||
"evalMode.answer-relevance": "LLM Relevance",
|
||||
"evalMode.answer-relevance.desc": "Use LLM to evaluate answer relevance (yes or no)",
|
||||
"evalMode.contains": "Contains Match",
|
||||
"evalMode.contains.desc": "Output must contain the expected text",
|
||||
"evalMode.equals": "Exact Match",
|
||||
"evalMode.equals.desc": "Output must be exactly the same as expected",
|
||||
"evalMode.label": "Eval Mode",
|
||||
"evalMode.llm-rubric": "LLM Judge",
|
||||
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality",
|
||||
"evalMode.llm-rubric.desc": "Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)",
|
||||
"evalMode.placeholder": "Select eval mode",
|
||||
"evalMode.prompt.label": "Judge Prompt",
|
||||
"evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge",
|
||||
@@ -256,12 +258,16 @@
|
||||
"run.running.hint": "Evaluation is running, results will appear shortly...",
|
||||
"run.status.aborted": "Aborted",
|
||||
"run.status.completed": "Completed",
|
||||
"run.status.completed.tooltip": "This evaluation has completed running all test cases and scoring.",
|
||||
"run.status.error": "Run Error",
|
||||
"run.status.external": "External",
|
||||
"run.status.external.tooltip": "This evaluation is waiting for external scoring. Results will be updated when scoring is complete.",
|
||||
"run.status.failed": "Failed",
|
||||
"run.status.idle": "Idle",
|
||||
"run.status.pending": "Pending",
|
||||
"run.status.running": "Running",
|
||||
"run.status.timeout": "Timeout",
|
||||
"sidebar": "Evaluation",
|
||||
"sidebar.benchmarks": "Benchmarks",
|
||||
"sidebar.dashboard": "Dashboard",
|
||||
"sidebar.datasets": "Datasets",
|
||||
|
||||
@@ -161,6 +161,8 @@
|
||||
"evalMode.contains.desc": "输出中必须包含期望的文本",
|
||||
"evalMode.equals": "精确匹配",
|
||||
"evalMode.equals.desc": "输出必须与期望内容完全一致",
|
||||
"evalMode.external": "外部评估",
|
||||
"evalMode.external.desc": "智能体完成运行后,由外部系统提交评估结果",
|
||||
"evalMode.label": "评估模式",
|
||||
"evalMode.llm-rubric": "LLM 评判",
|
||||
"evalMode.llm-rubric.desc": "使用 LLM 评估输出质量",
|
||||
@@ -256,7 +258,10 @@
|
||||
"run.running.hint": "评测进行中,结果即将呈现...",
|
||||
"run.status.aborted": "已终止",
|
||||
"run.status.completed": "已完成",
|
||||
"run.status.completed.tooltip": "评测已完成运行,所有结果已评估。",
|
||||
"run.status.error": "运行出错",
|
||||
"run.status.external": "待外部评测",
|
||||
"run.status.external.tooltip": "智能体已完成运行,等待外部系统提交评估结果。",
|
||||
"run.status.failed": "失败",
|
||||
"run.status.idle": "待开始",
|
||||
"run.status.pending": "等待中",
|
||||
|
||||
@@ -50,6 +50,8 @@ export class AgentEvalDatasetModel {
|
||||
benchmarkId: agentEvalDatasets.benchmarkId,
|
||||
createdAt: agentEvalDatasets.createdAt,
|
||||
description: agentEvalDatasets.description,
|
||||
evalConfig: agentEvalDatasets.evalConfig,
|
||||
evalMode: agentEvalDatasets.evalMode,
|
||||
id: agentEvalDatasets.id,
|
||||
identifier: agentEvalDatasets.identifier,
|
||||
metadata: agentEvalDatasets.metadata,
|
||||
|
||||
@@ -31,7 +31,7 @@ export class AgentEvalRunModel {
|
||||
datasetId?: string;
|
||||
limit?: number;
|
||||
offset?: number;
|
||||
status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted';
|
||||
status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted' | 'external';
|
||||
}) => {
|
||||
const conditions = [eq(agentEvalRuns.userId, this.userId)];
|
||||
|
||||
|
||||
@@ -43,6 +43,7 @@ const evalModes = [
|
||||
'similar',
|
||||
'levenshtein',
|
||||
'rubric',
|
||||
'external',
|
||||
] as const;
|
||||
|
||||
// ============================================
|
||||
@@ -181,7 +182,7 @@ export const agentEvalRuns = pgTable(
|
||||
name: text('name'),
|
||||
|
||||
status: text('status', {
|
||||
enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted'],
|
||||
enum: ['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'],
|
||||
})
|
||||
.default('idle')
|
||||
.notNull(),
|
||||
@@ -228,7 +229,7 @@ export const agentEvalRunTopics = pgTable(
|
||||
.notNull(),
|
||||
|
||||
status: text('status', {
|
||||
enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout'],
|
||||
enum: ['pending', 'running', 'passed', 'failed', 'error', 'timeout', 'external', 'completed'],
|
||||
}),
|
||||
|
||||
score: real('score'),
|
||||
|
||||
@@ -87,12 +87,20 @@ export const evaluate = async (
|
||||
const candidates: string[] = JSON.parse(expected);
|
||||
const results: MatchResult[] = [];
|
||||
for (const c of candidates) {
|
||||
results.push(await match({ actual: extracted, expected: c, rubric }, matchContext));
|
||||
results.push(
|
||||
await match(
|
||||
{ input: testCase.input, actual: extracted, expected: c, rubric },
|
||||
matchContext,
|
||||
),
|
||||
);
|
||||
}
|
||||
const best = results.reduce((a, b) => (a.score >= b.score ? a : b));
|
||||
result = best;
|
||||
} else {
|
||||
result = await match({ actual: extracted, expected, rubric }, matchContext);
|
||||
result = await match(
|
||||
{ input: testCase.input, actual: extracted, expected, rubric },
|
||||
matchContext,
|
||||
);
|
||||
}
|
||||
|
||||
rubricResults.push({
|
||||
|
||||
9
packages/eval-rubric/src/matchers/external.ts
Normal file
9
packages/eval-rubric/src/matchers/external.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import type { MatchResult } from './types';
|
||||
|
||||
export const matchExternal = async (): Promise<MatchResult> => {
|
||||
return {
|
||||
passed: false,
|
||||
score: 0,
|
||||
reason: 'Waiting for external evaluation...',
|
||||
};
|
||||
};
|
||||
@@ -4,8 +4,10 @@ import { matchAnyOf } from './anyOf';
|
||||
import { matchContains } from './contains';
|
||||
import { matchEndsWith } from './endsWith';
|
||||
import { matchEquals } from './equals';
|
||||
import { matchExternal } from './external';
|
||||
import { matchJsonSchema } from './jsonSchema';
|
||||
import { matchLevenshtein } from './levenshtein';
|
||||
import { matchLLMEq } from './llmEq';
|
||||
import { matchLLMRubric } from './llmRubric';
|
||||
import { matchNumeric } from './numeric';
|
||||
import { matchRegex } from './regex';
|
||||
@@ -18,10 +20,15 @@ export type { GenerateObjectPayload, MatchContext, MatchResult } from './types';
|
||||
* Run a single rubric matcher against actual vs expected
|
||||
*/
|
||||
export const match = async (
|
||||
params: { actual: string; expected: string | undefined; rubric: EvalBenchmarkRubric },
|
||||
params: {
|
||||
input: string;
|
||||
actual: string;
|
||||
expected: string | undefined;
|
||||
rubric: EvalBenchmarkRubric;
|
||||
},
|
||||
context?: MatchContext,
|
||||
): Promise<MatchResult> => {
|
||||
const { actual, expected, rubric } = params;
|
||||
const { actual, expected, rubric, input } = params;
|
||||
const { type, config } = rubric;
|
||||
|
||||
switch (type) {
|
||||
@@ -57,6 +64,10 @@ export const match = async (
|
||||
return matchLevenshtein(actual, expected, config);
|
||||
}
|
||||
|
||||
case 'answer-relevance': {
|
||||
return matchLLMEq(input, actual, expected, rubric, context);
|
||||
}
|
||||
|
||||
case 'llm-rubric': {
|
||||
return matchLLMRubric(actual, expected, rubric, context);
|
||||
}
|
||||
@@ -65,6 +76,10 @@ export const match = async (
|
||||
return matchJsonSchema(actual, config);
|
||||
}
|
||||
|
||||
case 'external': {
|
||||
return matchExternal();
|
||||
}
|
||||
|
||||
default: {
|
||||
return {
|
||||
passed: false,
|
||||
|
||||
89
packages/eval-rubric/src/matchers/llmEq.ts
Normal file
89
packages/eval-rubric/src/matchers/llmEq.ts
Normal file
@@ -0,0 +1,89 @@
|
||||
import type { EvalBenchmarkRubric, RubricConfigLLM } from '@lobechat/types';
|
||||
|
||||
import type { MatchContext, MatchResult } from './types';
|
||||
|
||||
/**
 * Default system prompt for the binary LLM equivalence judge:
 * extract the final answer from the response, compare it only against the
 * reference answer, and treat any uncertainty as incorrect (score 0).
 */
const DEFAULT_SYSTEM_ROLE = [
  'You are an expert evaluation judge. Your task is to score how well an AI output meets the given criteria.',
  'Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.',
  'Your judgement must be in the format and criteria specified below:',
  "extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.",
  'reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.',
  'Scoring rules:',
  'score: Return 1 only when extracted_final_answer clearly and unambiguously matches [correct_answer], or is within a small margin of error for numerical problems.',
  'score: Return 0 when extracted_final_answer is incorrect, missing, ambiguous, non-equivalent, or when you are uncertain.',
  'Treat uncertainty as incorrect (score = 0).',
  'Respond with a JSON object containing ',
  '"score" (number: 0 or 1)',
  'and "reason" (brief explanation for the judgement).',
].join('\n');

/**
 * JSON schema constraining the judge's structured output to
 * `{ score: 0 | 1, reason: string }`; both fields are required.
 */
const JUDGE_SCORE_SCHEMA: Record<string, unknown> = {
  additionalProperties: false,
  properties: {
    score: {
      description: 'Binary score for judgement: 1=correct, 0=incorrect/uncertain',
      enum: [0, 1],
      type: 'number',
    },
    reason: { description: 'Brief explanation for the judgement', type: 'string' },
  },
  required: ['score', 'reason'],
  type: 'object',
};
|
||||
|
||||
function buildJudgeUserPrompt(
|
||||
question: string,
|
||||
actual: string,
|
||||
expected: string | undefined,
|
||||
): string {
|
||||
const parts = [`[question]\n${question}`, `[response]\n${actual}`];
|
||||
if (expected) {
|
||||
parts.push(`[correct_answer]\n${expected}`);
|
||||
}
|
||||
return parts.join('\n\n');
|
||||
}
|
||||
|
||||
export const matchLLMEq = async (
|
||||
question: string,
|
||||
actual: string,
|
||||
expected: string | undefined,
|
||||
rubric: EvalBenchmarkRubric,
|
||||
context?: MatchContext,
|
||||
): Promise<MatchResult> => {
|
||||
if (!context?.generateObject) {
|
||||
return { passed: false, reason: 'LLM judge not available', score: 0 };
|
||||
}
|
||||
|
||||
const cfg = rubric.config as RubricConfigLLM;
|
||||
const model = cfg.model || context.judgeModel;
|
||||
|
||||
if (!model) {
|
||||
return { passed: false, reason: 'No judge model configured', score: 0 };
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await context.generateObject({
|
||||
messages: [
|
||||
{ content: cfg.systemRole || DEFAULT_SYSTEM_ROLE, role: 'system' },
|
||||
{ content: buildJudgeUserPrompt(question, actual, expected), role: 'user' },
|
||||
],
|
||||
model,
|
||||
provider: cfg.provider,
|
||||
schema: JUDGE_SCORE_SCHEMA,
|
||||
});
|
||||
|
||||
const score = result?.score === 1 ? 1 : 0;
|
||||
|
||||
return {
|
||||
passed: score === 1,
|
||||
reason: result?.reason,
|
||||
score,
|
||||
};
|
||||
} catch (error) {
|
||||
return {
|
||||
passed: false,
|
||||
reason: `LLM judge failed: ${error instanceof Error ? error.message : String(error)}`,
|
||||
score: 0,
|
||||
};
|
||||
}
|
||||
};
|
||||
@@ -64,6 +64,10 @@ export const matchLLMRubric = async (
|
||||
schema: JUDGE_SCORE_SCHEMA,
|
||||
});
|
||||
|
||||
if (!result?.score) {
|
||||
return { passed: false, reason: 'LLM judge did not return a score', score: 0 };
|
||||
}
|
||||
|
||||
const score = Math.max(0, Math.min(1, result.score));
|
||||
const threshold = rubric.threshold ?? 0.6;
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ export interface EvalTestCaseMetadata {
|
||||
/**
|
||||
* Evaluation run status
|
||||
*/
|
||||
export type EvalRunStatus = 'aborted' | 'completed' | 'failed' | 'pending' | 'running';
|
||||
export type EvalRunStatus = 'aborted' | 'completed' | 'external' | 'failed' | 'pending' | 'running';
|
||||
|
||||
/**
|
||||
* Evaluation run configuration
|
||||
@@ -96,6 +96,7 @@ export interface EvalRunMetrics {
|
||||
cost?: number;
|
||||
duration?: number;
|
||||
errorCases?: number;
|
||||
externalCases?: number;
|
||||
failedCases: number;
|
||||
llmCalls?: number;
|
||||
passAllK?: number;
|
||||
@@ -183,6 +184,8 @@ export interface EvalRunTopicResult {
|
||||
completionReason?: string;
|
||||
operationId?: string;
|
||||
rubricScores?: EvalRubricScore[];
|
||||
/** Set when evalMode is 'external' — agent finished, awaiting external scoring */
|
||||
awaitingExternalEval?: boolean;
|
||||
}
|
||||
/*eslint-enable perfectionist/sort-interfaces */
|
||||
|
||||
@@ -194,14 +197,16 @@ export interface EvalThreadResult {
|
||||
cost?: number;
|
||||
duration?: number;
|
||||
error?: string;
|
||||
llmCalls?: number;
|
||||
operationId?: string;
|
||||
passed?: boolean;
|
||||
rubricScores?: EvalRubricScore[];
|
||||
score?: number;
|
||||
status?: 'error' | 'failed' | 'passed' | 'running' | 'timeout';
|
||||
status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout' | 'completed';
|
||||
steps?: number;
|
||||
threadId: string;
|
||||
tokens?: number;
|
||||
toolCalls?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -11,6 +11,7 @@ export type AgentEvalRunStatus =
|
||||
| 'failed'
|
||||
| 'idle'
|
||||
| 'pending'
|
||||
| 'external'
|
||||
| 'running';
|
||||
|
||||
export interface AgentEvalRunTargetAgent {
|
||||
|
||||
@@ -22,6 +22,8 @@ export type RubricType =
|
||||
// Similarity
|
||||
| 'similar'
|
||||
| 'levenshtein'
|
||||
// External evaluation
|
||||
| 'external'
|
||||
// Composite
|
||||
| 'rubric';
|
||||
|
||||
|
||||
@@ -66,9 +66,18 @@ export const { POST } = serve<FinalizeRunPayload>(
|
||||
|
||||
log('Metrics: %O', metrics);
|
||||
|
||||
// Step 4: Update run status (failed if all cases errored/timed out)
|
||||
// Step 4: Update run status
|
||||
// external: any topic awaits external scoring → whole run waits too
|
||||
// failed: all cases are non-success (error/timeout)
|
||||
// completed: everything else
|
||||
const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
|
||||
const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
|
||||
const externalCount = metrics.externalCases || 0;
|
||||
const runStatus =
|
||||
externalCount > 0
|
||||
? 'external'
|
||||
: nonSuccessCases >= metrics.totalCases
|
||||
? 'failed'
|
||||
: 'completed';
|
||||
|
||||
await context.run('agent-eval-run:update-run', async () => {
|
||||
const runModel = new AgentEvalRunModel(db, userId);
|
||||
|
||||
@@ -173,9 +173,14 @@ export default {
|
||||
'evalMode.contains.desc': 'Output must contain the expected text',
|
||||
'evalMode.equals': 'Exact Match',
|
||||
'evalMode.equals.desc': 'Output must be exactly the same as expected',
|
||||
'evalMode.external': 'External Eval',
|
||||
'evalMode.external.desc': 'Agent runs to completion; scoring is handled by an external system',
|
||||
'evalMode.label': 'Eval Mode',
|
||||
'evalMode.llm-rubric': 'LLM Judge',
|
||||
'evalMode.llm-rubric.desc': 'Use LLM to evaluate output quality',
|
||||
'evalMode.llm-rubric.desc':
|
||||
'Use LLM to evaluate output quality based on custom criteria (0.0 to 1.0)',
|
||||
'evalMode.answer-relevance': 'LLM Relevance',
|
||||
'evalMode.answer-relevance.desc': 'Use LLM to evaluate answer relevance (yes or no)',
|
||||
'evalMode.placeholder': 'Select eval mode',
|
||||
'evalMode.prompt.label': 'Judge Prompt',
|
||||
'evalMode.prompt.placeholder': 'Enter the evaluation criteria or prompt for LLM judge',
|
||||
@@ -204,6 +209,8 @@ export default {
|
||||
'run.idle.hint': 'Click Start to begin evaluation',
|
||||
'run.pending.hint': 'Evaluation is queued, waiting to start...',
|
||||
'run.running.hint': 'Evaluation is running, results will appear shortly...',
|
||||
'run.external.hint':
|
||||
'Running completed. Waiting for external system to submit evaluation results ...',
|
||||
|
||||
'run.filter.active': 'Active',
|
||||
'run.filter.empty': 'No runs match the current filter.',
|
||||
@@ -249,6 +256,9 @@ export default {
|
||||
'run.detail.report': 'Evaluation Summary',
|
||||
'run.detail.config': 'Evaluation Config',
|
||||
'run.detail.configSnapshot': 'Configuration Snapshot',
|
||||
'run.detail.copyRunId': 'Copy Run ID',
|
||||
'run.detail.copyRunIdFailed': 'Failed to copy Run ID',
|
||||
'run.detail.copyRunIdSuccess': 'Run ID copied',
|
||||
'run.detail.dataset': 'Dataset',
|
||||
'run.detail.model': 'Model',
|
||||
'run.detail.overview': 'Overview',
|
||||
@@ -279,7 +289,11 @@ export default {
|
||||
|
||||
'run.status.aborted': 'Aborted',
|
||||
'run.status.completed': 'Completed',
|
||||
'run.status.completed.tooltip': 'The run and external scoring are completed.',
|
||||
'run.status.error': 'Run Error',
|
||||
'run.status.external': 'Awaiting Eval',
|
||||
'run.status.external.tooltip':
|
||||
'The agent has finished running. Waiting for an external system to submit evaluation results.',
|
||||
'run.status.failed': 'Failed',
|
||||
'run.status.idle': 'Idle',
|
||||
'run.status.pending': 'Pending',
|
||||
|
||||
@@ -208,6 +208,7 @@ const DatasetDetail = memo(() => {
|
||||
}}
|
||||
>
|
||||
<TestCaseTable
|
||||
datasetEvalMode={dataset?.evalMode}
|
||||
diffFilter={diffFilter}
|
||||
pagination={pagination}
|
||||
search={search}
|
||||
|
||||
@@ -238,6 +238,7 @@ const DatasetCard = memo<DatasetCardProps>(
|
||||
) : (
|
||||
<TestCaseTable
|
||||
readOnly
|
||||
datasetEvalMode={dataset.evalMode}
|
||||
diffFilter={diffFilter}
|
||||
pagination={pagination}
|
||||
search={search}
|
||||
|
||||
@@ -83,6 +83,7 @@ const styles = createStaticStyles(({ css, cssVar }) => ({
|
||||
}));
|
||||
|
||||
interface TestCaseTableProps {
|
||||
datasetEvalMode?: string | null;
|
||||
diffFilter: 'all' | 'easy' | 'medium' | 'hard';
|
||||
onAddCase?: () => void;
|
||||
onDelete?: (testCase: any) => void;
|
||||
@@ -106,6 +107,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
|
||||
total,
|
||||
search,
|
||||
diffFilter,
|
||||
datasetEvalMode,
|
||||
pagination,
|
||||
onSearchChange,
|
||||
onDiffFilterChange,
|
||||
@@ -170,10 +172,18 @@ const TestCaseTable = memo<TestCaseTableProps>(
|
||||
dataIndex: 'evalMode',
|
||||
key: 'evalMode',
|
||||
render: (text: string) => {
|
||||
if (!text) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
|
||||
const effective = text ?? datasetEvalMode;
|
||||
if (!effective) return <span style={{ color: cssVar.colorTextQuaternary }}>-</span>;
|
||||
const isInherited = !text && !!datasetEvalMode;
|
||||
return (
|
||||
<span style={{ color: cssVar.colorTextSecondary, fontSize: 12 }}>
|
||||
{t(`evalMode.${text}` as any)}
|
||||
<span
|
||||
style={{
|
||||
color: isInherited ? cssVar.colorTextQuaternary : cssVar.colorTextSecondary,
|
||||
fontSize: 12,
|
||||
fontStyle: isInherited ? 'italic' : 'normal',
|
||||
}}
|
||||
>
|
||||
{t(`evalMode.${effective}` as any)}
|
||||
</span>
|
||||
);
|
||||
},
|
||||
@@ -238,7 +248,7 @@ const TestCaseTable = memo<TestCaseTableProps>(
|
||||
}
|
||||
|
||||
return base;
|
||||
}, [pagination, readOnly, onEdit, onDelete, t]);
|
||||
}, [pagination, readOnly, onEdit, onDelete, t, datasetEvalMode]);
|
||||
|
||||
return (
|
||||
<>
|
||||
|
||||
@@ -67,6 +67,8 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
|
||||
const { t } = useTranslation('eval');
|
||||
const status: string | null | undefined = record.status;
|
||||
|
||||
// return <div>{status}</div>;
|
||||
|
||||
if (!status || status === 'pending')
|
||||
return <Badge status="default" text={<BadgeText>{t('run.status.pending')}</BadgeText>} />;
|
||||
|
||||
@@ -86,6 +88,17 @@ const StatusBadge = memo<{ record: any }>(({ record }) => {
|
||||
if (status === 'timeout')
|
||||
return <Badge color="orange" text={<BadgeText>{t('run.status.timeout')}</BadgeText>} />;
|
||||
|
||||
if (status === 'external') {
|
||||
const badge = <Badge color="purple" text={<BadgeText>{t('run.status.external')}</BadgeText>} />;
|
||||
return <Tooltip title={t('run.status.external.tooltip')}>{badge}</Tooltip>;
|
||||
}
|
||||
|
||||
if (status === 'completed') {
|
||||
// 完成代表运行完成 + 评测完成,不代表结果一定通过
|
||||
const badge = <Badge color="blue" text={<BadgeText>{t('run.status.completed')}</BadgeText>} />;
|
||||
return <Tooltip title={t('run.status.completed.tooltip')}>{badge}</Tooltip>;
|
||||
}
|
||||
|
||||
return <Badge status="default" text={<BadgeText>{status}</BadgeText>} />;
|
||||
});
|
||||
|
||||
@@ -99,15 +112,29 @@ const ThreadDots = memo<{ threads: EvalThreadResult[] }>(({ threads }) => (
|
||||
|
||||
if (thread.passed === true) {
|
||||
color = cssVar.colorSuccess;
|
||||
} else if (thread.passed === false) {
|
||||
color = cssVar.colorError;
|
||||
}
|
||||
|
||||
if (thread.status === 'external') {
|
||||
color = cssVar.colorWarning;
|
||||
}
|
||||
|
||||
if (thread.status === 'completed') {
|
||||
color = cssVar.colorPrimary;
|
||||
}
|
||||
|
||||
const label = thread.error
|
||||
? 'error'
|
||||
: thread.passed === true
|
||||
? 'passed'
|
||||
: thread.passed === false
|
||||
: thread.passed === false && thread.status !== 'completed'
|
||||
? 'failed'
|
||||
: 'pending';
|
||||
: thread.status === 'external'
|
||||
? 'Awaiting for external evaluation'
|
||||
: thread.status === 'completed'
|
||||
? 'completed'
|
||||
: 'pending';
|
||||
|
||||
return (
|
||||
<Tooltip key={thread.threadId} title={label}>
|
||||
@@ -406,6 +433,8 @@ const CaseResultsTable = memo<CaseResultsTableProps>(
|
||||
{ label: t('table.filter.error'), value: 'error' },
|
||||
{ label: t('table.filter.running'), value: 'running' },
|
||||
{ label: t('run.status.pending'), value: 'pending' },
|
||||
{ label: t('run.status.external'), value: 'external' },
|
||||
{ label: t('run.status.completed'), value: 'completed' },
|
||||
]}
|
||||
onChange={setStatusFilter}
|
||||
/>
|
||||
|
||||
@@ -96,7 +96,7 @@ const useStyles = createStyles(({ css, token }) => ({
|
||||
`,
|
||||
}));
|
||||
|
||||
const PendingState = memo(() => {
|
||||
const PendingState = memo(({ hint }: { hint?: string }) => {
|
||||
const { t } = useTranslation('eval');
|
||||
const { cx, styles } = useStyles();
|
||||
|
||||
@@ -119,7 +119,7 @@ const PendingState = memo(() => {
|
||||
<Icon icon={Clock} size={18} />
|
||||
</div>
|
||||
</div>
|
||||
<div className={styles.hint}>{t('run.pending.hint')}</div>
|
||||
<div className={styles.hint}>{hint}</div>
|
||||
</div>
|
||||
);
|
||||
});
|
||||
|
||||
@@ -2,10 +2,19 @@
|
||||
|
||||
import { AGENT_PROFILE_URL } from '@lobechat/const';
|
||||
import type { AgentEvalRunDetail } from '@lobechat/types';
|
||||
import { ActionIcon, Avatar, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
|
||||
import { ActionIcon, Avatar, copyToClipboard, Flexbox, Highlighter, Markdown } from '@lobehub/ui';
|
||||
import { App, Button, Card, Tag, Typography } from 'antd';
|
||||
import { createStyles } from 'antd-style';
|
||||
import { ArrowLeft, ChevronDown, ChevronUp, Pencil, Play, Square, Trash2 } from 'lucide-react';
|
||||
import {
|
||||
ArrowLeft,
|
||||
ChevronDown,
|
||||
ChevronUp,
|
||||
Copy,
|
||||
Pencil,
|
||||
Play,
|
||||
Square,
|
||||
Trash2,
|
||||
} from 'lucide-react';
|
||||
import { memo, useState } from 'react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import { Link, useNavigate } from 'react-router-dom';
|
||||
@@ -170,6 +179,14 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
|
||||
window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank');
|
||||
}
|
||||
};
|
||||
const handleCopyRunId = async () => {
|
||||
try {
|
||||
await copyToClipboard(run.id);
|
||||
message.success(t('run.detail.copyRunIdSuccess'));
|
||||
} catch {
|
||||
message.error(t('run.detail.copyRunIdFailed'));
|
||||
}
|
||||
};
|
||||
|
||||
const formatDate = (date?: Date | string) => {
|
||||
if (!date) return '';
|
||||
@@ -194,6 +211,12 @@ const RunHeader = memo<RunHeaderProps>(({ run, benchmarkId, hideStart }) => {
|
||||
<Typography.Title level={4} style={{ margin: 0 }}>
|
||||
{run.name || run.id.slice(0, 8)}
|
||||
</Typography.Title>
|
||||
<ActionIcon
|
||||
icon={Copy}
|
||||
size="small"
|
||||
title={t('run.detail.copyRunId')}
|
||||
onClick={handleCopyRunId}
|
||||
/>
|
||||
<StatusBadge status={run.status} />
|
||||
</Flexbox>
|
||||
{/* Meta info row */}
|
||||
|
||||
@@ -104,7 +104,9 @@ const RunDetail = memo(() => {
|
||||
{runDetail.status === 'running' ? (
|
||||
<RunningState />
|
||||
) : runDetail.status === 'pending' ? (
|
||||
<PendingState />
|
||||
<PendingState hint={t('run.pending.hint')} />
|
||||
) : runDetail.status === 'external' ? (
|
||||
<PendingState hint={t('run.external.hint')} />
|
||||
) : (
|
||||
<IdleState run={runDetail} />
|
||||
)}
|
||||
|
||||
@@ -36,6 +36,26 @@ export interface DatasetPreset {
|
||||
}
|
||||
|
||||
export const DATASET_PRESETS: Record<string, DatasetPreset> = {
|
||||
'browsecomp': {
|
||||
id: 'browsecomp',
|
||||
category: 'research',
|
||||
name: 'BrowseComp',
|
||||
description: 'Measuring the ability for agents to browse the web, comprises 1,266 questions.',
|
||||
icon: Globe,
|
||||
formatDescription: 'format: Topic (category/tags), Question (input), Answer (expected)',
|
||||
requiredFields: ['question', 'answer', 'problem_topic', 'canary'],
|
||||
optionalFields: [],
|
||||
fieldInference: {
|
||||
input: ['question'],
|
||||
expected: ['answer'],
|
||||
choices: [],
|
||||
category: ['problem_topic'],
|
||||
},
|
||||
validation: {
|
||||
requireExpected: true,
|
||||
expectedFormat: 'string',
|
||||
},
|
||||
},
|
||||
// === Deep Research / QA Category ===
|
||||
'browsecomp-zh': {
|
||||
id: 'browsecomp-zh',
|
||||
@@ -58,6 +78,129 @@ export const DATASET_PRESETS: Record<string, DatasetPreset> = {
|
||||
},
|
||||
},
|
||||
|
||||
'widesearch': {
|
||||
id: 'widesearch',
|
||||
category: 'research',
|
||||
name: 'WideSearch',
|
||||
description:
|
||||
'Evaluating the capabilities of agents in broad information-seeking tasks, consisting of 200 questions.',
|
||||
icon: Globe,
|
||||
formatDescription: 'format: instance_id, query (input), evaluation (expected), language',
|
||||
requiredFields: ['instance_id', 'query', 'evaluation', 'language'],
|
||||
optionalFields: [],
|
||||
fieldInference: {
|
||||
input: ['query'],
|
||||
expected: ['evaluation'],
|
||||
choices: [],
|
||||
category: ['language'],
|
||||
sortOrder: [],
|
||||
},
|
||||
validation: {
|
||||
requireExpected: true,
|
||||
expectedFormat: 'string',
|
||||
},
|
||||
},
|
||||
|
||||
'hle-text': {
|
||||
id: 'hle-text',
|
||||
category: 'research',
|
||||
name: "Humanity's Last Exam, HLE (Text Only)",
|
||||
description:
|
||||
"Humanity's Last Exam (HLE) is a multi-modal benchmark at the frontier of human knowledge, consisting of 2150 questions.",
|
||||
icon: Globe,
|
||||
formatDescription:
|
||||
'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category',
|
||||
requiredFields: [
|
||||
'id',
|
||||
'question',
|
||||
'answer',
|
||||
'answer_type',
|
||||
'rationale',
|
||||
'raw_subject',
|
||||
'category',
|
||||
],
|
||||
optionalFields: ['canary'],
|
||||
fieldInference: {
|
||||
input: ['question'],
|
||||
expected: ['answer'],
|
||||
choices: [],
|
||||
category: ['category'],
|
||||
},
|
||||
},
|
||||
|
||||
'hle-verified': {
|
||||
id: 'hle-verified',
|
||||
category: 'research',
|
||||
name: "Humanity's Last Exam, HLE (Verified Answers)",
|
||||
description:
|
||||
"A subset of Humanity's Last Exam (HLE) with verified answers, designed to evaluate the ability to produce correct answers rather than just plausible ones.",
|
||||
icon: Globe,
|
||||
formatDescription:
|
||||
'format: id, question (input), answer (expected), answer_type, rationale, raw_subject, category, Verified_Classes',
|
||||
requiredFields: [
|
||||
'id',
|
||||
'question',
|
||||
'answer',
|
||||
'answer_type',
|
||||
'rationale',
|
||||
'raw_subject',
|
||||
'category',
|
||||
'Verified_Classes',
|
||||
],
|
||||
optionalFields: ['canary'],
|
||||
fieldInference: {
|
||||
input: ['question'],
|
||||
expected: ['answer'],
|
||||
choices: [],
|
||||
category: ['category'],
|
||||
},
|
||||
},
|
||||
|
||||
'deepsearchqa': {
|
||||
id: 'deepsearchqa',
|
||||
category: 'research',
|
||||
name: 'DeepSearchQA',
|
||||
description:
|
||||
'A 900-prompt factuality benchmark from Google DeepMind, designed to evaluate agents on difficult multi-step information-seeking tasks across 17 different fields.',
|
||||
icon: Globe,
|
||||
formatDescription: 'problem, problem_category, answer, answer_type',
|
||||
requiredFields: ['problem', 'answer', 'problem_category', 'answer_type'],
|
||||
optionalFields: [],
|
||||
fieldInference: {
|
||||
input: ['problem'],
|
||||
expected: ['answer'],
|
||||
choices: [],
|
||||
category: ['problem_category'],
|
||||
sortOrder: [],
|
||||
},
|
||||
validation: {
|
||||
requireExpected: true,
|
||||
expectedFormat: 'string',
|
||||
},
|
||||
},
|
||||
|
||||
'sealqa': {
|
||||
id: 'sealqa',
|
||||
category: 'research',
|
||||
name: 'SealQA',
|
||||
description:
|
||||
'SealQA is a new challenge benchmark for evaluating SEarch- Augmented Language models on fact-seeking questions where web search yields conflicting, noisy, or unhelpful results.',
|
||||
icon: Globe,
|
||||
formatDescription: 'format: question (input), answer (expected), topic (category)',
|
||||
requiredFields: ['question', 'answer', 'topic', 'canary'],
|
||||
optionalFields: [],
|
||||
fieldInference: {
|
||||
input: ['question'],
|
||||
expected: ['answer'],
|
||||
choices: [],
|
||||
category: ['topic'],
|
||||
},
|
||||
validation: {
|
||||
requireExpected: true,
|
||||
expectedFormat: 'string',
|
||||
},
|
||||
},
|
||||
|
||||
'xbench': {
|
||||
id: 'xbench',
|
||||
category: 'research',
|
||||
|
||||
@@ -157,6 +157,7 @@ const DatasetCreateModal = memo<DatasetCreateModalProps>(
|
||||
{ label: t('evalMode.equals'), value: 'equals' },
|
||||
{ label: t('evalMode.contains'), value: 'contains' },
|
||||
{ label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
|
||||
{ label: t('evalMode.external'), value: 'external' },
|
||||
]}
|
||||
/>
|
||||
</Form.Item>
|
||||
|
||||
@@ -131,14 +131,30 @@ const DatasetEditModal = memo<DatasetEditModalProps>(({ open, onCancel, dataset,
|
||||
{ label: t('evalMode.equals'), value: 'equals' },
|
||||
{ label: t('evalMode.contains'), value: 'contains' },
|
||||
{ label: t('evalMode.llm-rubric'), value: 'llm-rubric' },
|
||||
{ label: t('evalMode.answer-relevance'), value: 'answer-relevance' },
|
||||
{ label: t('evalMode.external'), value: 'external' },
|
||||
]}
|
||||
/>
|
||||
</Form.Item>
|
||||
|
||||
{evalModeValue === 'llm-rubric' && (
|
||||
<Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
|
||||
<TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
|
||||
</Form.Item>
|
||||
{(evalModeValue === 'llm-rubric' || evalModeValue === 'answer-relevance') && (
|
||||
<>
|
||||
<Form.Item initialValue="aihubmix" label={'Provider'} name={['evalConfig', 'provider']}>
|
||||
<TextArea placeholder={'LLM provider (e.g. openai, azure)'} rows={1} />
|
||||
</Form.Item>
|
||||
<Form.Item initialValue="gpt-5-nano" label={'Model'} name={['evalConfig', 'model']}>
|
||||
<TextArea placeholder={'LLM model to use for evaluation (e.g. gpt-4)'} rows={1} />
|
||||
</Form.Item>
|
||||
<Form.Item label={'System Prompt'} name={['evalConfig', 'systemRole']}>
|
||||
<TextArea placeholder={'Optional system prompt for the LLM judge'} rows={3} />
|
||||
</Form.Item>
|
||||
<Form.Item label={'Eval Prompt'} name={['evalConfig', 'criteria']}>
|
||||
<TextArea placeholder={'Prompt template for the LLM judge'} rows={3} />
|
||||
</Form.Item>
|
||||
<Form.Item label={t('evalMode.prompt.label')} name={['evalConfig', 'judgePrompt']}>
|
||||
<TextArea placeholder={t('evalMode.prompt.placeholder')} rows={3} />
|
||||
</Form.Item>
|
||||
</>
|
||||
)}
|
||||
|
||||
<Form.Item label={t('dataset.create.preset.label')} style={{ marginBottom: 0 }}>
|
||||
|
||||
@@ -92,6 +92,14 @@ const autoInferMapping = (
|
||||
? new Set(preset.fieldInference.sortOrder.map((s) => s.toLowerCase()))
|
||||
: SORT_ORDER_CANDIDATES;
|
||||
|
||||
const requiredCandidates = new Set<string>(
|
||||
preset ? preset.requiredFields.map((s) => s.toLowerCase()) : [],
|
||||
);
|
||||
|
||||
const optionalCandidates = new Set<string>(
|
||||
preset ? preset.optionalFields.map((s) => s.toLowerCase()) : [],
|
||||
);
|
||||
|
||||
for (const h of headers) {
|
||||
const lower = h.toLowerCase().trim();
|
||||
if (!inputFound && inputCandidates.has(lower)) {
|
||||
@@ -109,6 +117,10 @@ const autoInferMapping = (
|
||||
} else if (!sortOrderFound && sortOrderCandidates.has(lower)) {
|
||||
result[h] = 'sortOrder';
|
||||
sortOrderFound = true;
|
||||
} else if (requiredCandidates.has(lower) || optionalCandidates.has(lower)) {
|
||||
// If the field was claimed by the config but not matched by any candidate,
|
||||
// assign it to metadata to ensure it's not missed
|
||||
result[h] = 'metadata';
|
||||
} else {
|
||||
result[h] = 'ignore';
|
||||
}
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
|
||||
import { Icon } from '@lobehub/ui';
|
||||
import { createStaticStyles } from 'antd-style';
|
||||
import { Activity, CheckCircle2, Clock, Pause, XCircle } from 'lucide-react';
|
||||
import { Activity, CheckCircle2, Clock, Hourglass, Pause, XCircle } from 'lucide-react';
|
||||
import { memo } from 'react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
|
||||
const statusConfig: Record<string, { cls: string; icon: any }> = {
|
||||
aborted: { cls: 'default', icon: Pause },
|
||||
completed: { cls: 'success', icon: CheckCircle2 },
|
||||
external: { cls: 'warning', icon: Hourglass },
|
||||
failed: { cls: 'error', icon: XCircle },
|
||||
idle: { cls: 'default', icon: Clock },
|
||||
pending: { cls: 'warning', icon: Clock },
|
||||
|
||||
@@ -33,6 +33,7 @@ const rubricTypeSchema = z.enum([
|
||||
'similar',
|
||||
'levenshtein',
|
||||
'rubric',
|
||||
'external',
|
||||
]);
|
||||
|
||||
const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passthrough();
|
||||
@@ -621,7 +622,9 @@ export const agentEvalRouter = router({
|
||||
z.object({
|
||||
benchmarkId: z.string().optional(),
|
||||
datasetId: z.string().optional(),
|
||||
status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']).optional(),
|
||||
status: z
|
||||
.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted', 'external'])
|
||||
.optional(),
|
||||
limit: z.number().min(1).max(100).default(50).optional(),
|
||||
offset: z.number().min(0).default(0).optional(),
|
||||
}),
|
||||
@@ -871,7 +874,15 @@ export const agentEvalRouter = router({
|
||||
.input(
|
||||
z.object({
|
||||
id: z.string(),
|
||||
status: z.enum(['idle', 'pending', 'running', 'completed', 'failed', 'aborted']),
|
||||
status: z.enum([
|
||||
'idle',
|
||||
'pending',
|
||||
'running',
|
||||
'completed',
|
||||
'failed',
|
||||
'aborted',
|
||||
'external',
|
||||
]),
|
||||
}),
|
||||
)
|
||||
.mutation(async ({ input, ctx }) => {
|
||||
|
||||
514
src/server/routers/lambda/agentEvalExternal.ts
Normal file
514
src/server/routers/lambda/agentEvalExternal.ts
Normal file
@@ -0,0 +1,514 @@
|
||||
import type { EvalRunTopicResult, EvalThreadResult } from '@lobechat/types';
|
||||
import { TRPCError } from '@trpc/server';
|
||||
import { and, asc, eq, isNull } from 'drizzle-orm';
|
||||
import { z } from 'zod';
|
||||
|
||||
import {
|
||||
AgentEvalDatasetModel,
|
||||
AgentEvalRunModel,
|
||||
AgentEvalRunTopicModel,
|
||||
AgentEvalTestCaseModel,
|
||||
} from '@/database/models/agentEval';
|
||||
import { ThreadModel } from '@/database/models/thread';
|
||||
import { messages } from '@/database/schemas';
|
||||
import { authedProcedure, router } from '@/libs/trpc/lambda';
|
||||
import { serverDatabase } from '@/libs/trpc/lambda/middleware';
|
||||
import { AgentEvalRunService } from '@/server/services/agentEvalRun';
|
||||
|
||||
const runStatusSchema = z.enum([
|
||||
'idle',
|
||||
'pending',
|
||||
'running',
|
||||
'completed',
|
||||
'failed',
|
||||
'aborted',
|
||||
'external',
|
||||
]);
|
||||
|
||||
const reportResultItemSchema = z.object({
|
||||
correct: z.boolean(),
|
||||
result: z.record(z.unknown()).optional(),
|
||||
score: z.number(),
|
||||
threadId: z.string().optional(),
|
||||
topicId: z.string(),
|
||||
});
|
||||
|
||||
const toIsoString = (value?: Date | null) => (value ? value.toISOString() : undefined);
|
||||
|
||||
const agentEvalExternalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
|
||||
const { ctx } = opts;
|
||||
|
||||
return opts.next({
|
||||
ctx: {
|
||||
datasetModel: new AgentEvalDatasetModel(ctx.serverDB, ctx.userId),
|
||||
runModel: new AgentEvalRunModel(ctx.serverDB, ctx.userId),
|
||||
runService: new AgentEvalRunService(ctx.serverDB, ctx.userId),
|
||||
runTopicModel: new AgentEvalRunTopicModel(ctx.serverDB, ctx.userId),
|
||||
testCaseModel: new AgentEvalTestCaseModel(ctx.serverDB, ctx.userId),
|
||||
threadModel: new ThreadModel(ctx.serverDB, ctx.userId),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
type ReportResultInput = z.infer<typeof reportResultItemSchema> & { runId: string };
|
||||
|
||||
const recomputeRunAggregation = async (
|
||||
ctx: {
|
||||
runModel: AgentEvalRunModel;
|
||||
runService: AgentEvalRunService;
|
||||
runTopicModel: AgentEvalRunTopicModel;
|
||||
},
|
||||
runId: string,
|
||||
) => {
|
||||
const refreshedRun = await ctx.runModel.findById(runId);
|
||||
if (!refreshedRun) return undefined;
|
||||
|
||||
const refreshedTopics = await ctx.runTopicModel.findByRunId(runId);
|
||||
const metrics = await ctx.runService.evaluateAndFinalizeRun({
|
||||
run: {
|
||||
config: refreshedRun.config,
|
||||
id: refreshedRun.id,
|
||||
metrics: refreshedRun.metrics,
|
||||
startedAt: refreshedRun.startedAt,
|
||||
},
|
||||
runTopics: refreshedTopics,
|
||||
});
|
||||
|
||||
const hasAwaitingExternal = refreshedTopics.some(
|
||||
(topic) =>
|
||||
topic.status === 'external' ||
|
||||
(topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true,
|
||||
);
|
||||
const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
|
||||
const status = hasAwaitingExternal
|
||||
? 'external'
|
||||
: nonSuccessCases >= metrics.totalCases
|
||||
? 'failed'
|
||||
: 'completed';
|
||||
|
||||
await ctx.runModel.update(runId, { metrics, status });
|
||||
|
||||
return status;
|
||||
};
|
||||
|
||||
const applyReportResult = async (
|
||||
ctx: {
|
||||
runModel: AgentEvalRunModel;
|
||||
runTopicModel: AgentEvalRunTopicModel;
|
||||
runService: AgentEvalRunService;
|
||||
threadModel: ThreadModel;
|
||||
},
|
||||
input: ReportResultInput,
|
||||
recomputeRun: boolean,
|
||||
) => {
|
||||
const run = await ctx.runModel.findById(input.runId);
|
||||
if (!run) {
|
||||
throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
|
||||
}
|
||||
|
||||
const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
|
||||
const runTopic = runTopics.find((item) => item.topicId === input.topicId);
|
||||
if (!runTopic) {
|
||||
throw new TRPCError({ code: 'NOT_FOUND', message: 'Run topic not found' });
|
||||
}
|
||||
|
||||
const runK = run.config?.k ?? 1;
|
||||
const rubricScores = [{ rubricId: 'external', score: input.score }];
|
||||
const existingEvalResult = (runTopic.evalResult ?? {}) as EvalRunTopicResult &
|
||||
Record<string, unknown>;
|
||||
const externalResult = input.result ?? {};
|
||||
|
||||
let idempotent = false;
|
||||
let reportedThreads: number;
|
||||
let totalThreads: number;
|
||||
let topicFinalized: boolean;
|
||||
|
||||
if (runK > 1) {
|
||||
if (!input.threadId) {
|
||||
throw new TRPCError({
|
||||
code: 'BAD_REQUEST',
|
||||
message: 'threadId is required when k > 1',
|
||||
});
|
||||
}
|
||||
|
||||
const allThreads = await ctx.threadModel.queryByTopicId(input.topicId);
|
||||
const evalThreads = allThreads.filter((thread) => thread.type === 'eval');
|
||||
const sourceThreads = evalThreads.length > 0 ? evalThreads : allThreads;
|
||||
if (sourceThreads.length === 0) {
|
||||
throw new TRPCError({
|
||||
code: 'BAD_REQUEST',
|
||||
message: 'No threads found for this topic',
|
||||
});
|
||||
}
|
||||
|
||||
const threads: EvalThreadResult[] =
|
||||
(existingEvalResult.threads as EvalThreadResult[] | undefined)?.map((thread) => ({
|
||||
...thread,
|
||||
})) ??
|
||||
sourceThreads.map((thread) => ({
|
||||
status: 'external',
|
||||
threadId: thread.id,
|
||||
}));
|
||||
|
||||
let targetIndex = threads.findIndex((thread) => thread.threadId === input.threadId);
|
||||
if (targetIndex < 0) {
|
||||
const existsInTopic = sourceThreads.some((thread) => thread.id === input.threadId);
|
||||
if (!existsInTopic) {
|
||||
throw new TRPCError({
|
||||
code: 'NOT_FOUND',
|
||||
message: 'Thread not found for this topic',
|
||||
});
|
||||
}
|
||||
|
||||
threads.push({ status: 'external', threadId: input.threadId });
|
||||
targetIndex = threads.length - 1;
|
||||
}
|
||||
|
||||
totalThreads = threads.length;
|
||||
const targetThread = threads[targetIndex];
|
||||
const alreadyReported =
|
||||
targetThread.status === 'completed' &&
|
||||
targetThread.score === input.score &&
|
||||
targetThread.passed === input.correct;
|
||||
if (alreadyReported) {
|
||||
idempotent = true;
|
||||
} else {
|
||||
threads[targetIndex] = {
|
||||
...targetThread,
|
||||
passed: input.correct,
|
||||
rubricScores,
|
||||
score: input.score,
|
||||
status: 'completed',
|
||||
};
|
||||
|
||||
const existingThreadResults = (existingEvalResult.externalThreadResults ?? {}) as Record<
|
||||
string,
|
||||
unknown
|
||||
>;
|
||||
const nextEvalResult = {
|
||||
...existingEvalResult,
|
||||
awaitingExternalEval: true,
|
||||
externalThreadResults: {
|
||||
...existingThreadResults,
|
||||
[input.threadId]: externalResult,
|
||||
},
|
||||
threads,
|
||||
} satisfies EvalRunTopicResult & Record<string, unknown>;
|
||||
|
||||
await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
|
||||
evalResult: nextEvalResult,
|
||||
status: 'external',
|
||||
});
|
||||
}
|
||||
|
||||
reportedThreads = threads.filter(
|
||||
(thread) => thread.status === 'completed' && typeof thread.score === 'number',
|
||||
).length;
|
||||
topicFinalized = reportedThreads >= totalThreads;
|
||||
|
||||
if (topicFinalized) {
|
||||
const finalThreads = threads;
|
||||
const totalScore = finalThreads.reduce((acc, thread) => acc + (thread.score ?? 0), 0);
|
||||
const avgScore = totalScore / finalThreads.length;
|
||||
const passAtK = finalThreads.some((thread) => thread.passed === true);
|
||||
const passAllK = finalThreads.every((thread) => thread.passed === true);
|
||||
|
||||
const existingThreadResults = (existingEvalResult.externalThreadResults ?? {}) as Record<
|
||||
string,
|
||||
unknown
|
||||
>;
|
||||
const nextEvalResult = {
|
||||
...existingEvalResult,
|
||||
awaitingExternalEval: false,
|
||||
externalThreadResults: {
|
||||
...existingThreadResults,
|
||||
[input.threadId]: externalResult,
|
||||
},
|
||||
passAllK,
|
||||
passAtK,
|
||||
rubricScores: [{ rubricId: 'external', score: avgScore }],
|
||||
threads: finalThreads,
|
||||
} satisfies EvalRunTopicResult & Record<string, unknown>;
|
||||
|
||||
await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
|
||||
evalResult: nextEvalResult,
|
||||
passed: passAtK,
|
||||
score: avgScore,
|
||||
status: passAtK ? 'passed' : 'failed',
|
||||
});
|
||||
}
|
||||
} else {
|
||||
const alreadyReported =
|
||||
runTopic.status === (input.correct ? 'passed' : 'failed') &&
|
||||
runTopic.score === input.score &&
|
||||
runTopic.passed === input.correct;
|
||||
if (alreadyReported) {
|
||||
idempotent = true;
|
||||
} else {
|
||||
const nextEvalResult = {
|
||||
...existingEvalResult,
|
||||
awaitingExternalEval: false,
|
||||
externalResult,
|
||||
rubricScores,
|
||||
} satisfies EvalRunTopicResult & Record<string, unknown>;
|
||||
|
||||
await ctx.runTopicModel.updateByRunAndTopic(input.runId, input.topicId, {
|
||||
evalResult: nextEvalResult,
|
||||
passed: input.correct,
|
||||
score: input.score,
|
||||
status: input.correct ? 'passed' : 'failed',
|
||||
});
|
||||
}
|
||||
|
||||
reportedThreads = 1;
|
||||
totalThreads = 1;
|
||||
topicFinalized = true;
|
||||
}
|
||||
|
||||
let runStatus: string | undefined;
|
||||
if (recomputeRun) {
|
||||
runStatus = await recomputeRunAggregation(ctx, input.runId);
|
||||
}
|
||||
|
||||
return {
|
||||
idempotent,
|
||||
reportedThreads,
|
||||
runId: input.runId,
|
||||
runStatus,
|
||||
success: true,
|
||||
threadId: input.threadId,
|
||||
topicFinalized,
|
||||
topicId: input.topicId,
|
||||
totalThreads,
|
||||
};
|
||||
};
|
||||
|
||||
export const agentEvalExternalRouter = router({
|
||||
datasetGet: agentEvalExternalProcedure
|
||||
.input(z.object({ datasetId: z.string() }))
|
||||
.query(async ({ ctx, input }) => {
|
||||
const dataset = await ctx.datasetModel.findById(input.datasetId);
|
||||
if (!dataset) {
|
||||
throw new TRPCError({ code: 'NOT_FOUND', message: 'Dataset not found' });
|
||||
}
|
||||
|
||||
const metadata = (dataset.metadata ?? {}) as Record<string, unknown>;
|
||||
|
||||
return {
|
||||
benchmarkId: dataset.benchmarkId,
|
||||
id: dataset.id,
|
||||
identifier: dataset.identifier,
|
||||
metadata,
|
||||
name: dataset.name,
|
||||
};
|
||||
}),
|
||||
|
||||
messagesList: agentEvalExternalProcedure
|
||||
.input(z.object({ threadId: z.string().optional(), topicId: z.string() }))
|
||||
.query(async ({ ctx, input }) => {
|
||||
const conditions = [
|
||||
eq(messages.userId, ctx.userId),
|
||||
eq(messages.topicId, input.topicId),
|
||||
isNull(messages.messageGroupId),
|
||||
];
|
||||
if (input.threadId) conditions.push(eq(messages.threadId, input.threadId));
|
||||
|
||||
const rows = await ctx.serverDB
|
||||
.select({
|
||||
content: messages.content,
|
||||
createdAt: messages.createdAt,
|
||||
id: messages.id,
|
||||
role: messages.role,
|
||||
threadId: messages.threadId,
|
||||
topicId: messages.topicId,
|
||||
})
|
||||
.from(messages)
|
||||
.where(and(...conditions))
|
||||
.orderBy(asc(messages.createdAt));
|
||||
|
||||
return rows.map((row) => ({
|
||||
content: row.content,
|
||||
createdAt: toIsoString(row.createdAt),
|
||||
id: row.id,
|
||||
role: row.role,
|
||||
threadId: row.threadId,
|
||||
topicId: row.topicId,
|
||||
}));
|
||||
}),
|
||||
|
||||
reportResult: agentEvalExternalProcedure
|
||||
.input(
|
||||
z.object({
|
||||
correct: z.boolean(),
|
||||
result: z.record(z.unknown()).optional(),
|
||||
runId: z.string(),
|
||||
score: z.number(),
|
||||
threadId: z.string().optional(),
|
||||
topicId: z.string(),
|
||||
}),
|
||||
)
|
||||
.mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),
|
||||
|
||||
reportResultsBatch: agentEvalExternalProcedure
|
||||
.input(z.object({ items: z.array(reportResultItemSchema).min(1), runId: z.string() }))
|
||||
.mutation(async ({ ctx, input }) => {
|
||||
const receipts = [];
|
||||
|
||||
for (const item of input.items) {
|
||||
receipts.push(await applyReportResult(ctx, { ...item, runId: input.runId }, false));
|
||||
}
|
||||
|
||||
const runStatus = await recomputeRunAggregation(ctx, input.runId);
|
||||
|
||||
return {
|
||||
items: receipts,
|
||||
runId: input.runId,
|
||||
runStatus,
|
||||
success: true,
|
||||
};
|
||||
}),
|
||||
|
||||
runGet: agentEvalExternalProcedure
|
||||
.input(z.object({ runId: z.string() }))
|
||||
.query(async ({ ctx, input }) => {
|
||||
const run = await ctx.runModel.findById(input.runId);
|
||||
if (!run) {
|
||||
throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
|
||||
}
|
||||
const config = { ...run.config, k: run.config?.k ?? 1 };
|
||||
|
||||
return {
|
||||
config,
|
||||
createdAt: run.createdAt,
|
||||
datasetId: run.datasetId,
|
||||
id: run.id,
|
||||
metrics: run.metrics ?? undefined,
|
||||
name: run.name,
|
||||
startedAt: run.startedAt,
|
||||
status: run.status,
|
||||
targetAgentId: run.targetAgentId,
|
||||
};
|
||||
}),
|
||||
|
||||
runSetStatus: agentEvalExternalProcedure
|
||||
.input(z.object({ runId: z.string(), status: runStatusSchema }))
|
||||
.mutation(async ({ ctx, input }) => {
|
||||
const run = await ctx.runModel.findById(input.runId);
|
||||
if (!run) {
|
||||
throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
|
||||
}
|
||||
|
||||
if (input.status !== 'completed' && input.status !== 'external') {
|
||||
throw new TRPCError({
|
||||
code: 'BAD_REQUEST',
|
||||
message: 'External endpoint only supports setting status to completed or external',
|
||||
});
|
||||
}
|
||||
|
||||
if (run.status !== 'external' && run.status !== 'completed') {
|
||||
throw new TRPCError({
|
||||
code: 'BAD_REQUEST',
|
||||
message: `Only external runs can be finalized via this endpoint. current=${run.status}`,
|
||||
});
|
||||
}
|
||||
|
||||
if (input.status === 'completed') {
|
||||
const runTopics = await ctx.runTopicModel.findByRunId(input.runId);
|
||||
const hasAwaitingExternal = runTopics.some(
|
||||
(topic) =>
|
||||
topic.status === 'external' ||
|
||||
(topic.evalResult as Record<string, unknown> | null)?.awaitingExternalEval === true,
|
||||
);
|
||||
if (hasAwaitingExternal) {
|
||||
throw new TRPCError({
|
||||
code: 'BAD_REQUEST',
|
||||
message: 'Cannot set run to completed while external evaluation is pending',
|
||||
});
|
||||
}
|
||||
|
||||
const metrics = await ctx.runService.evaluateAndFinalizeRun({
|
||||
run: { config: run.config, id: run.id, metrics: run.metrics, startedAt: run.startedAt },
|
||||
runTopics,
|
||||
});
|
||||
const updated = await ctx.runModel.update(input.runId, { metrics, status: 'completed' });
|
||||
|
||||
return {
|
||||
metrics,
|
||||
runId: input.runId,
|
||||
status: updated?.status ?? 'completed',
|
||||
success: true,
|
||||
};
|
||||
}
|
||||
|
||||
const updated = await ctx.runModel.update(input.runId, { status: 'external' });
|
||||
|
||||
return {
|
||||
runId: input.runId,
|
||||
status: updated?.status ?? 'external',
|
||||
success: true,
|
||||
};
|
||||
}),
|
||||
|
||||
runTopicReportResult: agentEvalExternalProcedure
|
||||
.input(
|
||||
z.object({
|
||||
correct: z.boolean(),
|
||||
result: z.record(z.unknown()).optional(),
|
||||
runId: z.string(),
|
||||
score: z.number(),
|
||||
threadId: z.string().optional(),
|
||||
topicId: z.string(),
|
||||
}),
|
||||
)
|
||||
.mutation(async ({ ctx, input }) => applyReportResult(ctx, input, true)),
|
||||
|
||||
runTopicsList: agentEvalExternalProcedure
|
||||
.input(z.object({ onlyExternal: z.boolean().default(false).optional(), runId: z.string() }))
|
||||
.query(async ({ ctx, input }) => {
|
||||
const run = await ctx.runModel.findById(input.runId);
|
||||
if (!run) {
|
||||
throw new TRPCError({ code: 'NOT_FOUND', message: 'Run not found' });
|
||||
}
|
||||
|
||||
const allRunTopics = await ctx.runTopicModel.findByRunId(input.runId);
|
||||
const runTopics = input.onlyExternal
|
||||
? allRunTopics.filter((topic) => topic.status === 'external')
|
||||
: allRunTopics;
|
||||
|
||||
return runTopics.map((topic) => {
|
||||
const testCase = topic.testCase;
|
||||
|
||||
return {
|
||||
createdAt: topic.createdAt,
|
||||
evalResult: topic.evalResult,
|
||||
passed: topic.passed,
|
||||
runId: topic.runId,
|
||||
score: topic.score,
|
||||
status: topic.status,
|
||||
testCase,
|
||||
testCaseId: topic.testCaseId,
|
||||
topic: topic.topic,
|
||||
topicId: topic.topicId,
|
||||
};
|
||||
});
|
||||
}),
|
||||
|
||||
testCasesCount: agentEvalExternalProcedure
|
||||
.input(z.object({ datasetId: z.string() }))
|
||||
.query(async ({ ctx, input }) => {
|
||||
const count = await ctx.testCaseModel.countByDatasetId(input.datasetId);
|
||||
return { count };
|
||||
}),
|
||||
|
||||
threadsList: agentEvalExternalProcedure
|
||||
.input(z.object({ topicId: z.string() }))
|
||||
.query(async ({ ctx, input }) => {
|
||||
const threads = await ctx.threadModel.queryByTopicId(input.topicId);
|
||||
|
||||
return threads.map((thread) => ({
|
||||
id: thread.id,
|
||||
topicId: thread.topicId,
|
||||
type: thread.type,
|
||||
}));
|
||||
}),
|
||||
});
|
||||
@@ -12,6 +12,7 @@ import { agentRouter } from './agent';
|
||||
import { agentBotProviderRouter } from './agentBotProvider';
|
||||
import { agentCronJobRouter } from './agentCronJob';
|
||||
import { agentEvalRouter } from './agentEval';
|
||||
import { agentEvalExternalRouter } from './agentEvalExternal';
|
||||
import { agentGroupRouter } from './agentGroup';
|
||||
import { agentSkillsRouter } from './agentSkills';
|
||||
import { aiAgentRouter } from './aiAgent';
|
||||
@@ -57,6 +58,7 @@ export const lambdaRouter = router({
|
||||
agentBotProvider: agentBotProviderRouter,
|
||||
agentCronJob: agentCronJobRouter,
|
||||
agentEval: agentEvalRouter,
|
||||
agentEvalExternal: agentEvalExternalRouter,
|
||||
agentSkills: agentSkillsRouter,
|
||||
aiAgent: aiAgentRouter,
|
||||
aiChat: aiChatRouter,
|
||||
|
||||
@@ -512,6 +512,7 @@ export class AgentEvalRunService {
|
||||
const passedCases = allTopics.filter((t) => t.status === 'passed').length;
|
||||
const failedCases = allTopics.filter((t) => t.status === 'failed').length;
|
||||
const errorCases = allTopics.filter((t) => t.status === 'error').length;
|
||||
const externalCasesRT = allTopics.filter((t) => t.status === 'external').length;
|
||||
const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
|
||||
|
||||
let sumCost = 0;
|
||||
@@ -556,6 +557,7 @@ export class AgentEvalRunService {
|
||||
completedCases: completedCount,
|
||||
cost: sumCost ? roundCost(sumCost) : undefined,
|
||||
errorCases,
|
||||
externalCases: externalCasesRT || undefined,
|
||||
failedCases,
|
||||
llmCalls: sumLlmCalls || undefined,
|
||||
passedCases,
|
||||
@@ -667,6 +669,17 @@ export class AgentEvalRunService {
|
||||
const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
|
||||
const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
|
||||
|
||||
// ── External eval mode: agent finished, hand off to external scorer ──
|
||||
if (evalMode === 'external') {
|
||||
return {
|
||||
...baseMeta,
|
||||
awaitingExternalEval: true,
|
||||
passed: undefined,
|
||||
score: undefined,
|
||||
status: 'external',
|
||||
};
|
||||
}
|
||||
|
||||
let effectiveRubrics: EvalBenchmarkRubric[];
|
||||
if (evalMode) {
|
||||
effectiveRubrics = [
|
||||
@@ -722,6 +735,7 @@ export class AgentEvalRunService {
|
||||
passed?: boolean;
|
||||
rubricScores?: Array<{ reason?: string; rubricId: string; score: number }>;
|
||||
score?: number;
|
||||
status?: 'error' | 'external' | 'failed' | 'passed' | 'running' | 'timeout';
|
||||
steps?: number;
|
||||
threadId: string;
|
||||
tokens?: number;
|
||||
@@ -737,6 +751,14 @@ export class AgentEvalRunService {
|
||||
passed: meta.passed as boolean | undefined,
|
||||
rubricScores: meta.rubricScores as any,
|
||||
score: meta.score as number | undefined,
|
||||
status: meta.status as
|
||||
| 'error'
|
||||
| 'external'
|
||||
| 'failed'
|
||||
| 'passed'
|
||||
| 'running'
|
||||
| 'timeout'
|
||||
| undefined,
|
||||
steps: meta.steps as number | undefined,
|
||||
threadId: t.id,
|
||||
tokens: meta.tokens as number | undefined,
|
||||
@@ -744,6 +766,20 @@ export class AgentEvalRunService {
|
||||
};
|
||||
});
|
||||
|
||||
// ── External eval mode: if all threads await external scoring, propagate that status ──
|
||||
const allExternal = threadResults.every((t) => t.status === 'external');
|
||||
if (allExternal) {
|
||||
await this.runTopicModel.updateByRunAndTopic(runId, topicId, {
|
||||
evalResult: {
|
||||
awaitingExternalEval: true,
|
||||
completionReason: 'external',
|
||||
threads: threadResults,
|
||||
} satisfies EvalRunTopicResult,
|
||||
status: 'external',
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// pass@k: at least one thread passed
|
||||
const anyPassed = threadResults.some((t) => t.passed === true);
|
||||
// pass^k: all threads passed
|
||||
@@ -888,7 +924,7 @@ export class AgentEvalRunService {
|
||||
if (runTopic) {
|
||||
// Skip if topic is already in a terminal state (e.g. timeout marked by checkAndHandleRunTimeout).
|
||||
// The interrupted agent still fires the completion webhook, but we must not overwrite the result.
|
||||
const terminalStates = ['passed', 'failed', 'error', 'timeout'];
|
||||
const terminalStates = ['passed', 'failed', 'error', 'timeout', 'external'];
|
||||
if (runTopic.status && terminalStates.includes(runTopic.status)) {
|
||||
// Fall through to progress tracking below without modifying this topic
|
||||
} else {
|
||||
@@ -945,11 +981,15 @@ export class AgentEvalRunService {
|
||||
// Aggregate real-time metrics from all RunTopics
|
||||
const allTopics = await this.runTopicModel.findByRunId(runId);
|
||||
const completedCount = allTopics.filter(
|
||||
(t) => (t.evalResult && 'completionReason' in t.evalResult) || t.status === 'timeout',
|
||||
(t) =>
|
||||
(t.evalResult && 'completionReason' in t.evalResult) ||
|
||||
t.status === 'timeout' ||
|
||||
t.status === 'external',
|
||||
).length;
|
||||
const passedCases = allTopics.filter((t) => t.status === 'passed').length;
|
||||
const failedCases = allTopics.filter((t) => t.status === 'failed').length;
|
||||
const errorCases = allTopics.filter((t) => t.status === 'error').length;
|
||||
const externalCasesTraj = allTopics.filter((t) => t.status === 'external').length;
|
||||
const timeoutCases = allTopics.filter((t) => t.status === 'timeout').length;
|
||||
|
||||
let sumCost = 0;
|
||||
@@ -995,6 +1035,7 @@ export class AgentEvalRunService {
|
||||
completedCases: completedCount,
|
||||
cost: sumCost ? roundCost(sumCost) : undefined,
|
||||
errorCases,
|
||||
externalCases: externalCasesTraj || undefined,
|
||||
failedCases,
|
||||
llmCalls: sumLlmCalls || undefined,
|
||||
passedCases,
|
||||
@@ -1048,6 +1089,7 @@ export class AgentEvalRunService {
|
||||
let passedCases = 0;
|
||||
let failedCases = 0;
|
||||
let errorCases = 0;
|
||||
let externalCases = 0;
|
||||
let timeoutCases = 0;
|
||||
let totalScore = 0;
|
||||
// Sum of per-case averages (for per-case display)
|
||||
@@ -1088,19 +1130,27 @@ export class AgentEvalRunService {
|
||||
failedCases++;
|
||||
} else if (runTopic.status === 'error') {
|
||||
errorCases++;
|
||||
} else if (runTopic.status === 'external') {
|
||||
externalCases++;
|
||||
} else if (runTopic.status === 'timeout') {
|
||||
timeoutCases++;
|
||||
}
|
||||
|
||||
// Only accumulate scores for evaluated (non-error, non-timeout) cases
|
||||
if (runTopic.status !== 'error' && runTopic.status !== 'timeout' && runTopic.score != null) {
|
||||
totalScore += runTopic.score;
|
||||
}
|
||||
|
||||
// Accumulate per-rubric scores from existing evalResult (exclude error/timeout cases)
|
||||
// Only accumulate scores for evaluated (non-error, non-timeout, non-external) cases
|
||||
if (
|
||||
runTopic.status !== 'error' &&
|
||||
runTopic.status !== 'timeout' &&
|
||||
runTopic.status !== 'external' &&
|
||||
runTopic.score != null
|
||||
) {
|
||||
totalScore += runTopic.score;
|
||||
}
|
||||
|
||||
// Accumulate per-rubric scores from existing evalResult (exclude error/timeout/external cases)
|
||||
if (
|
||||
runTopic.status !== 'error' &&
|
||||
runTopic.status !== 'timeout' &&
|
||||
runTopic.status !== 'external' &&
|
||||
existingResult?.rubricScores
|
||||
) {
|
||||
for (const rs of existingResult.rubricScores) {
|
||||
@@ -1138,6 +1188,7 @@ export class AgentEvalRunService {
|
||||
cost: sumCost ? roundCost(sumCost) : undefined,
|
||||
duration: wallClockDuration || undefined,
|
||||
errorCases,
|
||||
externalCases: externalCases || undefined,
|
||||
failedCases,
|
||||
llmCalls: sumLlmCalls || undefined,
|
||||
passRate: totalCases > 0 ? passedCases / totalCases : 0,
|
||||
@@ -1216,6 +1267,15 @@ export class AgentEvalRunService {
|
||||
const evalMode = (testCase.evalMode ?? dataset.evalMode) as RubricType | null | undefined;
|
||||
const evalConfig = testCase.evalConfig ?? dataset.evalConfig;
|
||||
|
||||
// ── External eval mode: agent finished, hand off to external scorer ──
|
||||
if (evalMode === 'external') {
|
||||
await this.runTopicModel.updateByRunAndTopic(runTopic.runId, runTopic.topicId, {
|
||||
evalResult: { ...existingResult, awaitingExternalEval: true },
|
||||
status: 'external',
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
let effectiveRubrics: EvalBenchmarkRubric[];
|
||||
if (evalMode) {
|
||||
effectiveRubrics = [
|
||||
@@ -1324,7 +1384,13 @@ export class AgentEvalRunService {
|
||||
});
|
||||
|
||||
const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0);
|
||||
const runStatus = nonSuccessCases >= metrics.totalCases ? 'failed' : 'completed';
|
||||
const externalCount = metrics.externalCases || 0;
|
||||
const runStatus =
|
||||
externalCount > 0
|
||||
? 'external'
|
||||
: nonSuccessCases >= metrics.totalCases
|
||||
? 'failed'
|
||||
: 'completed';
|
||||
|
||||
await this.runModel.update(run.id, { metrics, status: runStatus });
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user