mirror of
https://github.com/lobehub/lobehub.git
synced 2026-03-26 13:19:34 +07:00
✨ feat(eval): add external scoring mode (#12729)
* wip: add llm relevant & BrowseComp * wip: add widesearch desc * wip: dsqa, hle, widesearch * wip: add dsqa * wip: add awaiting eval status for runs * wip: add awaiting status for run * wip: adjust hle-verified * 🐛 fix: browsecomp topics * 📝 docs: add annotations * wip: add awaiting status for pass@k * wip: add complete status * wip: update thread dots * wip: update run status page * wip: remove useless impl * wip: update prompt * ✨ feat: add external eval routes * wip: add eval cli * 🐛 fix: support authorize in no browser environment * wip: pass tests * ♻️ refactor: remove tests * ♻️ refactor: mo camel case
This commit is contained in:
285
apps/cli/src/commands/eval.test.ts
Normal file
285
apps/cli/src/commands/eval.test.ts
Normal file
@@ -0,0 +1,285 @@
|
||||
import { Command } from 'commander';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
/**
 * Stand-in for the tRPC client. Only the `agentEvalExternal` procedures that the
 * eval command calls are present; each one exposes a single `vi.fn()` under
 * `query` or `mutate`. Built inside `vi.hoisted` so the `vi.mock` factories in
 * this file can reference it.
 */
const { mockTrpcClient } = vi.hoisted(() => {
  const queryStub = () => ({ query: vi.fn() });
  const mutateStub = () => ({ mutate: vi.fn() });

  return {
    mockTrpcClient: {
      agentEvalExternal: {
        datasetGet: queryStub(),
        messagesList: queryStub(),
        runGet: queryStub(),
        runSetStatus: mutateStub(),
        runTopicReportResult: mutateStub(),
        runTopicsList: queryStub(),
        testCasesCount: queryStub(),
        threadsList: queryStub(),
      },
    },
  };
});

/** Replacement for `getTrpcClient`; each test decides what it resolves to. */
const { getTrpcClientMock } = vi.hoisted(() => {
  return { getTrpcClientMock: vi.fn() };
});
|
||||
|
||||
// Hand the command the hoisted client factory instead of the real one.
vi.mock('../api/client', () => {
  return { getTrpcClient: getTrpcClientMock };
});

// Replace the logger with spies so tests can assert on what was logged.
vi.mock('../utils/logger', () => {
  const log = {
    debug: vi.fn(),
    error: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
  };

  return { log, setVerbose: vi.fn() };
});
|
||||
|
||||
// eslint-disable-next-line import-x/first
|
||||
import { log } from '../utils/logger';
|
||||
// eslint-disable-next-line import-x/first
|
||||
import { registerEvalCommand } from './eval';
|
||||
|
||||
describe('eval command', () => {
  let exitSpy: ReturnType<typeof vi.spyOn>;
  let logSpy: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    getTrpcClientMock.mockResolvedValue(mockTrpcClient);
    // Failure paths call process.exit(1); stub it so the test process keeps running.
    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

    // Clear implementations and call history on every procedure stub.
    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
      for (const fn of Object.values(method)) {
        (fn as ReturnType<typeof vi.fn>).mockReset();
      }
    }
  });

  afterEach(() => {
    exitSpy.mockRestore();
    logSpy.mockRestore();
    vi.clearAllMocks();
  });

  /** Builds a fresh commander program with only the eval command registered. */
  const createProgram = () => {
    const program = new Command();
    program.exitOverride();
    registerEvalCommand(program);
    return program;
  };

  /** Parses `eval <args...>` on a fresh program, prefixed by the two leading argv entries. */
  const runEval = (...args: string[]) =>
    createProgram().parseAsync(['node', 'test', 'eval', ...args]);

  /** Decodes the first argument of the first `console.log` call as JSON. */
  const firstLoggedJson = () => JSON.parse(logSpy.mock.calls[0][0]);

  it('should call runGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
      config: { k: 1 },
      datasetId: 'dataset-1',
      id: 'run-1',
    });

    await runEval('run', 'get', '--run-id', 'run-1', '--json');

    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });
    expect(firstLoggedJson()).toEqual({
      data: {
        config: { k: 1 },
        datasetId: 'dataset-1',
        id: 'run-1',
      },
      error: null,
      ok: true,
      version: 'v1',
    });
  });

  it('should call datasetGet and output json envelope', async () => {
    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
      id: 'dataset-1',
      metadata: { preset: 'deepsearchqa' },
    });

    await runEval('dataset', 'get', '--dataset-id', 'dataset-1', '--json');

    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
    // The test name promises the envelope as well, so check what was printed.
    expect(firstLoggedJson()).toEqual({
      data: {
        id: 'dataset-1',
        metadata: { preset: 'deepsearchqa' },
      },
      error: null,
      ok: true,
      version: 'v1',
    });
  });

  it('should pass onlyExternal to runTopicsList', async () => {
    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);

    await runEval('run-topics', 'list', '--run-id', 'run-1', '--only-external', '--json');

    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
      onlyExternal: true,
      runId: 'run-1',
    });
  });

  it('should pass topicId and threadId to messagesList', async () => {
    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);

    await runEval(
      'messages',
      'list',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--json',
    );

    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should parse and report run-topic result', async () => {
    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
      success: true,
    });

    await runEval(
      'run-topic',
      'report-result',
      '--run-id',
      'run-1',
      '--topic-id',
      'topic-1',
      '--thread-id',
      'thread-1',
      '--score',
      '0.91',
      '--correct',
      'true',
      '--result-json',
      '{"grade":"A"}',
      '--json',
    );

    // Score, correct and result-json arrive as strings and must be converted.
    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
      correct: true,
      result: { grade: 'A' },
      runId: 'run-1',
      score: 0.91,
      threadId: 'thread-1',
      topicId: 'topic-1',
    });
  });

  it('should update run status', async () => {
    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
      runId: 'run-1',
      status: 'completed',
      success: true,
    });

    await runEval('run', 'set-status', '--run-id', 'run-1', '--status', 'completed');

    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
      runId: 'run-1',
      status: 'completed',
    });
    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
  });

  it('should output json error envelope when command fails', async () => {
    const error = Object.assign(new Error('Run not found'), {
      data: { code: 'NOT_FOUND' },
    });
    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);

    await runEval('run', 'get', '--run-id', 'run-404', '--json');

    expect(firstLoggedJson()).toEqual({
      data: null,
      error: { code: 'NOT_FOUND', message: 'Run not found' },
      ok: false,
      version: 'v1',
    });
    expect(exitSpy).toHaveBeenCalledWith(1);
  });

  it('should query test case count', async () => {
    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });

    await runEval('test-cases', 'count', '--dataset-id', 'dataset-1', '--json');

    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
      datasetId: 'dataset-1',
    });
  });

  it('should log plain error without --json', async () => {
    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));

    await runEval('threads', 'list', '--topic-id', 'topic-1');

    expect(log.error).toHaveBeenCalledWith('boom');
    expect(exitSpy).toHaveBeenCalledWith(1);
  });
});
|
||||
326
apps/cli/src/commands/eval.ts
Normal file
326
apps/cli/src/commands/eval.ts
Normal file
@@ -0,0 +1,326 @@
|
||||
import type { Command } from 'commander';
|
||||
import { InvalidArgumentError } from 'commander';
|
||||
import pc from 'picocolors';
|
||||
|
||||
import { getTrpcClient } from '../api/client';
|
||||
import { log } from '../utils/logger';
|
||||
|
||||
/** Version tag stamped on every JSON envelope this command prints. */
const JSON_VERSION = 'v1' as const;

/** Error details carried by a failed envelope. */
interface JsonError {
  /** Machine-readable error code, when the failure supplied one. */
  code?: string;
  /** Human-readable description of the failure. */
  message: string;
}

/** Envelope printed when a command succeeds. */
interface JsonSuccessEnvelope<T> {
  data: T;
  error: null;
  ok: true;
  version: typeof JSON_VERSION;
}

/** Envelope printed when a command fails. */
interface JsonFailureEnvelope {
  data: null;
  error: JsonError;
  ok: false;
  version: typeof JSON_VERSION;
}

/**
 * Shape of the `--json` output. Discriminated on `ok`, so a payload cannot claim
 * success while carrying an error, nor claim failure while carrying data.
 */
type JsonEnvelope<T> = JsonSuccessEnvelope<T> | JsonFailureEnvelope;

/** Options shared by every eval subcommand. */
interface JsonOption {
  /** Set by the `--json` flag; switches output to the JSON envelope. */
  json?: boolean;
}
|
||||
|
||||
/** Parsed options of `eval run get`. */
interface RunGetOptions extends JsonOption {
  /** Value of `--run-id`. */
  runId: string;
}

/** Parsed options of `eval run set-status`. */
interface RunSetStatusOptions extends JsonOption {
  /** Value of `--run-id`. */
  runId: string;
  /** Value of `--status`, already validated by the option parser. */
  status: 'completed' | 'external';
}

/** Parsed options of `eval dataset get`. */
interface DatasetGetOptions extends JsonOption {
  /** Value of `--dataset-id`. */
  datasetId: string;
}

/** Parsed options of `eval run-topics list`. */
interface RunTopicsListOptions extends JsonOption {
  /** Present when `--only-external` was passed. */
  onlyExternal?: boolean;
  /** Value of `--run-id`. */
  runId: string;
}

/** Parsed options of `eval threads list`. */
interface ThreadsListOptions extends JsonOption {
  /** Value of `--topic-id`. */
  topicId: string;
}

/** Parsed options of `eval messages list`. */
interface MessagesListOptions extends JsonOption {
  /** Value of the optional `--thread-id`. */
  threadId?: string;
  /** Value of `--topic-id`. */
  topicId: string;
}

/** Parsed options of `eval test-cases count`. */
interface TestCasesCountOptions extends JsonOption {
  /** Value of `--dataset-id`. */
  datasetId: string;
}

/** Parsed options of `eval run-topic report-result`. */
interface RunTopicReportResultOptions extends JsonOption {
  /** Value of `--correct`, converted to a boolean by the option parser. */
  correct: boolean;
  /** Value of `--result-json`, parsed into a JSON object by the option parser. */
  resultJson: Record<string, unknown>;
  /** Value of `--run-id`. */
  runId: string;
  /** Value of `--score`, converted to a number by the option parser. */
  score: number;
  /** Value of the optional `--thread-id`. */
  threadId?: string;
  /** Value of `--topic-id`. */
  topicId: string;
}
|
||||
|
||||
/** Writes `data` to stdout as JSON indented by two spaces. */
const printJson = (data: unknown) => {
  const serialized = JSON.stringify(data, null, 2);
  console.log(serialized);
};
|
||||
|
||||
/**
 * Prints `data` wrapped in a success envelope.
 *
 * The keys are listed in the order they are serialised, so keep them as they are.
 */
const outputJsonSuccess = (data: unknown) => {
  const envelope: JsonEnvelope<unknown> = { data, error: null, ok: true, version: JSON_VERSION };

  printJson(envelope);
};
|
||||
|
||||
/**
 * Type guard for "non-null object". Arrays also pass, so callers that need a
 * plain object must exclude them separately.
 */
const isRecord = (value: unknown): value is Record<string, unknown> => {
  if (value === null) return false;

  return typeof value === 'object';
};
|
||||
|
||||
/**
 * Normalises anything that was thrown into a `JsonError`.
 *
 * The code is looked up in `data.code` first and then in a top-level `code`,
 * for `Error` instances and plain objects alike. Only string codes are kept.
 *
 * @param error - The caught value; may be an `Error`, an object, or a primitive.
 * @returns `{ code, message }` for objects, where `message` falls back to
 *   `'Unknown error'` if it is not a string; `{ message: String(error) }` for
 *   everything else.
 */
const toJsonError = (error: unknown): JsonError => {
  if (!isRecord(error)) {
    return { message: String(error) };
  }

  const { data } = error;
  const nestedCode = isRecord(data) ? data.code : undefined;
  // Prefer the nested code; fall back to the one sitting directly on the error.
  const rawCode = typeof nestedCode === 'string' ? nestedCode : error.code;

  const code = typeof rawCode === 'string' ? rawCode : undefined;
  const message = typeof error.message === 'string' ? error.message : 'Unknown error';

  return { code, message };
};
|
||||
|
||||
/**
 * Reports a failed command and then calls `process.exit(1)`.
 *
 * @param error - Whatever the command threw or rejected with.
 * @param json - When true, a failure envelope is printed; otherwise only the
 *   message is sent to `log.error`.
 */
const handleCommandError = (error: unknown, json: boolean) => {
  const normalized = toJsonError(error);

  if (!json) {
    log.error(normalized.message);
  } else {
    const envelope: JsonEnvelope<null> = {
      data: null,
      error: normalized,
      ok: false,
      version: JSON_VERSION,
    };
    printJson(envelope);
  }

  process.exit(1);
};
|
||||
|
||||
/**
 * Option parser for `--score`.
 *
 * @param value - Raw command-line text.
 * @returns The value as a finite number.
 * @throws InvalidArgumentError when the text is blank or not a finite number.
 */
const parseScore = (value: string) => {
  // Number('') and Number('   ') are 0, so blank input has to be rejected explicitly.
  const score = value.trim() === '' ? Number.NaN : Number(value);

  if (!Number.isFinite(score)) {
    throw new InvalidArgumentError(`Invalid score: ${value}`);
  }

  return score;
};
|
||||
|
||||
/**
 * Option parser for boolean-valued flags such as `--correct`.
 *
 * Surrounding whitespace and letter case are ignored. `1`, `true` and `yes`
 * mean true; `0`, `false` and `no` mean false.
 *
 * @throws InvalidArgumentError for any other text.
 */
const parseBoolean = (value: string) => {
  switch (value.trim().toLowerCase()) {
    case '1':
    case 'true':
    case 'yes': {
      return true;
    }
    case '0':
    case 'false':
    case 'no': {
      return false;
    }
    default: {
      throw new InvalidArgumentError(`Invalid boolean value: ${value}`);
    }
  }
};
|
||||
|
||||
/**
 * Option parser for `--result-json`.
 *
 * @param value - Raw command-line text, expected to hold a JSON object.
 * @returns The parsed object.
 * @throws InvalidArgumentError when the text is not valid JSON, or when it
 *   parses to anything other than an object (arrays and `null` included).
 */
const parseResultJson = (value: string) => {
  let candidate: unknown;

  try {
    candidate = JSON.parse(value);
  } catch {
    throw new InvalidArgumentError('Invalid JSON value for --result-json');
  }

  if (isRecord(candidate) && !Array.isArray(candidate)) {
    return candidate;
  }

  throw new InvalidArgumentError('--result-json must be a JSON object');
};
|
||||
|
||||
/**
 * Option parser for `--status`.
 *
 * @returns `value` unchanged when it is `completed` or `external`.
 * @throws InvalidArgumentError for any other text; the match is exact, with no
 *   trimming or case folding.
 */
const parseRunStatus = (value: string) => {
  if (value === 'completed' || value === 'external') {
    return value;
  }

  throw new InvalidArgumentError("Only 'completed' and 'external' are supported");
};
|
||||
|
||||
/**
 * Runs one subcommand action and reports the outcome.
 *
 * On success exactly one thing is printed: the success envelope when
 * `options.json` is set, otherwise an `OK <successMessage>` line when a message
 * was supplied, otherwise the raw result as JSON. Anything thrown, by the
 * action or while printing, is passed to `handleCommandError`.
 *
 * @param options - Parsed command options; only `json` is read here.
 * @param action - Performs the actual work and resolves to the result.
 * @param successMessage - Optional text for the non-JSON success line.
 */
const executeCommand = async (
  options: JsonOption,
  action: () => Promise<unknown>,
  successMessage?: string,
) => {
  try {
    const result = await action();

    if (options.json) {
      outputJsonSuccess(result);
    } else if (successMessage) {
      console.log(`${pc.green('OK')} ${successMessage}`);
    } else {
      printJson(result);
    }
  } catch (failure) {
    handleCommandError(failure, Boolean(options.json));
  }
};
|
||||
|
||||
/**
 * Adds the `eval` command group to `program`.
 *
 * Subcommands registered: `run get`, `run set-status`, `dataset get`,
 * `run-topics list`, `threads list`, `messages list`, `test-cases count` and
 * `run-topic report-result`. Each action obtains the tRPC client, calls the
 * matching `agentEvalExternal` procedure, and leaves output and error handling
 * to `executeCommand`.
 *
 * @param program - The commander program (or parent command) to extend.
 */
export function registerEvalCommand(program: Command) {
  const evalCmd = program.command('eval').description('Manage external evaluation workflows');

  // eval run ...
  const runCmd = evalCmd.command('run').description('Manage evaluation runs');

  runCmd
    .command('get')
    .description('Get run information')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunGetOptions) => {
      await executeCommand(options, async () => {
        const { agentEvalExternal } = await getTrpcClient();
        return agentEvalExternal.runGet.query({ runId: options.runId });
      });
    });

  runCmd
    .command('set-status')
    .description('Set run status (external API supports completed or external)')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunSetStatusOptions) => {
      const successMessage = `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`;

      await executeCommand(
        options,
        async () => {
          const { agentEvalExternal } = await getTrpcClient();
          return agentEvalExternal.runSetStatus.mutate({
            runId: options.runId,
            status: options.status,
          });
        },
        successMessage,
      );
    });

  // eval dataset ...
  const datasetCmd = evalCmd.command('dataset').description('Manage evaluation datasets');

  datasetCmd
    .command('get')
    .description('Get dataset information')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: DatasetGetOptions) => {
      await executeCommand(options, async () => {
        const { agentEvalExternal } = await getTrpcClient();
        return agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
      });
    });

  // eval run-topics ...
  const runTopicsCmd = evalCmd.command('run-topics').description('Manage run topics');

  runTopicsCmd
    .command('list')
    .description('List topics in a run')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--only-external', 'Only return topics pending external evaluation')
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicsListOptions) => {
      await executeCommand(options, async () => {
        const { agentEvalExternal } = await getTrpcClient();
        return agentEvalExternal.runTopicsList.query({
          // The flag is undefined when omitted; always send an explicit boolean.
          onlyExternal: Boolean(options.onlyExternal),
          runId: options.runId,
        });
      });
    });

  // eval threads ...
  const threadsCmd = evalCmd.command('threads').description('Manage evaluation threads');

  threadsCmd
    .command('list')
    .description('List threads by topic')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: ThreadsListOptions) => {
      await executeCommand(options, async () => {
        const { agentEvalExternal } = await getTrpcClient();
        return agentEvalExternal.threadsList.query({ topicId: options.topicId });
      });
    });

  // eval messages ...
  const messagesCmd = evalCmd.command('messages').description('Manage evaluation messages');

  messagesCmd
    .command('list')
    .description('List messages by topic and optional thread')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: MessagesListOptions) => {
      await executeCommand(options, async () => {
        const { agentEvalExternal } = await getTrpcClient();
        return agentEvalExternal.messagesList.query({
          threadId: options.threadId,
          topicId: options.topicId,
        });
      });
    });

  // eval test-cases ...
  const testCasesCmd = evalCmd.command('test-cases').description('Manage evaluation test cases');

  testCasesCmd
    .command('count')
    .description('Count test cases by dataset')
    .requiredOption('--dataset-id <id>', 'Dataset ID')
    .option('--json', 'Output JSON envelope')
    .action(async (options: TestCasesCountOptions) => {
      await executeCommand(options, async () => {
        const { agentEvalExternal } = await getTrpcClient();
        return agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
      });
    });

  // eval run-topic ...
  const runTopicCmd = evalCmd
    .command('run-topic')
    .description('Manage evaluation run-topic reporting');

  runTopicCmd
    .command('report-result')
    .description('Report one evaluation result for a run topic')
    .requiredOption('--run-id <id>', 'Run ID')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--thread-id <id>', 'Thread ID (required for k > 1)')
    .requiredOption('--score <score>', 'Evaluation score', parseScore)
    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
    .option('--json', 'Output JSON envelope')
    .action(async (options: RunTopicReportResultOptions) => {
      const successMessage = `Reported result for topic ${pc.bold(options.topicId)}`;

      await executeCommand(
        options,
        async () => {
          const { agentEvalExternal } = await getTrpcClient();
          return agentEvalExternal.runTopicReportResult.mutate({
            correct: options.correct,
            result: options.resultJson,
            runId: options.runId,
            score: options.score,
            threadId: options.threadId,
            topicId: options.topicId,
          });
        },
        successMessage,
      );
    });
}
|
||||
@@ -7,6 +7,7 @@ import { registerDocCommand } from './commands/doc';
|
||||
import { registerFileCommand } from './commands/file';
|
||||
import { registerGenerateCommand } from './commands/generate';
|
||||
import { registerKbCommand } from './commands/kb';
|
||||
import { registerEvalCommand } from './commands/eval';
|
||||
import { registerLoginCommand } from './commands/login';
|
||||
import { registerLogoutCommand } from './commands/logout';
|
||||
import { registerMemoryCommand } from './commands/memory';
|
||||
@@ -44,5 +45,6 @@ registerModelCommand(program);
|
||||
registerProviderCommand(program);
|
||||
registerPluginCommand(program);
|
||||
registerConfigCommand(program);
|
||||
registerEvalCommand(program);
|
||||
|
||||
program.parse();
|
||||
|
||||
Reference in New Issue
Block a user