🔧 chore(eval): improve trajectory workflow controls and execution metadata (#13049)

* 🔧 chore(search): reduce Exa default result count

* 🐛 fix(eval): relax run input schema limits

* ✨ feat(agent): persist tool execution time in message metadata

* 🔧 chore(eval): add flow control to trajectory workflows

* 🧪 test: adjust Exa numResults expectation
This commit is contained in:
Rylan Cai
2026-03-18 10:29:49 +08:00
committed by GitHub
parent 3a789dc612
commit 69c24c714e
8 changed files with 109 additions and 6 deletions

View File

@@ -105,6 +105,7 @@ export const MessageMetadataSchema = ModelUsageSchema.merge(ModelPerformanceSche
reactions: z.array(EmojiReactionSchema).optional(),
scope: z.string().optional(),
subAgentId: z.string().optional(),
toolExecutionTimeMs: z.number().optional(),
});
export interface ModelUsage extends ModelTokensUsage {
@@ -193,5 +194,9 @@ export interface MessageMetadata extends ModelUsage, ModelPerformance {
taskTitle?: string;
// message content is multimodal, display content in the streaming, won't save to db
tempDisplayContent?: string;
/**
* Tool execution time for tool messages (ms)
*/
toolExecutionTimeMs?: number;
usage?: ModelUsage;
}

View File

@@ -112,7 +112,7 @@ export const { POST } = serve<RunAgentTrajectoryPayload>(
flowControl: {
key: 'agent-eval-run.run-agent-trajectory',
parallelism: 500,
ratePerSecond: 10,
ratePerSecond: 20,
},
qstashClient,
},

View File

@@ -98,7 +98,7 @@ export const { POST } = serve<RunThreadTrajectoryPayload>(
flowControl: {
key: 'agent-eval-run.run-thread-trajectory',
parallelism: 500,
ratePerSecond: 10,
ratePerSecond: 20,
},
qstashClient,
},

View File

@@ -666,6 +666,7 @@ export const createRuntimeExecutors = (
const toolMessage = await ctx.messageModel.create({
agentId: state.metadata!.agentId!,
content: executionResult.content,
metadata: { toolExecutionTimeMs: executionTime },
parentId: payload.parentMessageId,
plugin: chatToolPayload as any,
pluginError: executionResult.error,
@@ -882,6 +883,7 @@ export const createRuntimeExecutors = (
const toolMessage = await ctx.messageModel.create({
agentId: state.metadata!.agentId!,
content: executionResult.content,
metadata: { toolExecutionTimeMs: executionTime },
parentId: parentMessageId,
plugin: chatToolPayload as any,
pluginError: executionResult.error,

View File

@@ -836,6 +836,35 @@ describe('RuntimeExecutors', () => {
);
});
// Single `call_tool` run: the tool message handed to messageModel.create must
// carry the tool's execution time under metadata.toolExecutionTimeMs.
it('should persist tool execution time in metadata when creating tool message', async () => {
  const runtimeExecutors = createRuntimeExecutors(ctx);
  const runtimeState = createMockState();

  // The tool call the executor is asked to run.
  const crawlCall = {
    apiName: 'crawl',
    arguments: '{"url": "https://example.com"}',
    id: 'tool-call-2',
    identifier: 'web-browsing',
    type: 'default' as const,
  };
  const callToolInstruction = {
    payload: { parentMessageId: 'assistant-msg-456', toolCalling: crawlCall },
    type: 'call_tool' as const,
  };

  await runtimeExecutors.call_tool!(callToolInstruction, runtimeState);

  // NOTE(review): 100 is presumably the executionTime returned by the default
  // executeTool mock configured in the suite setup — confirm against beforeEach.
  const expectedMetadata = { toolExecutionTimeMs: 100 };
  expect(mockMessageModel.create).toHaveBeenCalledWith(
    expect.objectContaining({ metadata: expectedMetadata }),
  );
});
it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => {
// Setup: mock messageModel.create to return a specific tool message ID
const toolMessageId = 'tool-msg-789';
@@ -1553,6 +1582,69 @@ describe('RuntimeExecutors', () => {
expect(state.usage.tools.totalCalls).toBe(0);
});
// Batch `call_tools_batch` run: each created tool message must carry the
// execution time reported for its own tool call, in call order.
it('should persist execution time metadata for each tool message in batch execution', async () => {
  // Builds a successful executeTool result with the given content and timing.
  const successfulResult = (content: string, executionTime: number) => ({
    content,
    error: null,
    executionTime,
    state: {},
    success: true,
  });

  // Queue one result per tool call: the first resolves with 150ms, the second with 250ms.
  mockToolExecutionService.executeTool
    .mockResolvedValueOnce(successfulResult('Search result', 150))
    .mockResolvedValueOnce(successfulResult('Crawl result', 250));

  const runtimeExecutors = createRuntimeExecutors(ctx);
  const runtimeState = createMockState();

  const searchCall = {
    apiName: 'search',
    arguments: '{"query": "test"}',
    id: 'tool-call-1',
    identifier: 'web-search',
    type: 'default' as const,
  };
  const crawlCall = {
    apiName: 'crawl',
    arguments: '{"url": "https://example.com"}',
    id: 'tool-call-2',
    identifier: 'web-browsing',
    type: 'default' as const,
  };
  const batchInstruction = {
    payload: {
      parentMessageId: 'assistant-msg-123',
      toolsCalling: [searchCall, crawlCall],
    },
    type: 'call_tools_batch' as const,
  };

  await runtimeExecutors.call_tools_batch!(batchInstruction, runtimeState);

  // The Nth create() call must record the Nth queued execution time.
  [150, 250].forEach((expectedMs, index) => {
    expect(mockMessageModel.create).toHaveBeenNthCalledWith(
      index + 1,
      expect.objectContaining({
        metadata: {
          toolExecutionTimeMs: expectedMs,
        },
      }),
    );
  });
});
it('should pass toolResultMaxLength from agentConfig to executeTool', async () => {
const executors = createRuntimeExecutors(ctx);
const state = createMockState({

View File

@@ -40,9 +40,13 @@ const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passth
/**
 * Validation schema for the optional per-run eval configuration.
 *
 * Every field is optional; when present it must be a number within the stated
 * inclusive bounds:
 * - `k`: 1 to 10
 * - `maxConcurrency`: 1 to 20
 * - `maxSteps`: 1 to 1000
 * - `timeout`: 60_000 to 6 * 3_600_000
 *
 * Each key is declared exactly once. The earlier, stricter `maxConcurrency`
 * (max 10) and `timeout` (max 3_600_000) entries were duplicate keys that the
 * later declarations overrode, and duplicate property names in one object
 * literal are a TypeScript compile error.
 */
const evalRunInputConfigSchema = z.object({
  k: z.number().min(1).max(10).optional(),
  maxConcurrency: z.number().min(1).max(20).optional(),
  maxSteps: z.number().min(1).max(1000).optional(),
  // NOTE(review): presumably milliseconds (1 minute to 6 hours) — confirm against the run executor.
  timeout: z
    .number()
    .min(60_000)
    .max(6 * 3_600_000)
    .optional(),
});
const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {

View File

@@ -178,7 +178,7 @@ describe('ExaImpl', () => {
const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string);
expect(body.query).toBe('my search query');
expect(body.numResults).toBe(15);
expect(body.numResults).toBe(10);
expect(body.type).toBe('auto');
});

View File

@@ -31,7 +31,7 @@ export class ExaImpl implements SearchServiceImpl {
const endpoint = urlJoin(this.baseUrl, '/search');
const defaultQueryParams: ExaSearchParameters = {
numResults: 15,
numResults: 10,
query,
type: 'auto',
};