🔧 chore(eval): improve trajectory workflow controls and execution metadata (#13049)

* 🔧 chore(search): reduce Exa default result count

* 🐛 fix(eval): relax run input schema limits

* ✨ feat(agent): persist tool execution time in message metadata

* 🔧 chore(eval): add flow control to trajectory workflows

* 🧪 test: adjust Exa numResults expectation
This commit is contained in:
Rylan Cai
2026-03-18 10:29:49 +08:00
committed by GitHub
parent 3a789dc612
commit 69c24c714e
8 changed files with 109 additions and 6 deletions

View File

@@ -105,6 +105,7 @@ export const MessageMetadataSchema = ModelUsageSchema.merge(ModelPerformanceSche
reactions: z.array(EmojiReactionSchema).optional(), reactions: z.array(EmojiReactionSchema).optional(),
scope: z.string().optional(), scope: z.string().optional(),
subAgentId: z.string().optional(), subAgentId: z.string().optional(),
toolExecutionTimeMs: z.number().optional(),
}); });
export interface ModelUsage extends ModelTokensUsage { export interface ModelUsage extends ModelTokensUsage {
@@ -193,5 +194,9 @@ export interface MessageMetadata extends ModelUsage, ModelPerformance {
taskTitle?: string; taskTitle?: string;
// message content is multimodal, display content in the streaming, won't save to db // message content is multimodal, display content in the streaming, won't save to db
tempDisplayContent?: string; tempDisplayContent?: string;
/**
* Tool execution time for tool messages (ms)
*/
toolExecutionTimeMs?: number;
usage?: ModelUsage; usage?: ModelUsage;
} }

View File

@@ -112,7 +112,7 @@ export const { POST } = serve<RunAgentTrajectoryPayload>(
flowControl: { flowControl: {
key: 'agent-eval-run.run-agent-trajectory', key: 'agent-eval-run.run-agent-trajectory',
parallelism: 500, parallelism: 500,
ratePerSecond: 10, ratePerSecond: 20,
}, },
qstashClient, qstashClient,
}, },

View File

@@ -98,7 +98,7 @@ export const { POST } = serve<RunThreadTrajectoryPayload>(
flowControl: { flowControl: {
key: 'agent-eval-run.run-thread-trajectory', key: 'agent-eval-run.run-thread-trajectory',
parallelism: 500, parallelism: 500,
ratePerSecond: 10, ratePerSecond: 20,
}, },
qstashClient, qstashClient,
}, },

View File

@@ -666,6 +666,7 @@ export const createRuntimeExecutors = (
const toolMessage = await ctx.messageModel.create({ const toolMessage = await ctx.messageModel.create({
agentId: state.metadata!.agentId!, agentId: state.metadata!.agentId!,
content: executionResult.content, content: executionResult.content,
metadata: { toolExecutionTimeMs: executionTime },
parentId: payload.parentMessageId, parentId: payload.parentMessageId,
plugin: chatToolPayload as any, plugin: chatToolPayload as any,
pluginError: executionResult.error, pluginError: executionResult.error,
@@ -882,6 +883,7 @@ export const createRuntimeExecutors = (
const toolMessage = await ctx.messageModel.create({ const toolMessage = await ctx.messageModel.create({
agentId: state.metadata!.agentId!, agentId: state.metadata!.agentId!,
content: executionResult.content, content: executionResult.content,
metadata: { toolExecutionTimeMs: executionTime },
parentId: parentMessageId, parentId: parentMessageId,
plugin: chatToolPayload as any, plugin: chatToolPayload as any,
pluginError: executionResult.error, pluginError: executionResult.error,

View File

@@ -836,6 +836,35 @@ describe('RuntimeExecutors', () => {
); );
}); });
it('should persist tool execution time in metadata when creating tool message', async () => {
const executors = createRuntimeExecutors(ctx);
const state = createMockState();
const instruction = {
payload: {
parentMessageId: 'assistant-msg-456',
toolCalling: {
apiName: 'crawl',
arguments: '{"url": "https://example.com"}',
id: 'tool-call-2',
identifier: 'web-browsing',
type: 'default' as const,
},
},
type: 'call_tool' as const,
};
await executors.call_tool!(instruction, state);
expect(mockMessageModel.create).toHaveBeenCalledWith(
expect.objectContaining({
metadata: {
toolExecutionTimeMs: 100,
},
}),
);
});
it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => { it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => {
// Setup: mock messageModel.create to return a specific tool message ID // Setup: mock messageModel.create to return a specific tool message ID
const toolMessageId = 'tool-msg-789'; const toolMessageId = 'tool-msg-789';
@@ -1553,6 +1582,69 @@ describe('RuntimeExecutors', () => {
expect(state.usage.tools.totalCalls).toBe(0); expect(state.usage.tools.totalCalls).toBe(0);
}); });
it('should persist execution time metadata for each tool message in batch execution', async () => {
mockToolExecutionService.executeTool
.mockResolvedValueOnce({
content: 'Search result',
error: null,
executionTime: 150,
state: {},
success: true,
})
.mockResolvedValueOnce({
content: 'Crawl result',
error: null,
executionTime: 250,
state: {},
success: true,
});
const executors = createRuntimeExecutors(ctx);
const state = createMockState();
const instruction = {
payload: {
parentMessageId: 'assistant-msg-123',
toolsCalling: [
{
apiName: 'search',
arguments: '{"query": "test"}',
id: 'tool-call-1',
identifier: 'web-search',
type: 'default' as const,
},
{
apiName: 'crawl',
arguments: '{"url": "https://example.com"}',
id: 'tool-call-2',
identifier: 'web-browsing',
type: 'default' as const,
},
],
},
type: 'call_tools_batch' as const,
};
await executors.call_tools_batch!(instruction, state);
expect(mockMessageModel.create).toHaveBeenNthCalledWith(
1,
expect.objectContaining({
metadata: {
toolExecutionTimeMs: 150,
},
}),
);
expect(mockMessageModel.create).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
metadata: {
toolExecutionTimeMs: 250,
},
}),
);
});
it('should pass toolResultMaxLength from agentConfig to executeTool', async () => { it('should pass toolResultMaxLength from agentConfig to executeTool', async () => {
const executors = createRuntimeExecutors(ctx); const executors = createRuntimeExecutors(ctx);
const state = createMockState({ const state = createMockState({

View File

@@ -40,9 +40,13 @@ const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passth
const evalRunInputConfigSchema = z.object({ const evalRunInputConfigSchema = z.object({
k: z.number().min(1).max(10).optional(), k: z.number().min(1).max(10).optional(),
maxConcurrency: z.number().min(1).max(10).optional(), maxConcurrency: z.number().min(1).max(20).optional(),
maxSteps: z.number().min(1).max(1000).optional(), maxSteps: z.number().min(1).max(1000).optional(),
timeout: z.number().min(60_000).max(3_600_000).optional(), timeout: z
.number()
.min(60_000)
.max(6 * 3_600_000)
.optional(),
}); });
const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => { const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {

View File

@@ -178,7 +178,7 @@ describe('ExaImpl', () => {
const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string); const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string);
expect(body.query).toBe('my search query'); expect(body.query).toBe('my search query');
expect(body.numResults).toBe(15); expect(body.numResults).toBe(10);
expect(body.type).toBe('auto'); expect(body.type).toBe('auto');
}); });

View File

@@ -31,7 +31,7 @@ export class ExaImpl implements SearchServiceImpl {
const endpoint = urlJoin(this.baseUrl, '/search'); const endpoint = urlJoin(this.baseUrl, '/search');
const defaultQueryParams: ExaSearchParameters = { const defaultQueryParams: ExaSearchParameters = {
numResults: 15, numResults: 10,
query, query,
type: 'auto', type: 'auto',
}; };