🔧 chore(eval): improve trajectory workflow controls and execution metadata (#13049)

* 🔧 chore(search): reduce Exa default result count
* 🐛 fix(eval): relax run input schema limits
* ✨ feat(agent): persist tool execution time in message metadata
* 🔧 chore(eval): add flow control to trajectory workflows
* 🧪 test: adjust Exa numResults expectation
2026-03-26 13:19:34 +07:00 · 2026-03-18 10:29:49 +08:00
parent 3a789dc612
commit 69c24c714e
8 changed files with 109 additions and 6 deletions
--- a/packages/types/src/message/common/metadata.ts
+++ b/packages/types/src/message/common/metadata.ts
@@ -105,6 +105,7 @@ export const MessageMetadataSchema = ModelUsageSchema.merge(ModelPerformanceSche
  reactions: z.array(EmojiReactionSchema).optional(),
  scope: z.string().optional(),
  subAgentId: z.string().optional(),
+  toolExecutionTimeMs: z.number().optional(),
 });

 export interface ModelUsage extends ModelTokensUsage {
@@ -193,5 +194,9 @@ export interface MessageMetadata extends ModelUsage, ModelPerformance {
  taskTitle?: string;
  // message content is multimodal, display content in the streaming, won't save to db
  tempDisplayContent?: string;
+  /**
+   * Tool execution time for tool messages (ms)
+   */
+  toolExecutionTimeMs?: number;
  usage?: ModelUsage;
 }
--- a/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts
+++ b/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts
@@ -112,7 +112,7 @@ export const { POST } = serve<RunAgentTrajectoryPayload>(
    flowControl: {
      key: 'agent-eval-run.run-agent-trajectory',
      parallelism: 500,
-      ratePerSecond: 10,
+      ratePerSecond: 20,
    },
    qstashClient,
  },
--- a/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts
+++ b/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts
@@ -98,7 +98,7 @@ export const { POST } = serve<RunThreadTrajectoryPayload>(
    flowControl: {
      key: 'agent-eval-run.run-thread-trajectory',
      parallelism: 500,
-      ratePerSecond: 10,
+      ratePerSecond: 20,
    },
    qstashClient,
  },
--- a/src/server/modules/AgentRuntime/RuntimeExecutors.ts
+++ b/src/server/modules/AgentRuntime/RuntimeExecutors.ts
@@ -666,6 +666,7 @@ export const createRuntimeExecutors = (
        const toolMessage = await ctx.messageModel.create({
          agentId: state.metadata!.agentId!,
          content: executionResult.content,
+          metadata: { toolExecutionTimeMs: executionTime },
          parentId: payload.parentMessageId,
          plugin: chatToolPayload as any,
          pluginError: executionResult.error,
@@ -882,6 +883,7 @@ export const createRuntimeExecutors = (
            const toolMessage = await ctx.messageModel.create({
              agentId: state.metadata!.agentId!,
              content: executionResult.content,
+              metadata: { toolExecutionTimeMs: executionTime },
              parentId: parentMessageId,
              plugin: chatToolPayload as any,
              pluginError: executionResult.error,
--- a/src/server/modules/AgentRuntime/tests/RuntimeExecutors.test.ts
+++ b/src/server/modules/AgentRuntime/tests/RuntimeExecutors.test.ts
@@ -836,6 +836,35 @@ describe('RuntimeExecutors', () => {
      );
    });

+    it('should persist tool execution time in metadata when creating tool message', async () => {
+      const executors = createRuntimeExecutors(ctx);
+      const state = createMockState();
+
+      const instruction = {
+        payload: {
+          parentMessageId: 'assistant-msg-456',
+          toolCalling: {
+            apiName: 'crawl',
+            arguments: '{"url": "https://example.com"}',
+            id: 'tool-call-2',
+            identifier: 'web-browsing',
+            type: 'default' as const,
+          },
+        },
+        type: 'call_tool' as const,
+      };
+
+      await executors.call_tool!(instruction, state);
+
+      expect(mockMessageModel.create).toHaveBeenCalledWith(
+        expect.objectContaining({
+          metadata: {
+            toolExecutionTimeMs: 100,
+          },
+        }),
+      );
+    });
+
    it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => {
      // Setup: mock messageModel.create to return a specific tool message ID
      const toolMessageId = 'tool-msg-789';
@@ -1553,6 +1582,69 @@ describe('RuntimeExecutors', () => {
      expect(state.usage.tools.totalCalls).toBe(0);
    });

+    it('should persist execution time metadata for each tool message in batch execution', async () => {
+      mockToolExecutionService.executeTool
+        .mockResolvedValueOnce({
+          content: 'Search result',
+          error: null,
+          executionTime: 150,
+          state: {},
+          success: true,
+        })
+        .mockResolvedValueOnce({
+          content: 'Crawl result',
+          error: null,
+          executionTime: 250,
+          state: {},
+          success: true,
+        });
+
+      const executors = createRuntimeExecutors(ctx);
+      const state = createMockState();
+
+      const instruction = {
+        payload: {
+          parentMessageId: 'assistant-msg-123',
+          toolsCalling: [
+            {
+              apiName: 'search',
+              arguments: '{"query": "test"}',
+              id: 'tool-call-1',
+              identifier: 'web-search',
+              type: 'default' as const,
+            },
+            {
+              apiName: 'crawl',
+              arguments: '{"url": "https://example.com"}',
+              id: 'tool-call-2',
+              identifier: 'web-browsing',
+              type: 'default' as const,
+            },
+          ],
+        },
+        type: 'call_tools_batch' as const,
+      };
+
+      await executors.call_tools_batch!(instruction, state);
+
+      expect(mockMessageModel.create).toHaveBeenNthCalledWith(
+        1,
+        expect.objectContaining({
+          metadata: {
+            toolExecutionTimeMs: 150,
+          },
+        }),
+      );
+      expect(mockMessageModel.create).toHaveBeenNthCalledWith(
+        2,
+        expect.objectContaining({
+          metadata: {
+            toolExecutionTimeMs: 250,
+          },
+        }),
+      );
+    });
+
    it('should pass toolResultMaxLength from agentConfig to executeTool', async () => {
      const executors = createRuntimeExecutors(ctx);
      const state = createMockState({
--- a/src/server/routers/lambda/agentEval.ts
+++ b/src/server/routers/lambda/agentEval.ts
@@ -40,9 +40,13 @@ const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passth

 const evalRunInputConfigSchema = z.object({
  k: z.number().min(1).max(10).optional(),
-  maxConcurrency: z.number().min(1).max(10).optional(),
+  maxConcurrency: z.number().min(1).max(20).optional(),
  maxSteps: z.number().min(1).max(1000).optional(),
-  timeout: z.number().min(60_000).max(3_600_000).optional(),
+  timeout: z
+    .number()
+    .min(60_000)
+    .max(6 * 3_600_000)
+    .optional(),
 });

 const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
--- a/src/server/services/search/impls/exa/index.test.ts
+++ b/src/server/services/search/impls/exa/index.test.ts
@@ -178,7 +178,7 @@ describe('ExaImpl', () => {

      const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string);
      expect(body.query).toBe('my search query');
-      expect(body.numResults).toBe(15);
+      expect(body.numResults).toBe(10);
      expect(body.type).toBe('auto');
    });

--- a/src/server/services/search/impls/exa/index.ts
+++ b/src/server/services/search/impls/exa/index.ts
@@ -31,7 +31,7 @@ export class ExaImpl implements SearchServiceImpl {
    const endpoint = urlJoin(this.baseUrl, '/search');

    const defaultQueryParams: ExaSearchParameters = {
-      numResults: 15,
+      numResults: 10,
      query,
      type: 'auto',
    };