🔧 chore(eval): improve trajectory workflow controls and execution metadata (#13049)

* 🔧 chore(search): reduce Exa default result count
* 🐛 fix(eval): relax run input schema limits
* ✨ feat(agent): persist tool execution time in message metadata
* 🔧 chore(eval): add flow control to trajectory workflows
* 🧪 test: adjust Exa numResults expectation
2026-03-26 13:19:34 +07:00 · 2026-03-18 10:29:49 +08:00
parent 3a789dc612
commit 69c24c714e
8 changed files with 109 additions and 6 deletions
--- a/packages/types/src/message/common/metadata.ts
+++ b/packages/types/src/message/common/metadata.ts
@@ -105,6 +105,7 @@ export const MessageMetadataSchema = ModelUsageSchema.merge(ModelPerformanceSche
  reactions: z.array(EmojiReactionSchema).optional(),
  scope: z.string().optional(),
  subAgentId: z.string().optional(),
  toolExecutionTimeMs: z.number().optional(),
 });
 export interface ModelUsage extends ModelTokensUsage {
@@ -193,5 +194,9 @@ export interface MessageMetadata extends ModelUsage, ModelPerformance {
  taskTitle?: string;
  // message content is multimodal, display content in the streaming, won't save to db
  tempDisplayContent?: string;
  /**
   * Tool execution time for tool messages (ms)
   */
  toolExecutionTimeMs?: number;
  usage?: ModelUsage;
 }
--- a/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts
+++ b/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts
@@ -112,7 +112,7 @@ export const { POST } = serve<RunAgentTrajectoryPayload>(
    flowControl: {
      key: 'agent-eval-run.run-agent-trajectory',
      parallelism: 500,
-      ratePerSecond: 10,
+      ratePerSecond: 20,
    },
    qstashClient,
  },
--- a/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts
+++ b/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts
@@ -98,7 +98,7 @@ export const { POST } = serve<RunThreadTrajectoryPayload>(
    flowControl: {
      key: 'agent-eval-run.run-thread-trajectory',
      parallelism: 500,
-      ratePerSecond: 10,
+      ratePerSecond: 20,
    },
    qstashClient,
  },
--- a/src/server/modules/AgentRuntime/RuntimeExecutors.ts
+++ b/src/server/modules/AgentRuntime/RuntimeExecutors.ts
@@ -666,6 +666,7 @@ export const createRuntimeExecutors = (
        const toolMessage = await ctx.messageModel.create({
          agentId: state.metadata!.agentId!,
          content: executionResult.content,
          metadata: { toolExecutionTimeMs: executionTime },
          parentId: payload.parentMessageId,
          plugin: chatToolPayload as any,
          pluginError: executionResult.error,
@@ -882,6 +883,7 @@ export const createRuntimeExecutors = (
            const toolMessage = await ctx.messageModel.create({
              agentId: state.metadata!.agentId!,
              content: executionResult.content,
              metadata: { toolExecutionTimeMs: executionTime },
              parentId: parentMessageId,
              plugin: chatToolPayload as any,
              pluginError: executionResult.error,
--- a/src/server/modules/AgentRuntime/tests/RuntimeExecutors.test.ts
+++ b/src/server/modules/AgentRuntime/tests/RuntimeExecutors.test.ts
@@ -836,6 +836,35 @@ describe('RuntimeExecutors', () => {
      );
    });
    // Checks that the `call_tool` executor passes a `metadata` object carrying
    // `toolExecutionTimeMs` to `messageModel.create` when it stores the tool message.
    it('should persist tool execution time in metadata when creating tool message', async () => {
      const executors = createRuntimeExecutors(ctx);
      const state = createMockState();
      // Single tool call instruction; the argument string is JSON-encoded.
      const instruction = {
        payload: {
          parentMessageId: 'assistant-msg-456',
          toolCalling: {
            apiName: 'crawl',
            arguments: '{"url": "https://example.com"}',
            id: 'tool-call-2',
            identifier: 'web-browsing',
            type: 'default' as const,
          },
        },
        type: 'call_tool' as const,
      };
      await executors.call_tool!(instruction, state);
      // Only the top-level create() argument is matched loosely; `metadata` itself is
      // compared as a whole object rather than through objectContaining.
      // NOTE(review): 100 presumably mirrors the executionTime returned by the shared
      // executeTool mock configured outside this test — confirm against the suite setup.
      expect(mockMessageModel.create).toHaveBeenCalledWith(
        expect.objectContaining({
          metadata: {
            toolExecutionTimeMs: 100,
          },
        }),
      );
    });
    it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => {
      // Setup: mock messageModel.create to return a specific tool message ID
      const toolMessageId = 'tool-msg-789';
@@ -1553,6 +1582,69 @@ describe('RuntimeExecutors', () => {
      expect(state.usage.tools.totalCalls).toBe(0);
    });
    // Checks that `call_tools_batch` stores a separate `toolExecutionTimeMs` value
    // for each tool message it creates.
    it('should persist execution time metadata for each tool message in batch execution', async () => {
      // Queue two one-shot results with distinct execution times (150 ms, then 250 ms)
      // so the assertions below can tell the two tool messages apart.
      mockToolExecutionService.executeTool
        .mockResolvedValueOnce({
          content: 'Search result',
          error: null,
          executionTime: 150,
          state: {},
          success: true,
        })
        .mockResolvedValueOnce({
          content: 'Crawl result',
          error: null,
          executionTime: 250,
          state: {},
          success: true,
        });
      const executors = createRuntimeExecutors(ctx);
      const state = createMockState();
      // Batch instruction with two tool calls that share one parent message.
      const instruction = {
        payload: {
          parentMessageId: 'assistant-msg-123',
          toolsCalling: [
            {
              apiName: 'search',
              arguments: '{"query": "test"}',
              id: 'tool-call-1',
              identifier: 'web-search',
              type: 'default' as const,
            },
            {
              apiName: 'crawl',
              arguments: '{"url": "https://example.com"}',
              id: 'tool-call-2',
              identifier: 'web-browsing',
              type: 'default' as const,
            },
          ],
        },
        type: 'call_tools_batch' as const,
      };
      await executors.call_tools_batch!(instruction, state);
      // NOTE(review): the Nth-call assertions assume executeTool and messageModel.create
      // are invoked in the same order as `toolsCalling`; verify that the executor
      // guarantees this ordering, otherwise this test may be order-sensitive.
      expect(mockMessageModel.create).toHaveBeenNthCalledWith(
        1,
        expect.objectContaining({
          metadata: {
            toolExecutionTimeMs: 150,
          },
        }),
      );
      expect(mockMessageModel.create).toHaveBeenNthCalledWith(
        2,
        expect.objectContaining({
          metadata: {
            toolExecutionTimeMs: 250,
          },
        }),
      );
    });
    it('should pass toolResultMaxLength from agentConfig to executeTool', async () => {
      const executors = createRuntimeExecutors(ctx);
      const state = createMockState({
--- a/src/server/routers/lambda/agentEval.ts
+++ b/src/server/routers/lambda/agentEval.ts
@@ -40,9 +40,13 @@ const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passth
 const evalRunInputConfigSchema = z.object({
  k: z.number().min(1).max(10).optional(),
-  maxConcurrency: z.number().min(1).max(10).optional(),
+  maxConcurrency: z.number().min(1).max(20).optional(),
  maxSteps: z.number().min(1).max(1000).optional(),
-  timeout: z.number().min(60_000).max(3_600_000).optional(),
+  timeout: z
    .number()
    .min(60_000)
    .max(6 * 3_600_000)
    .optional(),
 });
 const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
--- a/src/server/services/search/impls/exa/index.test.ts
+++ b/src/server/services/search/impls/exa/index.test.ts
@@ -178,7 +178,7 @@ describe('ExaImpl', () => {
      const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string);
      expect(body.query).toBe('my search query');
-      expect(body.numResults).toBe(15);
+      expect(body.numResults).toBe(10);
      expect(body.type).toBe('auto');
    });
--- a/src/server/services/search/impls/exa/index.ts
+++ b/src/server/services/search/impls/exa/index.ts
@@ -31,7 +31,7 @@ export class ExaImpl implements SearchServiceImpl {
    const endpoint = urlJoin(this.baseUrl, '/search');
    const defaultQueryParams: ExaSearchParameters = {
-      numResults: 15,
+      numResults: 10,
      query,
      type: 'auto',
    };