mirror of
https://github.com/lobehub/lobehub.git
synced 2026-03-26 13:19:34 +07:00
🔧 chore(eval): improve trajectory workflow controls and execution metadata (#13049)
* 🔧 chore(search): reduce Exa default result count
* 🐛 fix(eval): relax run input schema limits
* ✨ feat(agent): persist tool execution time in message metadata
* 🔧 chore(eval): add flow control to trajectory workflows
* 🧪 test: adjust Exa numResults expectation
This commit is contained in:
@@ -105,6 +105,7 @@ export const MessageMetadataSchema = ModelUsageSchema.merge(ModelPerformanceSche
|
||||
reactions: z.array(EmojiReactionSchema).optional(),
|
||||
scope: z.string().optional(),
|
||||
subAgentId: z.string().optional(),
|
||||
toolExecutionTimeMs: z.number().optional(),
|
||||
});
|
||||
|
||||
export interface ModelUsage extends ModelTokensUsage {
|
||||
@@ -193,5 +194,9 @@ export interface MessageMetadata extends ModelUsage, ModelPerformance {
|
||||
taskTitle?: string;
|
||||
// message content is multimodal, display content in the streaming, won't save to db
|
||||
tempDisplayContent?: string;
|
||||
/**
|
||||
* Tool execution time for tool messages (ms)
|
||||
*/
|
||||
toolExecutionTimeMs?: number;
|
||||
usage?: ModelUsage;
|
||||
}
|
||||
|
||||
@@ -112,7 +112,7 @@ export const { POST } = serve<RunAgentTrajectoryPayload>(
|
||||
flowControl: {
|
||||
key: 'agent-eval-run.run-agent-trajectory',
|
||||
parallelism: 500,
|
||||
ratePerSecond: 10,
|
||||
ratePerSecond: 20,
|
||||
},
|
||||
qstashClient,
|
||||
},
|
||||
|
||||
@@ -98,7 +98,7 @@ export const { POST } = serve<RunThreadTrajectoryPayload>(
|
||||
flowControl: {
|
||||
key: 'agent-eval-run.run-thread-trajectory',
|
||||
parallelism: 500,
|
||||
ratePerSecond: 10,
|
||||
ratePerSecond: 20,
|
||||
},
|
||||
qstashClient,
|
||||
},
|
||||
|
||||
@@ -666,6 +666,7 @@ export const createRuntimeExecutors = (
|
||||
const toolMessage = await ctx.messageModel.create({
|
||||
agentId: state.metadata!.agentId!,
|
||||
content: executionResult.content,
|
||||
metadata: { toolExecutionTimeMs: executionTime },
|
||||
parentId: payload.parentMessageId,
|
||||
plugin: chatToolPayload as any,
|
||||
pluginError: executionResult.error,
|
||||
@@ -882,6 +883,7 @@ export const createRuntimeExecutors = (
|
||||
const toolMessage = await ctx.messageModel.create({
|
||||
agentId: state.metadata!.agentId!,
|
||||
content: executionResult.content,
|
||||
metadata: { toolExecutionTimeMs: executionTime },
|
||||
parentId: parentMessageId,
|
||||
plugin: chatToolPayload as any,
|
||||
pluginError: executionResult.error,
|
||||
|
||||
@@ -836,6 +836,35 @@ describe('RuntimeExecutors', () => {
|
||||
);
|
||||
});
|
||||
|
||||
it('should persist tool execution time in metadata when creating tool message', async () => {
|
||||
const executors = createRuntimeExecutors(ctx);
|
||||
const state = createMockState();
|
||||
|
||||
const instruction = {
|
||||
payload: {
|
||||
parentMessageId: 'assistant-msg-456',
|
||||
toolCalling: {
|
||||
apiName: 'crawl',
|
||||
arguments: '{"url": "https://example.com"}',
|
||||
id: 'tool-call-2',
|
||||
identifier: 'web-browsing',
|
||||
type: 'default' as const,
|
||||
},
|
||||
},
|
||||
type: 'call_tool' as const,
|
||||
};
|
||||
|
||||
await executors.call_tool!(instruction, state);
|
||||
|
||||
expect(mockMessageModel.create).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
metadata: {
|
||||
toolExecutionTimeMs: 100,
|
||||
},
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => {
|
||||
// Setup: mock messageModel.create to return a specific tool message ID
|
||||
const toolMessageId = 'tool-msg-789';
|
||||
@@ -1553,6 +1582,69 @@ describe('RuntimeExecutors', () => {
|
||||
expect(state.usage.tools.totalCalls).toBe(0);
|
||||
});
|
||||
|
||||
it('should persist execution time metadata for each tool message in batch execution', async () => {
|
||||
mockToolExecutionService.executeTool
|
||||
.mockResolvedValueOnce({
|
||||
content: 'Search result',
|
||||
error: null,
|
||||
executionTime: 150,
|
||||
state: {},
|
||||
success: true,
|
||||
})
|
||||
.mockResolvedValueOnce({
|
||||
content: 'Crawl result',
|
||||
error: null,
|
||||
executionTime: 250,
|
||||
state: {},
|
||||
success: true,
|
||||
});
|
||||
|
||||
const executors = createRuntimeExecutors(ctx);
|
||||
const state = createMockState();
|
||||
|
||||
const instruction = {
|
||||
payload: {
|
||||
parentMessageId: 'assistant-msg-123',
|
||||
toolsCalling: [
|
||||
{
|
||||
apiName: 'search',
|
||||
arguments: '{"query": "test"}',
|
||||
id: 'tool-call-1',
|
||||
identifier: 'web-search',
|
||||
type: 'default' as const,
|
||||
},
|
||||
{
|
||||
apiName: 'crawl',
|
||||
arguments: '{"url": "https://example.com"}',
|
||||
id: 'tool-call-2',
|
||||
identifier: 'web-browsing',
|
||||
type: 'default' as const,
|
||||
},
|
||||
],
|
||||
},
|
||||
type: 'call_tools_batch' as const,
|
||||
};
|
||||
|
||||
await executors.call_tools_batch!(instruction, state);
|
||||
|
||||
expect(mockMessageModel.create).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
expect.objectContaining({
|
||||
metadata: {
|
||||
toolExecutionTimeMs: 150,
|
||||
},
|
||||
}),
|
||||
);
|
||||
expect(mockMessageModel.create).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
expect.objectContaining({
|
||||
metadata: {
|
||||
toolExecutionTimeMs: 250,
|
||||
},
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('should pass toolResultMaxLength from agentConfig to executeTool', async () => {
|
||||
const executors = createRuntimeExecutors(ctx);
|
||||
const state = createMockState({
|
||||
|
||||
@@ -40,9 +40,13 @@ const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passth
|
||||
|
||||
const evalRunInputConfigSchema = z.object({
|
||||
k: z.number().min(1).max(10).optional(),
|
||||
maxConcurrency: z.number().min(1).max(10).optional(),
|
||||
maxConcurrency: z.number().min(1).max(20).optional(),
|
||||
maxSteps: z.number().min(1).max(1000).optional(),
|
||||
timeout: z.number().min(60_000).max(3_600_000).optional(),
|
||||
timeout: z
|
||||
.number()
|
||||
.min(60_000)
|
||||
.max(6 * 3_600_000)
|
||||
.optional(),
|
||||
});
|
||||
|
||||
const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
|
||||
|
||||
@@ -178,7 +178,7 @@ describe('ExaImpl', () => {
|
||||
|
||||
const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string);
|
||||
expect(body.query).toBe('my search query');
|
||||
expect(body.numResults).toBe(15);
|
||||
expect(body.numResults).toBe(10);
|
||||
expect(body.type).toBe('auto');
|
||||
});
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ export class ExaImpl implements SearchServiceImpl {
|
||||
const endpoint = urlJoin(this.baseUrl, '/search');
|
||||
|
||||
const defaultQueryParams: ExaSearchParameters = {
|
||||
numResults: 15,
|
||||
numResults: 10,
|
||||
query,
|
||||
type: 'auto',
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user