mirror of
https://github.com/lobehub/lobehub.git
synced 2026-03-26 13:19:34 +07:00
🔧 chore(eval): improve trajectory workflow controls and execution metadata (#13049)
* 🔧 chore(search): reduce Exa default result count
* 🐛 fix(eval): relax run input schema limits
* ✨ feat(agent): persist tool execution time in message metadata
* 🔧 chore(eval): add flow control to trajectory workflows
* 🧪 test: adjust Exa numResults expectation
This commit is contained in:
@@ -105,6 +105,7 @@ export const MessageMetadataSchema = ModelUsageSchema.merge(ModelPerformanceSche
|
|||||||
reactions: z.array(EmojiReactionSchema).optional(),
|
reactions: z.array(EmojiReactionSchema).optional(),
|
||||||
scope: z.string().optional(),
|
scope: z.string().optional(),
|
||||||
subAgentId: z.string().optional(),
|
subAgentId: z.string().optional(),
|
||||||
|
toolExecutionTimeMs: z.number().optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
export interface ModelUsage extends ModelTokensUsage {
|
export interface ModelUsage extends ModelTokensUsage {
|
||||||
@@ -193,5 +194,9 @@ export interface MessageMetadata extends ModelUsage, ModelPerformance {
|
|||||||
taskTitle?: string;
|
taskTitle?: string;
|
||||||
// message content is multimodal, display content in the streaming, won't save to db
|
// message content is multimodal, display content in the streaming, won't save to db
|
||||||
tempDisplayContent?: string;
|
tempDisplayContent?: string;
|
||||||
|
/**
|
||||||
|
* Tool execution time for tool messages (ms)
|
||||||
|
*/
|
||||||
|
toolExecutionTimeMs?: number;
|
||||||
usage?: ModelUsage;
|
usage?: ModelUsage;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ export const { POST } = serve<RunAgentTrajectoryPayload>(
|
|||||||
flowControl: {
|
flowControl: {
|
||||||
key: 'agent-eval-run.run-agent-trajectory',
|
key: 'agent-eval-run.run-agent-trajectory',
|
||||||
parallelism: 500,
|
parallelism: 500,
|
||||||
ratePerSecond: 10,
|
ratePerSecond: 20,
|
||||||
},
|
},
|
||||||
qstashClient,
|
qstashClient,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ export const { POST } = serve<RunThreadTrajectoryPayload>(
|
|||||||
flowControl: {
|
flowControl: {
|
||||||
key: 'agent-eval-run.run-thread-trajectory',
|
key: 'agent-eval-run.run-thread-trajectory',
|
||||||
parallelism: 500,
|
parallelism: 500,
|
||||||
ratePerSecond: 10,
|
ratePerSecond: 20,
|
||||||
},
|
},
|
||||||
qstashClient,
|
qstashClient,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -666,6 +666,7 @@ export const createRuntimeExecutors = (
|
|||||||
const toolMessage = await ctx.messageModel.create({
|
const toolMessage = await ctx.messageModel.create({
|
||||||
agentId: state.metadata!.agentId!,
|
agentId: state.metadata!.agentId!,
|
||||||
content: executionResult.content,
|
content: executionResult.content,
|
||||||
|
metadata: { toolExecutionTimeMs: executionTime },
|
||||||
parentId: payload.parentMessageId,
|
parentId: payload.parentMessageId,
|
||||||
plugin: chatToolPayload as any,
|
plugin: chatToolPayload as any,
|
||||||
pluginError: executionResult.error,
|
pluginError: executionResult.error,
|
||||||
@@ -882,6 +883,7 @@ export const createRuntimeExecutors = (
|
|||||||
const toolMessage = await ctx.messageModel.create({
|
const toolMessage = await ctx.messageModel.create({
|
||||||
agentId: state.metadata!.agentId!,
|
agentId: state.metadata!.agentId!,
|
||||||
content: executionResult.content,
|
content: executionResult.content,
|
||||||
|
metadata: { toolExecutionTimeMs: executionTime },
|
||||||
parentId: parentMessageId,
|
parentId: parentMessageId,
|
||||||
plugin: chatToolPayload as any,
|
plugin: chatToolPayload as any,
|
||||||
pluginError: executionResult.error,
|
pluginError: executionResult.error,
|
||||||
|
|||||||
@@ -836,6 +836,35 @@ describe('RuntimeExecutors', () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should persist tool execution time in metadata when creating tool message', async () => {
|
||||||
|
const executors = createRuntimeExecutors(ctx);
|
||||||
|
const state = createMockState();
|
||||||
|
|
||||||
|
const instruction = {
|
||||||
|
payload: {
|
||||||
|
parentMessageId: 'assistant-msg-456',
|
||||||
|
toolCalling: {
|
||||||
|
apiName: 'crawl',
|
||||||
|
arguments: '{"url": "https://example.com"}',
|
||||||
|
id: 'tool-call-2',
|
||||||
|
identifier: 'web-browsing',
|
||||||
|
type: 'default' as const,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
type: 'call_tool' as const,
|
||||||
|
};
|
||||||
|
|
||||||
|
await executors.call_tool!(instruction, state);
|
||||||
|
|
||||||
|
expect(mockMessageModel.create).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
metadata: {
|
||||||
|
toolExecutionTimeMs: 100,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => {
|
it('should return tool message ID as parentMessageId in nextContext for parentId chain', async () => {
|
||||||
// Setup: mock messageModel.create to return a specific tool message ID
|
// Setup: mock messageModel.create to return a specific tool message ID
|
||||||
const toolMessageId = 'tool-msg-789';
|
const toolMessageId = 'tool-msg-789';
|
||||||
@@ -1553,6 +1582,69 @@ describe('RuntimeExecutors', () => {
|
|||||||
expect(state.usage.tools.totalCalls).toBe(0);
|
expect(state.usage.tools.totalCalls).toBe(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should persist execution time metadata for each tool message in batch execution', async () => {
|
||||||
|
mockToolExecutionService.executeTool
|
||||||
|
.mockResolvedValueOnce({
|
||||||
|
content: 'Search result',
|
||||||
|
error: null,
|
||||||
|
executionTime: 150,
|
||||||
|
state: {},
|
||||||
|
success: true,
|
||||||
|
})
|
||||||
|
.mockResolvedValueOnce({
|
||||||
|
content: 'Crawl result',
|
||||||
|
error: null,
|
||||||
|
executionTime: 250,
|
||||||
|
state: {},
|
||||||
|
success: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
const executors = createRuntimeExecutors(ctx);
|
||||||
|
const state = createMockState();
|
||||||
|
|
||||||
|
const instruction = {
|
||||||
|
payload: {
|
||||||
|
parentMessageId: 'assistant-msg-123',
|
||||||
|
toolsCalling: [
|
||||||
|
{
|
||||||
|
apiName: 'search',
|
||||||
|
arguments: '{"query": "test"}',
|
||||||
|
id: 'tool-call-1',
|
||||||
|
identifier: 'web-search',
|
||||||
|
type: 'default' as const,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
apiName: 'crawl',
|
||||||
|
arguments: '{"url": "https://example.com"}',
|
||||||
|
id: 'tool-call-2',
|
||||||
|
identifier: 'web-browsing',
|
||||||
|
type: 'default' as const,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
type: 'call_tools_batch' as const,
|
||||||
|
};
|
||||||
|
|
||||||
|
await executors.call_tools_batch!(instruction, state);
|
||||||
|
|
||||||
|
expect(mockMessageModel.create).toHaveBeenNthCalledWith(
|
||||||
|
1,
|
||||||
|
expect.objectContaining({
|
||||||
|
metadata: {
|
||||||
|
toolExecutionTimeMs: 150,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
expect(mockMessageModel.create).toHaveBeenNthCalledWith(
|
||||||
|
2,
|
||||||
|
expect.objectContaining({
|
||||||
|
metadata: {
|
||||||
|
toolExecutionTimeMs: 250,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
it('should pass toolResultMaxLength from agentConfig to executeTool', async () => {
|
it('should pass toolResultMaxLength from agentConfig to executeTool', async () => {
|
||||||
const executors = createRuntimeExecutors(ctx);
|
const executors = createRuntimeExecutors(ctx);
|
||||||
const state = createMockState({
|
const state = createMockState({
|
||||||
|
|||||||
@@ -40,9 +40,13 @@ const evalConfigSchema = z.object({ judgePrompt: z.string().optional() }).passth
|
|||||||
|
|
||||||
const evalRunInputConfigSchema = z.object({
|
const evalRunInputConfigSchema = z.object({
|
||||||
k: z.number().min(1).max(10).optional(),
|
k: z.number().min(1).max(10).optional(),
|
||||||
maxConcurrency: z.number().min(1).max(10).optional(),
|
maxConcurrency: z.number().min(1).max(20).optional(),
|
||||||
maxSteps: z.number().min(1).max(1000).optional(),
|
maxSteps: z.number().min(1).max(1000).optional(),
|
||||||
timeout: z.number().min(60_000).max(3_600_000).optional(),
|
timeout: z
|
||||||
|
.number()
|
||||||
|
.min(60_000)
|
||||||
|
.max(6 * 3_600_000)
|
||||||
|
.optional(),
|
||||||
});
|
});
|
||||||
|
|
||||||
const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
|
const agentEvalProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
|
||||||
|
|||||||
@@ -178,7 +178,7 @@ describe('ExaImpl', () => {
|
|||||||
|
|
||||||
const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string);
|
const body = JSON.parse((vi.mocked(fetch).mock.calls[0][1] as RequestInit).body as string);
|
||||||
expect(body.query).toBe('my search query');
|
expect(body.query).toBe('my search query');
|
||||||
expect(body.numResults).toBe(15);
|
expect(body.numResults).toBe(10);
|
||||||
expect(body.type).toBe('auto');
|
expect(body.type).toBe('auto');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ export class ExaImpl implements SearchServiceImpl {
|
|||||||
const endpoint = urlJoin(this.baseUrl, '/search');
|
const endpoint = urlJoin(this.baseUrl, '/search');
|
||||||
|
|
||||||
const defaultQueryParams: ExaSearchParameters = {
|
const defaultQueryParams: ExaSearchParameters = {
|
||||||
numResults: 15,
|
numResults: 10,
|
||||||
query,
|
query,
|
||||||
type: 'auto',
|
type: 'auto',
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user