🐛 fix: fix multimodal content_part images rendered as base64 text (#12210)

This commit is contained in:
Arvin Xu
2026-02-09 18:18:59 +08:00
committed by GitHub
parent 4ae90976a0
commit 00ff5b9c1b
6 changed files with 500 additions and 16 deletions

View File

@@ -417,12 +417,13 @@ export class StreamingHandler {
: this.thinkingContent
? { content: this.thinkingContent, duration: this.thinkingDuration }
: undefined,
hasContentImages
? {
isMultimodal: true,
tempDisplayContent: serializePartsForStorage(this.contentParts),
}
: undefined,
);
// If has content images, also notify with metadata for tempDisplayContent
if (hasContentImages) {
// This is handled in the main onContentUpdate callback
}
}
private buildReasoningState(): ReasoningState | undefined {

View File

@@ -0,0 +1,302 @@
import { describe, expect, it, vi } from 'vitest';
import { StreamingHandler } from '../StreamingHandler';
import type { StreamChunk, StreamingCallbacks, StreamingContext } from '../types/streaming';
// Helper to create a mock streaming context
/**
 * Builds a StreamingContext for tests with fixed default ids; individual
 * tests may selectively override any field via `overrides`.
 */
const createContext = (overrides: Partial<StreamingContext> = {}): StreamingContext => {
  const base = {
    agentId: 'test-agent',
    messageId: 'test-message',
    operationId: 'test-op',
  };
  return { ...base, ...overrides };
};
// Helper to create mock callbacks
/**
 * Builds a complete set of vi.fn() mocks for StreamingCallbacks.
 * Defaults: onReasoningStart yields a fixed operation id, transformToolCalls
 * yields an empty list, and uploadBase64Image resolves to a fake uploaded
 * file record. Tests can swap any callback via `overrides`.
 */
const createCallbacks = (overrides: Partial<StreamingCallbacks> = {}): StreamingCallbacks => {
  const defaults = {
    onContentUpdate: vi.fn(),
    onGroundingUpdate: vi.fn(),
    onImagesUpdate: vi.fn(),
    onReasoningComplete: vi.fn(),
    onReasoningStart: vi.fn().mockReturnValue('reasoning-op-id'),
    onReasoningUpdate: vi.fn(),
    onToolCallsUpdate: vi.fn(),
    toggleToolCallingStreaming: vi.fn(),
    transformToolCalls: vi.fn().mockReturnValue([]),
    uploadBase64Image: vi.fn().mockResolvedValue({ id: 'img-id', url: 'https://uploaded.url/img.png' }),
  };
  return Object.assign(defaults, overrides);
};
// Unit tests for StreamingHandler. Each describe group feeds one chunk type
// through handleChunk() and asserts on the callbacks invoked and/or on the
// final result returned by handleFinish().
describe('StreamingHandler', () => {
  // content_part chunks carry mixed text/image parts; when at least one image
  // part arrives, onContentUpdate must receive a third contentMetadata arg.
  describe('content_part image handling', () => {
    it('should pass contentMetadata with isMultimodal when content_part image chunks are received', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      // Send a text content_part
      handler.handleChunk({
        type: 'content_part',
        partType: 'text',
        content: 'Here is an image: ',
      });
      // Send an image content_part
      handler.handleChunk({
        type: 'content_part',
        partType: 'image',
        content: 'base64imagedata',
        mimeType: 'image/png',
      });
      // The last onContentUpdate call should include contentMetadata
      const lastCall = (callbacks.onContentUpdate as ReturnType<typeof vi.fn>).mock.calls.at(-1);
      expect(lastCall).toBeDefined();
      const [content, reasoning, contentMetadata] = lastCall!;
      // Plain-text `content` carries only the text parts; images live in metadata.
      expect(content).toBe('Here is an image: ');
      expect(contentMetadata).toBeDefined();
      expect(contentMetadata.isMultimodal).toBe(true);
      expect(contentMetadata.tempDisplayContent).toBeDefined();
      // tempDisplayContent should be a serialized JSON string containing the parts
      const parsed = JSON.parse(contentMetadata.tempDisplayContent);
      expect(parsed).toHaveLength(2);
      expect(parsed[0]).toEqual({ type: 'text', text: 'Here is an image: ' });
      // Image parts are serialized as data URLs, not raw base64 text.
      expect(parsed[1]).toEqual(
        expect.objectContaining({
          type: 'image',
          image: expect.stringContaining('data:image/png;base64,'),
        }),
      );
    });
    it('should NOT pass contentMetadata when only text content_part chunks are received', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({
        type: 'content_part',
        partType: 'text',
        content: 'Hello ',
      });
      handler.handleChunk({
        type: 'content_part',
        partType: 'text',
        content: 'world',
      });
      const lastCall = (callbacks.onContentUpdate as ReturnType<typeof vi.fn>).mock.calls.at(-1);
      expect(lastCall).toBeDefined();
      const [content, _reasoning, contentMetadata] = lastCall!;
      // Text parts are concatenated into a single string.
      expect(content).toBe('Hello world');
      // No contentMetadata when there are no images
      expect(contentMetadata).toBeUndefined();
    });
    it('should include isMultimodal in final result metadata when content has images', async () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      // Send mixed content
      handler.handleChunk({
        type: 'content_part',
        partType: 'text',
        content: 'A cat: ',
      });
      handler.handleChunk({
        type: 'content_part',
        partType: 'image',
        content: 'base64catimage',
        mimeType: 'image/jpeg',
      });
      const result = await handler.handleFinish({});
      expect(result.metadata.isMultimodal).toBe(true);
      // Content should be serialized JSON containing text + image parts
      const parsed = JSON.parse(result.content);
      expect(parsed).toHaveLength(2);
      expect(parsed[0].type).toBe('text');
      expect(parsed[1].type).toBe('image');
    });
    it('should NOT include isMultimodal in final result when only text content', async () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({ type: 'text', text: 'Hello world' });
      const result = await handler.handleFinish({});
      // Text-only streams keep a plain string content and no multimodal flag.
      expect(result.metadata.isMultimodal).toBeUndefined();
      expect(result.content).toBe('Hello world');
    });
  });
  describe('text chunk handling', () => {
    it('should accumulate text chunks and notify via onContentUpdate', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({ type: 'text', text: 'Hello' });
      handler.handleChunk({ type: 'text', text: ' World' });
      // One onContentUpdate per chunk; getOutput() exposes the accumulated text.
      expect(callbacks.onContentUpdate).toHaveBeenCalledTimes(2);
      expect(handler.getOutput()).toBe('Hello World');
    });
  });
  describe('reasoning chunk handling', () => {
    it('should track reasoning content and start/end timing', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      // Reasoning starts
      handler.handleChunk({ type: 'reasoning', text: 'Let me think...' });
      expect(callbacks.onReasoningStart).toHaveBeenCalledTimes(1);
      expect(callbacks.onReasoningUpdate).toHaveBeenCalledWith({ content: 'Let me think...' });
      // Text ends reasoning
      handler.handleChunk({ type: 'text', text: 'Answer' });
      expect(handler.getThinkingDuration()).toBeDefined();
      expect(handler.getThinkingDuration()).toBeGreaterThanOrEqual(0);
    });
    it('should include reasoning in final result with signature', async () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({ type: 'reasoning', text: 'Thinking' });
      handler.handleChunk({ type: 'text', text: 'Answer' });
      const result = await handler.handleFinish({
        reasoning: { content: 'Thinking', signature: 'test-sig' },
      });
      expect(result.metadata.reasoning).toBeDefined();
      expect(result.metadata.reasoning?.content).toBe('Thinking');
      expect(result.metadata.reasoning?.signature).toBe('test-sig');
      // Duration may be 0 in fast tests (which becomes undefined due to `0 && ...` check)
      // So we just verify it's a number or undefined
      expect(
        result.metadata.reasoning?.duration === undefined ||
          typeof result.metadata.reasoning?.duration === 'number',
      ).toBe(true);
    });
  });
  // reasoning_part mirrors content_part but reports through onReasoningUpdate.
  describe('reasoning_part with images', () => {
    it('should handle reasoning_part image chunks and report isMultimodal', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({
        type: 'reasoning_part',
        partType: 'text',
        content: 'Thinking about image: ',
      });
      handler.handleChunk({
        type: 'reasoning_part',
        partType: 'image',
        content: 'base64data',
        mimeType: 'image/png',
      });
      const lastCall = (callbacks.onReasoningUpdate as ReturnType<typeof vi.fn>).mock.calls.at(-1);
      expect(lastCall).toBeDefined();
      expect(lastCall![0].isMultimodal).toBe(true);
      expect(lastCall![0].tempDisplayContent).toBeDefined();
    });
  });
  describe('tool_calls handling', () => {
    it('should mark as function call when tool_calls chunk is received', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({
        type: 'tool_calls',
        tool_calls: [
          { id: 'tool-1', type: 'function', function: { name: 'test', arguments: '{}' } },
        ],
        isAnimationActives: [true],
      });
      expect(handler.getIsFunctionCall()).toBe(true);
      // Animation flags are forwarded together with the message id from context.
      expect(callbacks.toggleToolCallingStreaming).toHaveBeenCalledWith('test-message', [true]);
    });
  });
  describe('handleFinish with tool calls', () => {
    it('should process final tool calls and set isFunctionCall', async () => {
      const callbacks = createCallbacks({
        transformToolCalls: vi.fn().mockReturnValue([{ identifier: 'test', arguments: '{}' }]),
      });
      const handler = new StreamingHandler(createContext(), callbacks);
      const result = await handler.handleFinish({
        toolCalls: [{ id: 'tool-1', type: 'function', function: { name: 'test', arguments: '' } }],
      });
      expect(result.isFunctionCall).toBe(true);
      expect(result.tools).toBeDefined();
      expect(result.tools).toHaveLength(1);
    });
  });
  describe('base64_image handling', () => {
    it('should dispatch images immediately and upload async', async () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({
        type: 'base64_image',
        image: { id: 'img-1', data: 'base64data' },
        images: [{ id: 'img-1', data: 'base64data' }],
      });
      // UI gets the base64 data immediately, before upload completes.
      expect(callbacks.onImagesUpdate).toHaveBeenCalledWith([
        { alt: 'img-1', id: 'img-1', url: 'base64data' },
      ]);
      // After finish, uploaded images should be in the result
      const result = await handler.handleFinish({});
      expect(result.metadata.imageList).toBeDefined();
      expect(result.metadata.imageList).toHaveLength(1);
      // Mocked uploadBase64Image resolves to this URL (see createCallbacks).
      expect(result.metadata.imageList![0].url).toBe('https://uploaded.url/img.png');
    });
  });
  describe('grounding handling', () => {
    it('should forward citations to onGroundingUpdate', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({
        type: 'grounding',
        grounding: {
          citations: [{ url: 'https://example.com', title: 'Example' }],
          searchQueries: ['query'],
        },
      } as any);
      expect(callbacks.onGroundingUpdate).toHaveBeenCalledWith({
        citations: [{ url: 'https://example.com', title: 'Example' }],
        searchQueries: ['query'],
      });
    });
    it('should skip grounding when no citations', () => {
      const callbacks = createCallbacks();
      const handler = new StreamingHandler(createContext(), callbacks);
      handler.handleChunk({
        type: 'grounding',
        grounding: { citations: [], searchQueries: [] },
      } as any);
      expect(callbacks.onGroundingUpdate).not.toHaveBeenCalled();
    });
  });
});

View File

@@ -1791,6 +1791,174 @@ describe('StreamingExecutor actions', () => {
});
});
// Store-level tests: verifies that content_part image chunks flowing through
// internal_fetchAIChatMessage surface isMultimodal/tempDisplayContent metadata
// via internal_dispatchMessage and optimisticUpdateMessageContent, and that
// plain-text streams do not.
describe('content_part multimodal streaming', () => {
  it('should dispatch isMultimodal metadata when content_part image chunks are received', async () => {
    // Mock file store to prevent upload from hanging
    const fileStoreMod = await import('@/store/file/store');
    vi.spyOn(fileStoreMod, 'getFileStoreState').mockReturnValue({
      uploadBase64FileWithProgress: vi
        .fn()
        .mockResolvedValue({ id: 'img-id', url: 'https://cdn.example.com/img.png' }),
    } as any);
    const { result } = renderHook(() => useChatStore());
    const messages = [createMockMessage({ role: 'user' })];
    const dispatchSpy = vi.spyOn(result.current, 'internal_dispatchMessage');
    // Create operation for this test
    const { operationId } = result.current.startOperation({
      type: 'execAgentRuntime',
      context: {
        agentId: TEST_IDS.SESSION_ID,
        topicId: null,
        messageId: TEST_IDS.ASSISTANT_MESSAGE_ID,
      },
      label: 'Test AI Generation',
    });
    // Replace the chat service stream with a scripted text+image sequence.
    const streamSpy = vi
      .spyOn(chatService, 'createAssistantMessageStream')
      .mockImplementation(async ({ onMessageHandle, onFinish }) => {
        // Send text content_part
        await onMessageHandle?.({
          type: 'content_part',
          partType: 'text',
          content: 'Here is a cat: ',
        } as any);
        // Send image content_part
        await onMessageHandle?.({
          type: 'content_part',
          partType: 'image',
          content: 'base64catimage',
          mimeType: 'image/jpeg',
        } as any);
        await onFinish?.('Here is a cat: ', {} as any);
      });
    await act(async () => {
      await result.current.internal_fetchAIChatMessage({
        messages,
        messageId: TEST_IDS.ASSISTANT_MESSAGE_ID,
        model: 'gemini-3-pro-image-preview',
        provider: 'google',
        operationId,
        agentConfig: createMockResolvedAgentConfig(),
      });
    });
    // Find dispatch calls with metadata.isMultimodal
    const multimodalDispatches = dispatchSpy.mock.calls.filter((call) => {
      const dispatch = call[0];
      return (
        dispatch?.type === 'updateMessage' &&
        'value' in dispatch &&
        dispatch.value?.metadata?.isMultimodal === true
      );
    });
    // Should have dispatched at least once with isMultimodal metadata during streaming
    expect(multimodalDispatches.length).toBeGreaterThanOrEqual(1);
    // Verify the dispatch includes tempDisplayContent
    const firstMultimodalDispatch = multimodalDispatches[0][0] as any;
    expect(firstMultimodalDispatch.value.metadata.tempDisplayContent).toBeDefined();
    streamSpy.mockRestore();
  });
  it('should call optimisticUpdateMessageContent with isMultimodal metadata for multimodal content', async () => {
    // Mock file store to prevent upload from hanging
    const fileStoreMod = await import('@/store/file/store');
    vi.spyOn(fileStoreMod, 'getFileStoreState').mockReturnValue({
      uploadBase64FileWithProgress: vi
        .fn()
        .mockResolvedValue({ id: 'img-id', url: 'https://cdn.example.com/img.png' }),
    } as any);
    const { result } = renderHook(() => useChatStore());
    const messages = [createMockMessage({ role: 'user' })];
    const updateContentSpy = vi.spyOn(result.current, 'optimisticUpdateMessageContent');
    const streamSpy = vi
      .spyOn(chatService, 'createAssistantMessageStream')
      .mockImplementation(async ({ onMessageHandle, onFinish }) => {
        await onMessageHandle?.({
          type: 'content_part',
          partType: 'text',
          content: 'Generated image: ',
        } as any);
        await onMessageHandle?.({
          type: 'content_part',
          partType: 'image',
          content: 'base64imagedata',
          mimeType: 'image/png',
        } as any);
        await onFinish?.('Generated image: ', {} as any);
      });
    await act(async () => {
      await result.current.internal_fetchAIChatMessage({
        messages,
        messageId: TEST_IDS.ASSISTANT_MESSAGE_ID,
        model: 'gemini-3-pro-image-preview',
        provider: 'google',
        agentConfig: createMockResolvedAgentConfig(),
      });
    });
    // optimisticUpdateMessageContent should be called with metadata containing isMultimodal
    expect(updateContentSpy).toHaveBeenCalledWith(
      TEST_IDS.ASSISTANT_MESSAGE_ID,
      expect.any(String), // serialized JSON content
      expect.objectContaining({
        metadata: expect.objectContaining({
          isMultimodal: true,
        }),
      }),
      expect.any(Object),
    );
    streamSpy.mockRestore();
  });
  it('should NOT dispatch isMultimodal metadata for plain text streaming', async () => {
    const { result } = renderHook(() => useChatStore());
    const messages = [createMockMessage({ role: 'user' })];
    const dispatchSpy = vi.spyOn(result.current, 'internal_dispatchMessage');
    // Plain text stream: no content_part chunks at all.
    const streamSpy = vi
      .spyOn(chatService, 'createAssistantMessageStream')
      .mockImplementation(async ({ onMessageHandle, onFinish }) => {
        await onMessageHandle?.({ type: 'text', text: 'Hello World' } as any);
        await onFinish?.('Hello World', {} as any);
      });
    await act(async () => {
      await result.current.internal_fetchAIChatMessage({
        messages,
        messageId: TEST_IDS.ASSISTANT_MESSAGE_ID,
        model: 'gpt-4o-mini',
        provider: 'openai',
        agentConfig: createMockResolvedAgentConfig(),
      });
    });
    // No dispatch should contain isMultimodal metadata
    const multimodalDispatches = dispatchSpy.mock.calls.filter((call) => {
      const dispatch = call[0];
      return (
        dispatch?.type === 'updateMessage' &&
        'value' in dispatch &&
        dispatch.value?.metadata?.isMultimodal === true
      );
    });
    expect(multimodalDispatches).toHaveLength(0);
    streamSpy.mockRestore();
  });
});
describe('isSubTask filtering', () => {
it('should filter out lobe-gtd tools when isSubTask is true', async () => {
const { result } = renderHook(() => useChatStore());

View File

@@ -420,12 +420,21 @@ export const streamingExecutor: StateCreator<
const handler = new StreamingHandler(
{ messageId, operationId, agentId, groupId, topicId },
{
onContentUpdate: (content, reasoning) => {
onContentUpdate: (content, reasoning, contentMetadata) => {
internal_dispatchMessage(
{
id: messageId,
type: 'updateMessage',
value: { content, reasoning },
value: {
content,
reasoning,
...(contentMetadata && {
metadata: {
isMultimodal: contentMetadata.isMultimodal,
tempDisplayContent: contentMetadata.tempDisplayContent,
},
}),
},
},
{ operationId },
);

View File

@@ -40,7 +40,11 @@ export type GroundingData = GroundingSearch;
*/
export interface StreamingCallbacks {
/** Content update */
onContentUpdate: (content: string, reasoning?: ReasoningState) => void;
onContentUpdate: (
content: string,
reasoning?: ReasoningState,
contentMetadata?: { isMultimodal: boolean; tempDisplayContent: string },
) => void;
/** Search grounding update */
onGroundingUpdate: (grounding: GroundingData) => void;
/** Image list update */
@@ -101,19 +105,19 @@ export interface StreamingResult {
* Stream chunk types
*/
export type StreamChunk =
| { text: string, type: 'text'; }
| { text: string, type: 'reasoning'; }
| { content: string; mimeType?: string, partType: 'text' | 'image'; type: 'reasoning_part'; }
| { content: string; mimeType?: string, partType: 'text' | 'image'; type: 'content_part'; }
| { text: string; type: 'text' }
| { text: string; type: 'reasoning' }
| { content: string; mimeType?: string; partType: 'text' | 'image'; type: 'reasoning_part' }
| { content: string; mimeType?: string; partType: 'text' | 'image'; type: 'content_part' }
| {
isAnimationActives?: boolean[];
tool_calls: MessageToolCall[];
type: 'tool_calls';
}
| { grounding?: GroundingData, type: 'grounding'; }
| { grounding?: GroundingData; type: 'grounding' }
| {
image: { data: string, id: string; };
images: { data: string, id: string; }[];
image: { data: string; id: string };
images: { data: string; id: string }[];
type: 'base64_image';
}
| { type: 'stop' };

View File

@@ -223,7 +223,7 @@ export const messageOptimisticUpdate: StateCreator<
{
id,
type: 'updateMessage',
value: { content },
value: { content, metadata: extra?.metadata },
},
context,
);